Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 23
-rw-r--r-- | mm/Makefile | 1
-rw-r--r-- | mm/backing-dev.c | 4
-rw-r--r-- | mm/cleancache.c | 244
-rw-r--r-- | mm/filemap.c | 83
-rw-r--r-- | mm/filemap_xip.c | 4
-rw-r--r-- | mm/fremap.c | 6
-rw-r--r-- | mm/huge_memory.c | 25
-rw-r--r-- | mm/hugetlb.c | 20
-rw-r--r-- | mm/init-mm.c | 1
-rw-r--r-- | mm/internal.h | 4
-rw-r--r-- | mm/kmemleak.c | 7
-rw-r--r-- | mm/ksm.c | 7
-rw-r--r-- | mm/memcontrol.c | 377
-rw-r--r-- | mm/memory-failure.c | 21
-rw-r--r-- | mm/memory.c | 444
-rw-r--r-- | mm/memory_hotplug.c | 21
-rw-r--r-- | mm/mempolicy.c | 164
-rw-r--r-- | mm/migrate.c | 17
-rw-r--r-- | mm/mlock.c | 8
-rw-r--r-- | mm/mmap.c | 129
-rw-r--r-- | mm/mremap.c | 5
-rw-r--r-- | mm/nobootmem.c | 23
-rw-r--r-- | mm/nommu.c | 108
-rw-r--r-- | mm/oom_kill.c | 36
-rw-r--r-- | mm/page_alloc.c | 128
-rw-r--r-- | mm/page_cgroup.c | 28
-rw-r--r-- | mm/percpu.c | 6
-rw-r--r-- | mm/prio_tree.c | 1
-rw-r--r-- | mm/readahead.c | 2
-rw-r--r-- | mm/rmap.c | 183
-rw-r--r-- | mm/shmem.c | 334
-rw-r--r-- | mm/slab.c | 1
-rw-r--r-- | mm/slub.c | 169
-rw-r--r-- | mm/swap.c | 52
-rw-r--r-- | mm/swapfile.c | 6
-rw-r--r-- | mm/truncate.c | 6
-rw-r--r-- | mm/util.c | 24
-rw-r--r-- | mm/vmalloc.c | 15
-rw-r--r-- | mm/vmscan.c | 185
-rw-r--r-- | mm/vmstat.c | 264
41 files changed, 2057 insertions, 1129 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e9c0c61f2ddd..8ca47a5ee9c8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM | |||
347 | depends on !SMP | 347 | depends on !SMP |
348 | bool | 348 | bool |
349 | default y | 349 | default y |
350 | |||
351 | config CLEANCACHE | ||
352 | bool "Enable cleancache driver to cache clean pages if tmem is present" | ||
353 | default n | ||
354 | help | ||
355 | Cleancache can be thought of as a page-granularity victim cache | ||
356 | for clean pages that the kernel's pageframe replacement algorithm | ||
357 | (PFRA) would like to keep around, but can't since there isn't enough | ||
358 | memory. So when the PFRA "evicts" a page, it first attempts to use | ||
359 | cleancache code to put the data contained in that page into | ||
360 | "transcendent memory", memory that is not directly accessible or | ||
361 | addressable by the kernel and is of unknown and possibly | ||
362 | time-varying size. And when a cleancache-enabled | ||
363 | filesystem wishes to access a page in a file on disk, it first | ||
364 | checks cleancache to see if it already contains it; if it does, | ||
365 | the page is copied into the kernel and a disk access is avoided. | ||
366 | When a transcendent memory driver is available (such as zcache or | ||
367 | Xen transcendent memory), a significant I/O reduction | ||
368 | may be achieved. When none is available, all cleancache calls | ||
369 | are reduced to a single pointer-compare-against-NULL resulting | ||
370 | in a negligible performance hit. | ||
371 | |||
372 | If unsure, say Y to enable cleancache | ||
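The "negligible performance hit" claim above is easiest to see from the shape of the caller-side wrappers. The fragment below is an illustrative sketch only (the real wrappers live in include/linux/cleancache.h, which is not part of this hunk); it assumes the cleancache_enabled flag and __cleancache_get_page() exported by the new mm/cleancache.c shown further down.

/*
 * Sketch, not taken verbatim from this series: with no backend
 * registered, cleancache_enabled stays 0 and every hook collapses
 * to a single flag test before returning.
 */
#include <linux/mm.h>

extern int cleancache_enabled;
extern int __cleancache_get_page(struct page *page);

static inline int cleancache_get_page(struct page *page)
{
        int ret = -1;

        if (cleancache_enabled)   /* one global test when no backend is loaded */
                ret = __cleancache_get_page(page);
        return ret;
}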
diff --git a/mm/Makefile b/mm/Makefile index 42a8326c3e3d..836e4163c1bf 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | |||
49 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 49 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
50 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 50 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
51 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 51 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
52 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index befc87531e4f..f032e6e1e09a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -63,10 +63,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
63 | unsigned long background_thresh; | 63 | unsigned long background_thresh; |
64 | unsigned long dirty_thresh; | 64 | unsigned long dirty_thresh; |
65 | unsigned long bdi_thresh; | 65 | unsigned long bdi_thresh; |
66 | unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; | 66 | unsigned long nr_dirty, nr_io, nr_more_io; |
67 | struct inode *inode; | 67 | struct inode *inode; |
68 | 68 | ||
69 | nr_wb = nr_dirty = nr_io = nr_more_io = 0; | 69 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 70 | spin_lock(&inode_wb_list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 72 | nr_dirty++; |
diff --git a/mm/cleancache.c b/mm/cleancache.c new file mode 100644 index 000000000000..bcaae4c2a770 --- /dev/null +++ b/mm/cleancache.c | |||
@@ -0,0 +1,244 @@ | |||
1 | /* | ||
2 | * Cleancache frontend | ||
3 | * | ||
4 | * This code provides the generic "frontend" layer to call a matching | ||
5 | * "backend" driver implementation of cleancache. See | ||
6 | * Documentation/vm/cleancache.txt for more information. | ||
7 | * | ||
8 | * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. | ||
9 | * Author: Dan Magenheimer | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
12 | */ | ||
13 | |||
14 | #include <linux/module.h> | ||
15 | #include <linux/fs.h> | ||
16 | #include <linux/exportfs.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/cleancache.h> | ||
19 | |||
20 | /* | ||
21 | * This global enablement flag may be read thousands of times per second | ||
22 | * by cleancache_get/put/flush even on systems where cleancache_ops | ||
23 | * is not claimed (e.g. cleancache is config'ed on but remains | ||
24 | * disabled), so is preferred to the slower alternative: a function | ||
25 | * call that checks a non-global. | ||
26 | */ | ||
27 | int cleancache_enabled; | ||
28 | EXPORT_SYMBOL(cleancache_enabled); | ||
29 | |||
30 | /* | ||
31 | * cleancache_ops is set by cleancache_ops_register to contain the pointers | ||
32 | * to the cleancache "backend" implementation functions. | ||
33 | */ | ||
34 | static struct cleancache_ops cleancache_ops; | ||
35 | |||
36 | /* useful stats available in /sys/kernel/mm/cleancache */ | ||
37 | static unsigned long cleancache_succ_gets; | ||
38 | static unsigned long cleancache_failed_gets; | ||
39 | static unsigned long cleancache_puts; | ||
40 | static unsigned long cleancache_flushes; | ||
41 | |||
42 | /* | ||
43 | * register operations for cleancache, returning previous thus allowing | ||
44 | * detection of multiple backends and possible nesting | ||
45 | */ | ||
46 | struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) | ||
47 | { | ||
48 | struct cleancache_ops old = cleancache_ops; | ||
49 | |||
50 | cleancache_ops = *ops; | ||
51 | cleancache_enabled = 1; | ||
52 | return old; | ||
53 | } | ||
54 | EXPORT_SYMBOL(cleancache_register_ops); | ||
55 | |||
56 | /* Called by a cleancache-enabled filesystem at time of mount */ | ||
57 | void __cleancache_init_fs(struct super_block *sb) | ||
58 | { | ||
59 | sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); | ||
60 | } | ||
61 | EXPORT_SYMBOL(__cleancache_init_fs); | ||
62 | |||
63 | /* Called by a cleancache-enabled clustered filesystem at time of mount */ | ||
64 | void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) | ||
65 | { | ||
66 | sb->cleancache_poolid = | ||
67 | (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); | ||
68 | } | ||
69 | EXPORT_SYMBOL(__cleancache_init_shared_fs); | ||
70 | |||
71 | /* | ||
72 | * If the filesystem uses exportable filehandles, use the filehandle as | ||
73 | * the key, else use the inode number. | ||
74 | */ | ||
75 | static int cleancache_get_key(struct inode *inode, | ||
76 | struct cleancache_filekey *key) | ||
77 | { | ||
78 | int (*fhfn)(struct dentry *, __u32 *fh, int *, int); | ||
79 | int len = 0, maxlen = CLEANCACHE_KEY_MAX; | ||
80 | struct super_block *sb = inode->i_sb; | ||
81 | |||
82 | key->u.ino = inode->i_ino; | ||
83 | if (sb->s_export_op != NULL) { | ||
84 | fhfn = sb->s_export_op->encode_fh; | ||
85 | if (fhfn) { | ||
86 | struct dentry d; | ||
87 | d.d_inode = inode; | ||
88 | len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); | ||
89 | if (len <= 0 || len == 255) | ||
90 | return -1; | ||
91 | if (maxlen > CLEANCACHE_KEY_MAX) | ||
92 | return -1; | ||
93 | } | ||
94 | } | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | /* | ||
99 | * "Get" data from cleancache associated with the poolid/inode/index | ||
100 | * that were specified when the data was put to cleancache and, if | ||
101 | * successful, use it to fill the specified page with data and return 0. | ||
102 | * The pageframe is unchanged and returns -1 if the get fails. | ||
103 | * Page must be locked by caller. | ||
104 | */ | ||
105 | int __cleancache_get_page(struct page *page) | ||
106 | { | ||
107 | int ret = -1; | ||
108 | int pool_id; | ||
109 | struct cleancache_filekey key = { .u.key = { 0 } }; | ||
110 | |||
111 | VM_BUG_ON(!PageLocked(page)); | ||
112 | pool_id = page->mapping->host->i_sb->cleancache_poolid; | ||
113 | if (pool_id < 0) | ||
114 | goto out; | ||
115 | |||
116 | if (cleancache_get_key(page->mapping->host, &key) < 0) | ||
117 | goto out; | ||
118 | |||
119 | ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); | ||
120 | if (ret == 0) | ||
121 | cleancache_succ_gets++; | ||
122 | else | ||
123 | cleancache_failed_gets++; | ||
124 | out: | ||
125 | return ret; | ||
126 | } | ||
127 | EXPORT_SYMBOL(__cleancache_get_page); | ||
128 | |||
129 | /* | ||
130 | * "Put" data from a page to cleancache and associate it with the | ||
131 | * (previously-obtained per-filesystem) poolid and the page's | ||
132 | * inode and page index. Page must be locked. Note that a put_page | ||
133 | * always "succeeds", though a subsequent get_page may succeed or fail. | ||
134 | */ | ||
135 | void __cleancache_put_page(struct page *page) | ||
136 | { | ||
137 | int pool_id; | ||
138 | struct cleancache_filekey key = { .u.key = { 0 } }; | ||
139 | |||
140 | VM_BUG_ON(!PageLocked(page)); | ||
141 | pool_id = page->mapping->host->i_sb->cleancache_poolid; | ||
142 | if (pool_id >= 0 && | ||
143 | cleancache_get_key(page->mapping->host, &key) >= 0) { | ||
144 | (*cleancache_ops.put_page)(pool_id, key, page->index, page); | ||
145 | cleancache_puts++; | ||
146 | } | ||
147 | } | ||
148 | EXPORT_SYMBOL(__cleancache_put_page); | ||
149 | |||
150 | /* | ||
151 | * Flush any data from cleancache associated with the poolid and the | ||
152 | * page's inode and page index so that a subsequent "get" will fail. | ||
153 | */ | ||
154 | void __cleancache_flush_page(struct address_space *mapping, struct page *page) | ||
155 | { | ||
156 | /* careful... page->mapping is NULL sometimes when this is called */ | ||
157 | int pool_id = mapping->host->i_sb->cleancache_poolid; | ||
158 | struct cleancache_filekey key = { .u.key = { 0 } }; | ||
159 | |||
160 | if (pool_id >= 0) { | ||
161 | VM_BUG_ON(!PageLocked(page)); | ||
162 | if (cleancache_get_key(mapping->host, &key) >= 0) { | ||
163 | (*cleancache_ops.flush_page)(pool_id, key, page->index); | ||
164 | cleancache_flushes++; | ||
165 | } | ||
166 | } | ||
167 | } | ||
168 | EXPORT_SYMBOL(__cleancache_flush_page); | ||
169 | |||
170 | /* | ||
171 | * Flush all data from cleancache associated with the poolid and the | ||
172 | * mapping's inode so that all subsequent gets to this poolid/inode | ||
173 | * will fail. | ||
174 | */ | ||
175 | void __cleancache_flush_inode(struct address_space *mapping) | ||
176 | { | ||
177 | int pool_id = mapping->host->i_sb->cleancache_poolid; | ||
178 | struct cleancache_filekey key = { .u.key = { 0 } }; | ||
179 | |||
180 | if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) | ||
181 | (*cleancache_ops.flush_inode)(pool_id, key); | ||
182 | } | ||
183 | EXPORT_SYMBOL(__cleancache_flush_inode); | ||
184 | |||
185 | /* | ||
186 | * Called by any cleancache-enabled filesystem at time of unmount; | ||
187 | * note that pool_id is surrendered and may be returned by a subsequent | ||
188 | * cleancache_init_fs or cleancache_init_shared_fs | ||
189 | */ | ||
190 | void __cleancache_flush_fs(struct super_block *sb) | ||
191 | { | ||
192 | if (sb->cleancache_poolid >= 0) { | ||
193 | int old_poolid = sb->cleancache_poolid; | ||
194 | sb->cleancache_poolid = -1; | ||
195 | (*cleancache_ops.flush_fs)(old_poolid); | ||
196 | } | ||
197 | } | ||
198 | EXPORT_SYMBOL(__cleancache_flush_fs); | ||
199 | |||
200 | #ifdef CONFIG_SYSFS | ||
201 | |||
202 | /* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ | ||
203 | |||
204 | #define CLEANCACHE_SYSFS_RO(_name) \ | ||
205 | static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ | ||
206 | struct kobj_attribute *attr, char *buf) \ | ||
207 | { \ | ||
208 | return sprintf(buf, "%lu\n", cleancache_##_name); \ | ||
209 | } \ | ||
210 | static struct kobj_attribute cleancache_##_name##_attr = { \ | ||
211 | .attr = { .name = __stringify(_name), .mode = 0444 }, \ | ||
212 | .show = cleancache_##_name##_show, \ | ||
213 | } | ||
214 | |||
215 | CLEANCACHE_SYSFS_RO(succ_gets); | ||
216 | CLEANCACHE_SYSFS_RO(failed_gets); | ||
217 | CLEANCACHE_SYSFS_RO(puts); | ||
218 | CLEANCACHE_SYSFS_RO(flushes); | ||
219 | |||
220 | static struct attribute *cleancache_attrs[] = { | ||
221 | &cleancache_succ_gets_attr.attr, | ||
222 | &cleancache_failed_gets_attr.attr, | ||
223 | &cleancache_puts_attr.attr, | ||
224 | &cleancache_flushes_attr.attr, | ||
225 | NULL, | ||
226 | }; | ||
227 | |||
228 | static struct attribute_group cleancache_attr_group = { | ||
229 | .attrs = cleancache_attrs, | ||
230 | .name = "cleancache", | ||
231 | }; | ||
232 | |||
233 | #endif /* CONFIG_SYSFS */ | ||
234 | |||
235 | static int __init init_cleancache(void) | ||
236 | { | ||
237 | #ifdef CONFIG_SYSFS | ||
238 | int err; | ||
239 | |||
240 | err = sysfs_create_group(mm_kobj, &cleancache_attr_group); | ||
241 | #endif /* CONFIG_SYSFS */ | ||
242 | return 0; | ||
243 | } | ||
244 | module_init(init_cleancache) | ||
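For orientation, a transcendent-memory backend claims the hooks above by filling a struct cleancache_ops and passing it to cleancache_register_ops(). The skeleton below is a hypothetical, do-nothing backend pieced together from the frontend calls shown in this file; the exact struct cleancache_ops layout and argument types live in include/linux/cleancache.h and are assumed here rather than quoted from this series.

/* Hypothetical no-op backend, for illustration only. */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/cleancache.h>

static int noop_init_fs(size_t pagesize)
{
        return -1;              /* refuse to create a pool */
}

static int noop_init_shared_fs(char *uuid, size_t pagesize)
{
        return -1;
}

static int noop_get_page(int pool_id, struct cleancache_filekey key,
                         pgoff_t index, struct page *page)
{
        return -1;              /* nothing is cached, every get misses */
}

static void noop_put_page(int pool_id, struct cleancache_filekey key,
                          pgoff_t index, struct page *page) { }

static void noop_flush_page(int pool_id, struct cleancache_filekey key,
                            pgoff_t index) { }

static void noop_flush_inode(int pool_id, struct cleancache_filekey key) { }

static void noop_flush_fs(int pool_id) { }

static struct cleancache_ops noop_cleancache_ops = {
        .init_fs        = noop_init_fs,
        .init_shared_fs = noop_init_shared_fs,
        .get_page       = noop_get_page,
        .put_page       = noop_put_page,
        .flush_page     = noop_flush_page,
        .flush_inode    = noop_flush_inode,
        .flush_fs       = noop_flush_fs,
};

static int __init noop_backend_init(void)
{
        /* The previous ops are returned so a chained backend can be detected. */
        struct cleancache_ops old_ops =
                cleancache_register_ops(&noop_cleancache_ops);

        (void)old_ops;
        return 0;
}
module_init(noop_backend_init);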
diff --git a/mm/filemap.c b/mm/filemap.c index c641edf553a9..bcdc393b6580 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 35 | #include <linux/memcontrol.h> |
36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ | 36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ |
37 | #include <linux/cleancache.h> | ||
37 | #include "internal.h" | 38 | #include "internal.h" |
38 | 39 | ||
39 | /* | 40 | /* |
@@ -58,16 +59,16 @@ | |||
58 | /* | 59 | /* |
59 | * Lock ordering: | 60 | * Lock ordering: |
60 | * | 61 | * |
61 | * ->i_mmap_lock (truncate_pagecache) | 62 | * ->i_mmap_mutex (truncate_pagecache) |
62 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
63 | * ->swap_lock (exclusive_swap_page, others) | 64 | * ->swap_lock (exclusive_swap_page, others) |
64 | * ->mapping->tree_lock | 65 | * ->mapping->tree_lock |
65 | * | 66 | * |
66 | * ->i_mutex | 67 | * ->i_mutex |
67 | * ->i_mmap_lock (truncate->unmap_mapping_range) | 68 | * ->i_mmap_mutex (truncate->unmap_mapping_range) |
68 | * | 69 | * |
69 | * ->mmap_sem | 70 | * ->mmap_sem |
70 | * ->i_mmap_lock | 71 | * ->i_mmap_mutex |
71 | * ->page_table_lock or pte_lock (various, mainly in memory.c) | 72 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
72 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) | 73 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) |
73 | * | 74 | * |
@@ -84,7 +85,7 @@ | |||
84 | * sb_lock (fs/fs-writeback.c) | 85 | * sb_lock (fs/fs-writeback.c) |
85 | * ->mapping->tree_lock (__sync_single_inode) | 86 | * ->mapping->tree_lock (__sync_single_inode) |
86 | * | 87 | * |
87 | * ->i_mmap_lock | 88 | * ->i_mmap_mutex |
88 | * ->anon_vma.lock (vma_adjust) | 89 | * ->anon_vma.lock (vma_adjust) |
89 | * | 90 | * |
90 | * ->anon_vma.lock | 91 | * ->anon_vma.lock |
@@ -106,7 +107,7 @@ | |||
106 | * | 107 | * |
107 | * (code doesn't rely on that order, so you could switch it around) | 108 | * (code doesn't rely on that order, so you could switch it around) |
108 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 109 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
109 | * ->i_mmap_lock | 110 | * ->i_mmap_mutex |
110 | */ | 111 | */ |
111 | 112 | ||
112 | /* | 113 | /* |
@@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page) | |||
118 | { | 119 | { |
119 | struct address_space *mapping = page->mapping; | 120 | struct address_space *mapping = page->mapping; |
120 | 121 | ||
122 | /* | ||
123 | * if we're uptodate, flush out into the cleancache, otherwise | ||
124 | * invalidate any existing cleancache entries. We can't leave | ||
125 | * stale data around in the cleancache once our page is gone | ||
126 | */ | ||
127 | if (PageUptodate(page) && PageMappedToDisk(page)) | ||
128 | cleancache_put_page(page); | ||
129 | else | ||
130 | cleancache_flush_page(mapping, page); | ||
131 | |||
121 | radix_tree_delete(&mapping->page_tree, page->index); | 132 | radix_tree_delete(&mapping->page_tree, page->index); |
122 | page->mapping = NULL; | 133 | page->mapping = NULL; |
123 | mapping->nrpages--; | 134 | mapping->nrpages--; |
@@ -562,6 +573,17 @@ void wait_on_page_bit(struct page *page, int bit_nr) | |||
562 | } | 573 | } |
563 | EXPORT_SYMBOL(wait_on_page_bit); | 574 | EXPORT_SYMBOL(wait_on_page_bit); |
564 | 575 | ||
576 | int wait_on_page_bit_killable(struct page *page, int bit_nr) | ||
577 | { | ||
578 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); | ||
579 | |||
580 | if (!test_bit(bit_nr, &page->flags)) | ||
581 | return 0; | ||
582 | |||
583 | return __wait_on_bit(page_waitqueue(page), &wait, | ||
584 | sleep_on_page_killable, TASK_KILLABLE); | ||
585 | } | ||
586 | |||
565 | /** | 587 | /** |
566 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue | 588 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue |
567 | * @page: Page defining the wait queue of interest | 589 | * @page: Page defining the wait queue of interest |
@@ -643,15 +665,32 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); | |||
643 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | 665 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
644 | unsigned int flags) | 666 | unsigned int flags) |
645 | { | 667 | { |
646 | if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { | 668 | if (flags & FAULT_FLAG_ALLOW_RETRY) { |
647 | __lock_page(page); | 669 | /* |
648 | return 1; | 670 | * CAUTION! In this case, mmap_sem is not released |
649 | } else { | 671 | * even though return 0. |
650 | if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) { | 672 | */ |
651 | up_read(&mm->mmap_sem); | 673 | if (flags & FAULT_FLAG_RETRY_NOWAIT) |
674 | return 0; | ||
675 | |||
676 | up_read(&mm->mmap_sem); | ||
677 | if (flags & FAULT_FLAG_KILLABLE) | ||
678 | wait_on_page_locked_killable(page); | ||
679 | else | ||
652 | wait_on_page_locked(page); | 680 | wait_on_page_locked(page); |
653 | } | ||
654 | return 0; | 681 | return 0; |
682 | } else { | ||
683 | if (flags & FAULT_FLAG_KILLABLE) { | ||
684 | int ret; | ||
685 | |||
686 | ret = __lock_page_killable(page); | ||
687 | if (ret) { | ||
688 | up_read(&mm->mmap_sem); | ||
689 | return 0; | ||
690 | } | ||
691 | } else | ||
692 | __lock_page(page); | ||
693 | return 1; | ||
655 | } | 694 | } |
656 | } | 695 | } |
657 | 696 | ||
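The reworked return convention is easiest to read from the caller's side. The fragment below is an illustrative sketch, not part of this hunk, of how a fault handler such as filemap_fault() is expected to react, assuming the usual lock_page_or_retry() wrapper around __lock_page_or_retry():

/* Caller-side sketch, for illustration only. */
if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
        /*
         * 0 means the lock was not taken.  mmap_sem has already been
         * dropped unless FAULT_FLAG_RETRY_NOWAIT was set (see the
         * CAUTION comment above), so report VM_FAULT_RETRY.
         */
        page_cache_release(page);
        return ret | VM_FAULT_RETRY;
}
/* 1 means the page is now locked and mmap_sem is still held. */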
@@ -1528,15 +1567,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, | |||
1528 | /* If we don't want any read-ahead, don't bother */ | 1567 | /* If we don't want any read-ahead, don't bother */ |
1529 | if (VM_RandomReadHint(vma)) | 1568 | if (VM_RandomReadHint(vma)) |
1530 | return; | 1569 | return; |
1570 | if (!ra->ra_pages) | ||
1571 | return; | ||
1531 | 1572 | ||
1532 | if (VM_SequentialReadHint(vma) || | 1573 | if (VM_SequentialReadHint(vma)) { |
1533 | offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { | ||
1534 | page_cache_sync_readahead(mapping, ra, file, offset, | 1574 | page_cache_sync_readahead(mapping, ra, file, offset, |
1535 | ra->ra_pages); | 1575 | ra->ra_pages); |
1536 | return; | 1576 | return; |
1537 | } | 1577 | } |
1538 | 1578 | ||
1539 | if (ra->mmap_miss < INT_MAX) | 1579 | /* Avoid banging the cache line if not needed */ |
1580 | if (ra->mmap_miss < MMAP_LOTSAMISS * 10) | ||
1540 | ra->mmap_miss++; | 1581 | ra->mmap_miss++; |
1541 | 1582 | ||
1542 | /* | 1583 | /* |
@@ -1550,12 +1591,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, | |||
1550 | * mmap read-around | 1591 | * mmap read-around |
1551 | */ | 1592 | */ |
1552 | ra_pages = max_sane_readahead(ra->ra_pages); | 1593 | ra_pages = max_sane_readahead(ra->ra_pages); |
1553 | if (ra_pages) { | 1594 | ra->start = max_t(long, 0, offset - ra_pages / 2); |
1554 | ra->start = max_t(long, 0, offset - ra_pages/2); | 1595 | ra->size = ra_pages; |
1555 | ra->size = ra_pages; | 1596 | ra->async_size = ra_pages / 4; |
1556 | ra->async_size = 0; | 1597 | ra_submit(ra, mapping, file); |
1557 | ra_submit(ra, mapping, file); | ||
1558 | } | ||
1559 | } | 1598 | } |
1560 | 1599 | ||
1561 | /* | 1600 | /* |
@@ -1622,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1622 | /* No page in the page cache at all */ | 1661 | /* No page in the page cache at all */ |
1623 | do_sync_mmap_readahead(vma, ra, file, offset); | 1662 | do_sync_mmap_readahead(vma, ra, file, offset); |
1624 | count_vm_event(PGMAJFAULT); | 1663 | count_vm_event(PGMAJFAULT); |
1664 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | ||
1625 | ret = VM_FAULT_MAJOR; | 1665 | ret = VM_FAULT_MAJOR; |
1626 | retry_find: | 1666 | retry_find: |
1627 | page = find_get_page(mapping, offset); | 1667 | page = find_get_page(mapping, offset); |
@@ -1660,7 +1700,6 @@ retry_find: | |||
1660 | return VM_FAULT_SIGBUS; | 1700 | return VM_FAULT_SIGBUS; |
1661 | } | 1701 | } |
1662 | 1702 | ||
1663 | ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; | ||
1664 | vmf->page = page; | 1703 | vmf->page = page; |
1665 | return ret | VM_FAULT_LOCKED; | 1704 | return ret | VM_FAULT_LOCKED; |
1666 | 1705 | ||
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 83364df74a33..93356cd12828 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping, | |||
183 | return; | 183 | return; |
184 | 184 | ||
185 | retry: | 185 | retry: |
186 | spin_lock(&mapping->i_mmap_lock); | 186 | mutex_lock(&mapping->i_mmap_mutex); |
187 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 187 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
188 | mm = vma->vm_mm; | 188 | mm = vma->vm_mm; |
189 | address = vma->vm_start + | 189 | address = vma->vm_start + |
@@ -201,7 +201,7 @@ retry: | |||
201 | page_cache_release(page); | 201 | page_cache_release(page); |
202 | } | 202 | } |
203 | } | 203 | } |
204 | spin_unlock(&mapping->i_mmap_lock); | 204 | mutex_unlock(&mapping->i_mmap_mutex); |
205 | 205 | ||
206 | if (locked) { | 206 | if (locked) { |
207 | mutex_unlock(&xip_sparse_mutex); | 207 | mutex_unlock(&xip_sparse_mutex); |
diff --git a/mm/fremap.c b/mm/fremap.c index ec520c7b28df..b8e0e2d468af 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
211 | } | 211 | } |
212 | goto out; | 212 | goto out; |
213 | } | 213 | } |
214 | spin_lock(&mapping->i_mmap_lock); | 214 | mutex_lock(&mapping->i_mmap_mutex); |
215 | flush_dcache_mmap_lock(mapping); | 215 | flush_dcache_mmap_lock(mapping); |
216 | vma->vm_flags |= VM_NONLINEAR; | 216 | vma->vm_flags |= VM_NONLINEAR; |
217 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 217 | vma_prio_tree_remove(vma, &mapping->i_mmap); |
218 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 218 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
219 | flush_dcache_mmap_unlock(mapping); | 219 | flush_dcache_mmap_unlock(mapping); |
220 | spin_unlock(&mapping->i_mmap_lock); | 220 | mutex_unlock(&mapping->i_mmap_mutex); |
221 | } | 221 | } |
222 | 222 | ||
223 | if (vma->vm_flags & VM_LOCKED) { | 223 | if (vma->vm_flags & VM_LOCKED) { |
224 | /* | 224 | /* |
225 | * drop PG_Mlocked flag for over-mapped range | 225 | * drop PG_Mlocked flag for over-mapped range |
226 | */ | 226 | */ |
227 | unsigned int saved_flags = vma->vm_flags; | 227 | vm_flags_t saved_flags = vma->vm_flags; |
228 | munlock_vma_pages_range(vma, start, start + size); | 228 | munlock_vma_pages_range(vma, start, start + size); |
229 | vma->vm_flags = saved_flags; | 229 | vma->vm_flags = saved_flags; |
230 | } | 230 | } |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 83326ad66d9b..615d9743a3cb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1139,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1139 | * We can't temporarily set the pmd to null in order | 1139 | * We can't temporarily set the pmd to null in order |
1140 | * to split it, the pmd must remain marked huge at all | 1140 | * to split it, the pmd must remain marked huge at all |
1141 | * times or the VM won't take the pmd_trans_huge paths | 1141 | * times or the VM won't take the pmd_trans_huge paths |
1142 | * and it won't wait on the anon_vma->root->lock to | 1142 | * and it won't wait on the anon_vma->root->mutex to |
1143 | * serialize against split_huge_page*. | 1143 | * serialize against split_huge_page*. |
1144 | */ | 1144 | */ |
1145 | pmdp_splitting_flush_notify(vma, address, pmd); | 1145 | pmdp_splitting_flush_notify(vma, address, pmd); |
@@ -1333,7 +1333,7 @@ static int __split_huge_page_map(struct page *page, | |||
1333 | return ret; | 1333 | return ret; |
1334 | } | 1334 | } |
1335 | 1335 | ||
1336 | /* must be called with anon_vma->root->lock hold */ | 1336 | /* must be called with anon_vma->root->mutex hold */ |
1337 | static void __split_huge_page(struct page *page, | 1337 | static void __split_huge_page(struct page *page, |
1338 | struct anon_vma *anon_vma) | 1338 | struct anon_vma *anon_vma) |
1339 | { | 1339 | { |
@@ -1771,12 +1771,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1771 | 1771 | ||
1772 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 1772 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
1773 | #ifndef CONFIG_NUMA | 1773 | #ifndef CONFIG_NUMA |
1774 | up_read(&mm->mmap_sem); | ||
1774 | VM_BUG_ON(!*hpage); | 1775 | VM_BUG_ON(!*hpage); |
1775 | new_page = *hpage; | 1776 | new_page = *hpage; |
1776 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | ||
1777 | up_read(&mm->mmap_sem); | ||
1778 | return; | ||
1779 | } | ||
1780 | #else | 1777 | #else |
1781 | VM_BUG_ON(*hpage); | 1778 | VM_BUG_ON(*hpage); |
1782 | /* | 1779 | /* |
@@ -1791,22 +1788,26 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1791 | */ | 1788 | */ |
1792 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 1789 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, |
1793 | node, __GFP_OTHER_NODE); | 1790 | node, __GFP_OTHER_NODE); |
1791 | |||
1792 | /* | ||
1793 | * After allocating the hugepage, release the mmap_sem read lock in | ||
1794 | * preparation for taking it in write mode. | ||
1795 | */ | ||
1796 | up_read(&mm->mmap_sem); | ||
1794 | if (unlikely(!new_page)) { | 1797 | if (unlikely(!new_page)) { |
1795 | up_read(&mm->mmap_sem); | ||
1796 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 1798 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
1797 | *hpage = ERR_PTR(-ENOMEM); | 1799 | *hpage = ERR_PTR(-ENOMEM); |
1798 | return; | 1800 | return; |
1799 | } | 1801 | } |
1802 | #endif | ||
1803 | |||
1800 | count_vm_event(THP_COLLAPSE_ALLOC); | 1804 | count_vm_event(THP_COLLAPSE_ALLOC); |
1801 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1805 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
1802 | up_read(&mm->mmap_sem); | 1806 | #ifdef CONFIG_NUMA |
1803 | put_page(new_page); | 1807 | put_page(new_page); |
1808 | #endif | ||
1804 | return; | 1809 | return; |
1805 | } | 1810 | } |
1806 | #endif | ||
1807 | |||
1808 | /* after allocating the hugepage upgrade to mmap_sem write mode */ | ||
1809 | up_read(&mm->mmap_sem); | ||
1810 | 1811 | ||
1811 | /* | 1812 | /* |
1812 | * Prevent all access to pagetables with the exception of | 1813 | * Prevent all access to pagetables with the exception of |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8ee3bd8ec5b5..f33bb319b73f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -475,7 +475,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
475 | 475 | ||
476 | /* If reserves cannot be used, ensure enough pages are in the pool */ | 476 | /* If reserves cannot be used, ensure enough pages are in the pool */ |
477 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) | 477 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) |
478 | goto err;; | 478 | goto err; |
479 | 479 | ||
480 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 480 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
481 | MAX_NR_ZONES - 1, nodemask) { | 481 | MAX_NR_ZONES - 1, nodemask) { |
@@ -2205,7 +2205,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2205 | unsigned long sz = huge_page_size(h); | 2205 | unsigned long sz = huge_page_size(h); |
2206 | 2206 | ||
2207 | /* | 2207 | /* |
2208 | * A page gathering list, protected by per file i_mmap_lock. The | 2208 | * A page gathering list, protected by per file i_mmap_mutex. The |
2209 | * lock is used to avoid list corruption from multiple unmapping | 2209 | * lock is used to avoid list corruption from multiple unmapping |
2210 | * of the same page since we are using page->lru. | 2210 | * of the same page since we are using page->lru. |
2211 | */ | 2211 | */ |
@@ -2274,9 +2274,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2274 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2274 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
2275 | unsigned long end, struct page *ref_page) | 2275 | unsigned long end, struct page *ref_page) |
2276 | { | 2276 | { |
2277 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 2277 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
2278 | __unmap_hugepage_range(vma, start, end, ref_page); | 2278 | __unmap_hugepage_range(vma, start, end, ref_page); |
2279 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | 2279 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
2280 | } | 2280 | } |
2281 | 2281 | ||
2282 | /* | 2282 | /* |
@@ -2308,7 +2308,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2308 | * this mapping should be shared between all the VMAs, | 2308 | * this mapping should be shared between all the VMAs, |
2309 | * __unmap_hugepage_range() is called as the lock is already held | 2309 | * __unmap_hugepage_range() is called as the lock is already held |
2310 | */ | 2310 | */ |
2311 | spin_lock(&mapping->i_mmap_lock); | 2311 | mutex_lock(&mapping->i_mmap_mutex); |
2312 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2312 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
2313 | /* Do not unmap the current VMA */ | 2313 | /* Do not unmap the current VMA */ |
2314 | if (iter_vma == vma) | 2314 | if (iter_vma == vma) |
@@ -2326,7 +2326,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2326 | address, address + huge_page_size(h), | 2326 | address, address + huge_page_size(h), |
2327 | page); | 2327 | page); |
2328 | } | 2328 | } |
2329 | spin_unlock(&mapping->i_mmap_lock); | 2329 | mutex_unlock(&mapping->i_mmap_mutex); |
2330 | 2330 | ||
2331 | return 1; | 2331 | return 1; |
2332 | } | 2332 | } |
@@ -2810,7 +2810,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2810 | BUG_ON(address >= end); | 2810 | BUG_ON(address >= end); |
2811 | flush_cache_range(vma, address, end); | 2811 | flush_cache_range(vma, address, end); |
2812 | 2812 | ||
2813 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 2813 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
2814 | spin_lock(&mm->page_table_lock); | 2814 | spin_lock(&mm->page_table_lock); |
2815 | for (; address < end; address += huge_page_size(h)) { | 2815 | for (; address < end; address += huge_page_size(h)) { |
2816 | ptep = huge_pte_offset(mm, address); | 2816 | ptep = huge_pte_offset(mm, address); |
@@ -2825,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2825 | } | 2825 | } |
2826 | } | 2826 | } |
2827 | spin_unlock(&mm->page_table_lock); | 2827 | spin_unlock(&mm->page_table_lock); |
2828 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | 2828 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
2829 | 2829 | ||
2830 | flush_tlb_range(vma, start, end); | 2830 | flush_tlb_range(vma, start, end); |
2831 | } | 2831 | } |
@@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2833 | int hugetlb_reserve_pages(struct inode *inode, | 2833 | int hugetlb_reserve_pages(struct inode *inode, |
2834 | long from, long to, | 2834 | long from, long to, |
2835 | struct vm_area_struct *vma, | 2835 | struct vm_area_struct *vma, |
2836 | int acctflag) | 2836 | vm_flags_t vm_flags) |
2837 | { | 2837 | { |
2838 | long ret, chg; | 2838 | long ret, chg; |
2839 | struct hstate *h = hstate_inode(inode); | 2839 | struct hstate *h = hstate_inode(inode); |
@@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2843 | * attempt will be made for VM_NORESERVE to allocate a page | 2843 | * attempt will be made for VM_NORESERVE to allocate a page |
2844 | * and filesystem quota without using reserves | 2844 | * and filesystem quota without using reserves |
2845 | */ | 2845 | */ |
2846 | if (acctflag & VM_NORESERVE) | 2846 | if (vm_flags & VM_NORESERVE) |
2847 | return 0; | 2847 | return 0; |
2848 | 2848 | ||
2849 | /* | 2849 | /* |
diff --git a/mm/init-mm.c b/mm/init-mm.c index 1d29cdfe8ebb..4019979b2637 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c | |||
@@ -21,6 +21,5 @@ struct mm_struct init_mm = { | |||
21 | .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), | 21 | .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), |
22 | .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), | 22 | .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), |
23 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), | 23 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), |
24 | .cpu_vm_mask = CPU_MASK_ALL, | ||
25 | INIT_MM_CONTEXT(init_mm) | 24 | INIT_MM_CONTEXT(init_mm) |
26 | }; | 25 | }; |
diff --git a/mm/internal.h b/mm/internal.h index 9d0ced8e505e..d071d380fb49 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -66,6 +66,10 @@ static inline unsigned long page_order(struct page *page) | |||
66 | return page_private(page); | 66 | return page_private(page); |
67 | } | 67 | } |
68 | 68 | ||
69 | /* mm/util.c */ | ||
70 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | ||
71 | struct vm_area_struct *prev, struct rb_node *rb_parent); | ||
72 | |||
69 | #ifdef CONFIG_MMU | 73 | #ifdef CONFIG_MMU |
70 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 74 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, |
71 | unsigned long start, unsigned long end); | 75 | unsigned long start, unsigned long end); |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c1d5867543e4..aacee45616fc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1414,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
1414 | ++(*pos); | 1414 | ++(*pos); |
1415 | 1415 | ||
1416 | list_for_each_continue_rcu(n, &object_list) { | 1416 | list_for_each_continue_rcu(n, &object_list) { |
1417 | next_obj = list_entry(n, struct kmemleak_object, object_list); | 1417 | struct kmemleak_object *obj = |
1418 | if (get_object(next_obj)) | 1418 | list_entry(n, struct kmemleak_object, object_list); |
1419 | if (get_object(obj)) { | ||
1420 | next_obj = obj; | ||
1419 | break; | 1421 | break; |
1422 | } | ||
1420 | } | 1423 | } |
1421 | 1424 | ||
1422 | put_object(prev_obj); | 1425 | put_object(prev_obj); |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
36 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
37 | #include <linux/freezer.h> | 37 | #include <linux/freezer.h> |
38 | #include <linux/oom.h> | ||
38 | 39 | ||
39 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
40 | #include "internal.h" | 41 | #include "internal.h" |
@@ -1894,9 +1895,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1894 | if (ksm_run != flags) { | 1895 | if (ksm_run != flags) { |
1895 | ksm_run = flags; | 1896 | ksm_run = flags; |
1896 | if (flags & KSM_RUN_UNMERGE) { | 1897 | if (flags & KSM_RUN_UNMERGE) { |
1897 | current->flags |= PF_OOM_ORIGIN; | 1898 | int oom_score_adj; |
1899 | |||
1900 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | ||
1898 | err = unmerge_and_remove_all_rmap_items(); | 1901 | err = unmerge_and_remove_all_rmap_items(); |
1899 | current->flags &= ~PF_OOM_ORIGIN; | 1902 | test_set_oom_score_adj(oom_score_adj); |
1900 | if (err) { | 1903 | if (err) { |
1901 | ksm_run = KSM_RUN_STOP; | 1904 | ksm_run = KSM_RUN_STOP; |
1902 | count = err; | 1905 | count = err; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 010f9166fa6e..bd9052a5d3ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -94,6 +94,8 @@ enum mem_cgroup_events_index { | |||
94 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | 94 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ |
95 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | 95 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ |
96 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ | 96 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ |
97 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | ||
98 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | ||
97 | MEM_CGROUP_EVENTS_NSTATS, | 99 | MEM_CGROUP_EVENTS_NSTATS, |
98 | }; | 100 | }; |
99 | /* | 101 | /* |
@@ -231,6 +233,11 @@ struct mem_cgroup { | |||
231 | * reclaimed from. | 233 | * reclaimed from. |
232 | */ | 234 | */ |
233 | int last_scanned_child; | 235 | int last_scanned_child; |
236 | int last_scanned_node; | ||
237 | #if MAX_NUMNODES > 1 | ||
238 | nodemask_t scan_nodes; | ||
239 | unsigned long next_scan_node_update; | ||
240 | #endif | ||
234 | /* | 241 | /* |
235 | * Should the accounting and control be hierarchical, per subtree? | 242 | * Should the accounting and control be hierarchical, per subtree? |
236 | */ | 243 | */ |
@@ -585,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | |||
585 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 592 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
586 | } | 593 | } |
587 | 594 | ||
595 | void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) | ||
596 | { | ||
597 | this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); | ||
598 | } | ||
599 | |||
600 | void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) | ||
601 | { | ||
602 | this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); | ||
603 | } | ||
604 | |||
588 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, | 605 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, |
589 | enum mem_cgroup_events_index idx) | 606 | enum mem_cgroup_events_index idx) |
590 | { | 607 | { |
@@ -624,18 +641,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
624 | preempt_enable(); | 641 | preempt_enable(); |
625 | } | 642 | } |
626 | 643 | ||
644 | static unsigned long | ||
645 | mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) | ||
646 | { | ||
647 | struct mem_cgroup_per_zone *mz; | ||
648 | u64 total = 0; | ||
649 | int zid; | ||
650 | |||
651 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
652 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
653 | total += MEM_CGROUP_ZSTAT(mz, idx); | ||
654 | } | ||
655 | return total; | ||
656 | } | ||
627 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 657 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
628 | enum lru_list idx) | 658 | enum lru_list idx) |
629 | { | 659 | { |
630 | int nid, zid; | 660 | int nid; |
631 | struct mem_cgroup_per_zone *mz; | ||
632 | u64 total = 0; | 661 | u64 total = 0; |
633 | 662 | ||
634 | for_each_online_node(nid) | 663 | for_each_online_node(nid) |
635 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 664 | total += mem_cgroup_get_zonestat_node(mem, nid, idx); |
636 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
637 | total += MEM_CGROUP_ZSTAT(mz, idx); | ||
638 | } | ||
639 | return total; | 665 | return total; |
640 | } | 666 | } |
641 | 667 | ||
@@ -813,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | |||
813 | return (mem == root_mem_cgroup); | 839 | return (mem == root_mem_cgroup); |
814 | } | 840 | } |
815 | 841 | ||
842 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | ||
843 | { | ||
844 | struct mem_cgroup *mem; | ||
845 | |||
846 | if (!mm) | ||
847 | return; | ||
848 | |||
849 | rcu_read_lock(); | ||
850 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
851 | if (unlikely(!mem)) | ||
852 | goto out; | ||
853 | |||
854 | switch (idx) { | ||
855 | case PGMAJFAULT: | ||
856 | mem_cgroup_pgmajfault(mem, 1); | ||
857 | break; | ||
858 | case PGFAULT: | ||
859 | mem_cgroup_pgfault(mem, 1); | ||
860 | break; | ||
861 | default: | ||
862 | BUG(); | ||
863 | } | ||
864 | out: | ||
865 | rcu_read_unlock(); | ||
866 | } | ||
867 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); | ||
868 | |||
816 | /* | 869 | /* |
817 | * Following LRU functions are allowed to be used without PCG_LOCK. | 870 | * Following LRU functions are allowed to be used without PCG_LOCK. |
818 | * Operations are called by routine of global LRU independently from memcg. | 871 | * Operations are called by routine of global LRU independently from memcg. |
@@ -1064,9 +1117,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | |||
1064 | return (active > inactive); | 1117 | return (active > inactive); |
1065 | } | 1118 | } |
1066 | 1119 | ||
1067 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | 1120 | unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, |
1068 | struct zone *zone, | 1121 | struct zone *zone, |
1069 | enum lru_list lru) | 1122 | enum lru_list lru) |
1070 | { | 1123 | { |
1071 | int nid = zone_to_nid(zone); | 1124 | int nid = zone_to_nid(zone); |
1072 | int zid = zone_idx(zone); | 1125 | int zid = zone_idx(zone); |
@@ -1075,6 +1128,93 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | |||
1075 | return MEM_CGROUP_ZSTAT(mz, lru); | 1128 | return MEM_CGROUP_ZSTAT(mz, lru); |
1076 | } | 1129 | } |
1077 | 1130 | ||
1131 | #ifdef CONFIG_NUMA | ||
1132 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | ||
1133 | int nid) | ||
1134 | { | ||
1135 | unsigned long ret; | ||
1136 | |||
1137 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + | ||
1138 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); | ||
1139 | |||
1140 | return ret; | ||
1141 | } | ||
1142 | |||
1143 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | ||
1144 | { | ||
1145 | u64 total = 0; | ||
1146 | int nid; | ||
1147 | |||
1148 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1149 | total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); | ||
1150 | |||
1151 | return total; | ||
1152 | } | ||
1153 | |||
1154 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1155 | int nid) | ||
1156 | { | ||
1157 | unsigned long ret; | ||
1158 | |||
1159 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1160 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1161 | |||
1162 | return ret; | ||
1163 | } | ||
1164 | |||
1165 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) | ||
1166 | { | ||
1167 | u64 total = 0; | ||
1168 | int nid; | ||
1169 | |||
1170 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1171 | total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); | ||
1172 | |||
1173 | return total; | ||
1174 | } | ||
1175 | |||
1176 | static unsigned long | ||
1177 | mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) | ||
1178 | { | ||
1179 | return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); | ||
1180 | } | ||
1181 | |||
1182 | static unsigned long | ||
1183 | mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) | ||
1184 | { | ||
1185 | u64 total = 0; | ||
1186 | int nid; | ||
1187 | |||
1188 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1189 | total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); | ||
1190 | |||
1191 | return total; | ||
1192 | } | ||
1193 | |||
1194 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | ||
1195 | int nid) | ||
1196 | { | ||
1197 | enum lru_list l; | ||
1198 | u64 total = 0; | ||
1199 | |||
1200 | for_each_lru(l) | ||
1201 | total += mem_cgroup_get_zonestat_node(memcg, nid, l); | ||
1202 | |||
1203 | return total; | ||
1204 | } | ||
1205 | |||
1206 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) | ||
1207 | { | ||
1208 | u64 total = 0; | ||
1209 | int nid; | ||
1210 | |||
1211 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1212 | total += mem_cgroup_node_nr_lru_pages(memcg, nid); | ||
1213 | |||
1214 | return total; | ||
1215 | } | ||
1216 | #endif /* CONFIG_NUMA */ | ||
1217 | |||
1078 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | 1218 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, |
1079 | struct zone *zone) | 1219 | struct zone *zone) |
1080 | { | 1220 | { |
@@ -1418,6 +1558,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1418 | return ret; | 1558 | return ret; |
1419 | } | 1559 | } |
1420 | 1560 | ||
1561 | #if MAX_NUMNODES > 1 | ||
1562 | |||
1563 | /* | ||
1564 | * Always updating the nodemask is not very good - even if we have an empty | ||
1565 | * list or the wrong list here, we can start from some node and traverse all | ||
1566 | * nodes based on the zonelist. So update the list loosely once per 10 secs. | ||
1567 | * | ||
1568 | */ | ||
1569 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | ||
1570 | { | ||
1571 | int nid; | ||
1572 | |||
1573 | if (time_after(mem->next_scan_node_update, jiffies)) | ||
1574 | return; | ||
1575 | |||
1576 | mem->next_scan_node_update = jiffies + 10*HZ; | ||
1577 | /* make a nodemask where this memcg uses memory from */ | ||
1578 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; | ||
1579 | |||
1580 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | ||
1581 | |||
1582 | if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || | ||
1583 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) | ||
1584 | continue; | ||
1585 | |||
1586 | if (total_swap_pages && | ||
1587 | (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || | ||
1588 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) | ||
1589 | continue; | ||
1590 | node_clear(nid, mem->scan_nodes); | ||
1591 | } | ||
1592 | } | ||
1593 | |||
1594 | /* | ||
1595 | * Selecting a node where we start reclaim from. Because what we need is just | ||
1596 | * reducing usage counter, start from anywhere is O.K. Considering | ||
1597 | * memory reclaim from current node, there are pros. and cons. | ||
1598 | * | ||
1599 | * Freeing memory from current node means freeing memory from a node which | ||
1600 | * we'll use or we've used. So, it may make LRU bad. And if several threads | ||
1601 | * hit limits, it will see a contention on a node. But freeing from remote | ||
1602 | * node means more costs for memory reclaim because of memory latency. | ||
1603 | * | ||
1604 | * Now, we use round-robin. Better algorithm is welcomed. | ||
1605 | */ | ||
1606 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | ||
1607 | { | ||
1608 | int node; | ||
1609 | |||
1610 | mem_cgroup_may_update_nodemask(mem); | ||
1611 | node = mem->last_scanned_node; | ||
1612 | |||
1613 | node = next_node(node, mem->scan_nodes); | ||
1614 | if (node == MAX_NUMNODES) | ||
1615 | node = first_node(mem->scan_nodes); | ||
1616 | /* | ||
1617 | * We call this when we hit limit, not when pages are added to LRU. | ||
1618 | * No LRU may hold pages because all pages are UNEVICTABLE or | ||
1619 | * memcg is too small and all pages are not on LRU. In that case, | ||
1620 | * we use curret node. | ||
1621 | */ | ||
1622 | if (unlikely(node == MAX_NUMNODES)) | ||
1623 | node = numa_node_id(); | ||
1624 | |||
1625 | mem->last_scanned_node = node; | ||
1626 | return node; | ||
1627 | } | ||
1628 | |||
1629 | #else | ||
1630 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | ||
1631 | { | ||
1632 | return 0; | ||
1633 | } | ||
1634 | #endif | ||
1635 | |||
1421 | /* | 1636 | /* |
1422 | * Scan the hierarchy if needed to reclaim memory. We remember the last child | 1637 | * Scan the hierarchy if needed to reclaim memory. We remember the last child |
1423 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1638 | * we reclaimed from, so that we don't end up penalizing one child extensively |
@@ -1433,7 +1648,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1433 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1648 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
1434 | struct zone *zone, | 1649 | struct zone *zone, |
1435 | gfp_t gfp_mask, | 1650 | gfp_t gfp_mask, |
1436 | unsigned long reclaim_options) | 1651 | unsigned long reclaim_options, |
1652 | unsigned long *total_scanned) | ||
1437 | { | 1653 | { |
1438 | struct mem_cgroup *victim; | 1654 | struct mem_cgroup *victim; |
1439 | int ret, total = 0; | 1655 | int ret, total = 0; |
@@ -1442,6 +1658,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1442 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | 1658 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; |
1443 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | 1659 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; |
1444 | unsigned long excess; | 1660 | unsigned long excess; |
1661 | unsigned long nr_scanned; | ||
1445 | 1662 | ||
1446 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1663 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1447 | 1664 | ||
@@ -1484,10 +1701,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1484 | continue; | 1701 | continue; |
1485 | } | 1702 | } |
1486 | /* we use swappiness of local cgroup */ | 1703 | /* we use swappiness of local cgroup */ |
1487 | if (check_soft) | 1704 | if (check_soft) { |
1488 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1705 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1489 | noswap, get_swappiness(victim), zone); | 1706 | noswap, get_swappiness(victim), zone, |
1490 | else | 1707 | &nr_scanned); |
1708 | *total_scanned += nr_scanned; | ||
1709 | } else | ||
1491 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1710 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1492 | noswap, get_swappiness(victim)); | 1711 | noswap, get_swappiness(victim)); |
1493 | css_put(&victim->css); | 1712 | css_put(&victim->css); |
@@ -1503,7 +1722,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1503 | if (!res_counter_soft_limit_excess(&root_mem->res)) | 1722 | if (!res_counter_soft_limit_excess(&root_mem->res)) |
1504 | return total; | 1723 | return total; |
1505 | } else if (mem_cgroup_margin(root_mem)) | 1724 | } else if (mem_cgroup_margin(root_mem)) |
1506 | return 1 + total; | 1725 | return total; |
1507 | } | 1726 | } |
1508 | return total; | 1727 | return total; |
1509 | } | 1728 | } |
@@ -1928,7 +2147,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1928 | return CHARGE_WOULDBLOCK; | 2147 | return CHARGE_WOULDBLOCK; |
1929 | 2148 | ||
1930 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | 2149 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1931 | gfp_mask, flags); | 2150 | gfp_mask, flags, NULL); |
1932 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2151 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
1933 | return CHARGE_RETRY; | 2152 | return CHARGE_RETRY; |
1934 | /* | 2153 | /* |
@@ -3211,7 +3430,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3211 | break; | 3430 | break; |
3212 | 3431 | ||
3213 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3432 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
3214 | MEM_CGROUP_RECLAIM_SHRINK); | 3433 | MEM_CGROUP_RECLAIM_SHRINK, |
3434 | NULL); | ||
3215 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3435 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
3216 | /* Usage is reduced ? */ | 3436 | /* Usage is reduced ? */ |
3217 | if (curusage >= oldusage) | 3437 | if (curusage >= oldusage) |
@@ -3271,7 +3491,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3271 | 3491 | ||
3272 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3492 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
3273 | MEM_CGROUP_RECLAIM_NOSWAP | | 3493 | MEM_CGROUP_RECLAIM_NOSWAP | |
3274 | MEM_CGROUP_RECLAIM_SHRINK); | 3494 | MEM_CGROUP_RECLAIM_SHRINK, |
3495 | NULL); | ||
3275 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3496 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3276 | /* Usage is reduced ? */ | 3497 | /* Usage is reduced ? */ |
3277 | if (curusage >= oldusage) | 3498 | if (curusage >= oldusage) |
@@ -3285,7 +3506,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3285 | } | 3506 | } |
3286 | 3507 | ||
3287 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 3508 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
3288 | gfp_t gfp_mask) | 3509 | gfp_t gfp_mask, |
3510 | unsigned long *total_scanned) | ||
3289 | { | 3511 | { |
3290 | unsigned long nr_reclaimed = 0; | 3512 | unsigned long nr_reclaimed = 0; |
3291 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | 3513 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; |
@@ -3293,6 +3515,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3293 | int loop = 0; | 3515 | int loop = 0; |
3294 | struct mem_cgroup_tree_per_zone *mctz; | 3516 | struct mem_cgroup_tree_per_zone *mctz; |
3295 | unsigned long long excess; | 3517 | unsigned long long excess; |
3518 | unsigned long nr_scanned; | ||
3296 | 3519 | ||
3297 | if (order > 0) | 3520 | if (order > 0) |
3298 | return 0; | 3521 | return 0; |
@@ -3311,10 +3534,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3311 | if (!mz) | 3534 | if (!mz) |
3312 | break; | 3535 | break; |
3313 | 3536 | ||
3537 | nr_scanned = 0; | ||
3314 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | 3538 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, |
3315 | gfp_mask, | 3539 | gfp_mask, |
3316 | MEM_CGROUP_RECLAIM_SOFT); | 3540 | MEM_CGROUP_RECLAIM_SOFT, |
3541 | &nr_scanned); | ||
3317 | nr_reclaimed += reclaimed; | 3542 | nr_reclaimed += reclaimed; |
3543 | *total_scanned += nr_scanned; | ||
3318 | spin_lock(&mctz->lock); | 3544 | spin_lock(&mctz->lock); |
3319 | 3545 | ||
3320 | /* | 3546 | /* |
@@ -3337,10 +3563,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3337 | */ | 3563 | */ |
3338 | next_mz = | 3564 | next_mz = |
3339 | __mem_cgroup_largest_soft_limit_node(mctz); | 3565 | __mem_cgroup_largest_soft_limit_node(mctz); |
3340 | if (next_mz == mz) { | 3566 | if (next_mz == mz) |
3341 | css_put(&next_mz->mem->css); | 3567 | css_put(&next_mz->mem->css); |
3342 | next_mz = NULL; | 3568 | else /* next_mz == NULL or other memcg */ |
3343 | } else /* next_mz == NULL or other memcg */ | ||
3344 | break; | 3569 | break; |
3345 | } while (1); | 3570 | } while (1); |
3346 | } | 3571 | } |
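The hunks above extend mem_cgroup_hierarchical_reclaim() and mem_cgroup_soft_limit_reclaim() with a scan counter that is reported back to the caller: each soft-limit pass records how many pages it scanned and folds that into *total_scanned. A caller in the zone-reclaim path is then expected to feed that number into its own pressure accounting, roughly as in the sketch below (the sc.* names mirror struct scan_control; the exact call site is not part of this diff and the values are illustrative):

	unsigned long nr_soft_scanned = 0;
	unsigned long nr_soft_reclaimed;

	/* reclaim from cgroups over their soft limit before general reclaim */
	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, order,
							  sc.gfp_mask,
							  &nr_soft_scanned);
	sc.nr_reclaimed += nr_soft_reclaimed;	/* pages actually freed */
	sc.nr_scanned   += nr_soft_scanned;	/* pages looked at, counts as pressure */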
@@ -3772,6 +3997,8 @@ enum { | |||
3772 | MCS_PGPGIN, | 3997 | MCS_PGPGIN, |
3773 | MCS_PGPGOUT, | 3998 | MCS_PGPGOUT, |
3774 | MCS_SWAP, | 3999 | MCS_SWAP, |
4000 | MCS_PGFAULT, | ||
4001 | MCS_PGMAJFAULT, | ||
3775 | MCS_INACTIVE_ANON, | 4002 | MCS_INACTIVE_ANON, |
3776 | MCS_ACTIVE_ANON, | 4003 | MCS_ACTIVE_ANON, |
3777 | MCS_INACTIVE_FILE, | 4004 | MCS_INACTIVE_FILE, |
@@ -3794,6 +4021,8 @@ struct { | |||
3794 | {"pgpgin", "total_pgpgin"}, | 4021 | {"pgpgin", "total_pgpgin"}, |
3795 | {"pgpgout", "total_pgpgout"}, | 4022 | {"pgpgout", "total_pgpgout"}, |
3796 | {"swap", "total_swap"}, | 4023 | {"swap", "total_swap"}, |
4024 | {"pgfault", "total_pgfault"}, | ||
4025 | {"pgmajfault", "total_pgmajfault"}, | ||
3797 | {"inactive_anon", "total_inactive_anon"}, | 4026 | {"inactive_anon", "total_inactive_anon"}, |
3798 | {"active_anon", "total_active_anon"}, | 4027 | {"active_anon", "total_active_anon"}, |
3799 | {"inactive_file", "total_inactive_file"}, | 4028 | {"inactive_file", "total_inactive_file"}, |
@@ -3822,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | |||
3822 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 4051 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
3823 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 4052 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
3824 | } | 4053 | } |
4054 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); | ||
4055 | s->stat[MCS_PGFAULT] += val; | ||
4056 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); | ||
4057 | s->stat[MCS_PGMAJFAULT] += val; | ||
3825 | 4058 | ||
3826 | /* per zone stat */ | 4059 | /* per zone stat */ |
3827 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 4060 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); |
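With MCS_PGFAULT/MCS_PGMAJFAULT added to the stat table and read out in mem_cgroup_get_local_stat(), a cgroup's memory.stat file gains per-memcg fault counters; the producers are the handle_mm_fault() and do_swap_page() hunks later in this diff. A read would now include lines of the following shape (counter values are made up for illustration):

	pgfault 103258
	pgmajfault 14
	...
	total_pgfault 291843
	total_pgmajfault 39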
@@ -3845,6 +4078,51 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | |||
3845 | mem_cgroup_get_local_stat(iter, s); | 4078 | mem_cgroup_get_local_stat(iter, s); |
3846 | } | 4079 | } |
3847 | 4080 | ||
4081 | #ifdef CONFIG_NUMA | ||
4082 | static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | ||
4083 | { | ||
4084 | int nid; | ||
4085 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | ||
4086 | unsigned long node_nr; | ||
4087 | struct cgroup *cont = m->private; | ||
4088 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | ||
4089 | |||
4090 | total_nr = mem_cgroup_nr_lru_pages(mem_cont); | ||
4091 | seq_printf(m, "total=%lu", total_nr); | ||
4092 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
4093 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); | ||
4094 | seq_printf(m, " N%d=%lu", nid, node_nr); | ||
4095 | } | ||
4096 | seq_putc(m, '\n'); | ||
4097 | |||
4098 | file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); | ||
4099 | seq_printf(m, "file=%lu", file_nr); | ||
4100 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
4101 | node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); | ||
4102 | seq_printf(m, " N%d=%lu", nid, node_nr); | ||
4103 | } | ||
4104 | seq_putc(m, '\n'); | ||
4105 | |||
4106 | anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); | ||
4107 | seq_printf(m, "anon=%lu", anon_nr); | ||
4108 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
4109 | node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); | ||
4110 | seq_printf(m, " N%d=%lu", nid, node_nr); | ||
4111 | } | ||
4112 | seq_putc(m, '\n'); | ||
4113 | |||
4114 | unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); | ||
4115 | seq_printf(m, "unevictable=%lu", unevictable_nr); | ||
4116 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
4117 | node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, | ||
4118 | nid); | ||
4119 | seq_printf(m, " N%d=%lu", nid, node_nr); | ||
4120 | } | ||
4121 | seq_putc(m, '\n'); | ||
4122 | return 0; | ||
4123 | } | ||
4124 | #endif /* CONFIG_NUMA */ | ||
4125 | |||
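mem_control_numa_stat_show() prints one line per statistic, each beginning with the cgroup-wide count and followed by an N<node>=<pages> pair for every node with memory. On a two-node machine, reading the new memory.numa_stat file would therefore look roughly like this (the page counts are illustrative only):

	total=52779 N0=26348 N1=26431
	file=50304 N0=25129 N1=25175
	anon=2475 N0=1219 N1=1256
	unevictable=0 N0=0 N1=0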
3848 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4126 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
3849 | struct cgroup_map_cb *cb) | 4127 | struct cgroup_map_cb *cb) |
3850 | { | 4128 | { |
@@ -3855,6 +4133,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
3855 | memset(&mystat, 0, sizeof(mystat)); | 4133 | memset(&mystat, 0, sizeof(mystat)); |
3856 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 4134 | mem_cgroup_get_local_stat(mem_cont, &mystat); |
3857 | 4135 | ||
4136 | |||
3858 | for (i = 0; i < NR_MCS_STAT; i++) { | 4137 | for (i = 0; i < NR_MCS_STAT; i++) { |
3859 | if (i == MCS_SWAP && !do_swap_account) | 4138 | if (i == MCS_SWAP && !do_swap_account) |
3860 | continue; | 4139 | continue; |
@@ -4278,6 +4557,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4278 | return 0; | 4557 | return 0; |
4279 | } | 4558 | } |
4280 | 4559 | ||
4560 | #ifdef CONFIG_NUMA | ||
4561 | static const struct file_operations mem_control_numa_stat_file_operations = { | ||
4562 | .read = seq_read, | ||
4563 | .llseek = seq_lseek, | ||
4564 | .release = single_release, | ||
4565 | }; | ||
4566 | |||
4567 | static int mem_control_numa_stat_open(struct inode *unused, struct file *file) | ||
4568 | { | ||
4569 | struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; | ||
4570 | |||
4571 | file->f_op = &mem_control_numa_stat_file_operations; | ||
4572 | return single_open(file, mem_control_numa_stat_show, cont); | ||
4573 | } | ||
4574 | #endif /* CONFIG_NUMA */ | ||
4575 | |||
4281 | static struct cftype mem_cgroup_files[] = { | 4576 | static struct cftype mem_cgroup_files[] = { |
4282 | { | 4577 | { |
4283 | .name = "usage_in_bytes", | 4578 | .name = "usage_in_bytes", |
@@ -4341,6 +4636,12 @@ static struct cftype mem_cgroup_files[] = { | |||
4341 | .unregister_event = mem_cgroup_oom_unregister_event, | 4636 | .unregister_event = mem_cgroup_oom_unregister_event, |
4342 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 4637 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
4343 | }, | 4638 | }, |
4639 | #ifdef CONFIG_NUMA | ||
4640 | { | ||
4641 | .name = "numa_stat", | ||
4642 | .open = mem_control_numa_stat_open, | ||
4643 | }, | ||
4644 | #endif | ||
4344 | }; | 4645 | }; |
4345 | 4646 | ||
4346 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4647 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -4596,6 +4897,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4596 | res_counter_init(&mem->memsw, NULL); | 4897 | res_counter_init(&mem->memsw, NULL); |
4597 | } | 4898 | } |
4598 | mem->last_scanned_child = 0; | 4899 | mem->last_scanned_child = 0; |
4900 | mem->last_scanned_node = MAX_NUMNODES; | ||
4599 | INIT_LIST_HEAD(&mem->oom_notify); | 4901 | INIT_LIST_HEAD(&mem->oom_notify); |
4600 | 4902 | ||
4601 | if (parent) | 4903 | if (parent) |
@@ -4953,8 +5255,7 @@ static void mem_cgroup_clear_mc(void) | |||
4953 | 5255 | ||
4954 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 5256 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
4955 | struct cgroup *cgroup, | 5257 | struct cgroup *cgroup, |
4956 | struct task_struct *p, | 5258 | struct task_struct *p) |
4957 | bool threadgroup) | ||
4958 | { | 5259 | { |
4959 | int ret = 0; | 5260 | int ret = 0; |
4960 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | 5261 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); |
@@ -4993,8 +5294,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
4993 | 5294 | ||
4994 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | 5295 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, |
4995 | struct cgroup *cgroup, | 5296 | struct cgroup *cgroup, |
4996 | struct task_struct *p, | 5297 | struct task_struct *p) |
4997 | bool threadgroup) | ||
4998 | { | 5298 | { |
4999 | mem_cgroup_clear_mc(); | 5299 | mem_cgroup_clear_mc(); |
5000 | } | 5300 | } |
@@ -5112,8 +5412,7 @@ retry: | |||
5112 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 5412 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
5113 | struct cgroup *cont, | 5413 | struct cgroup *cont, |
5114 | struct cgroup *old_cont, | 5414 | struct cgroup *old_cont, |
5115 | struct task_struct *p, | 5415 | struct task_struct *p) |
5116 | bool threadgroup) | ||
5117 | { | 5416 | { |
5118 | struct mm_struct *mm; | 5417 | struct mm_struct *mm; |
5119 | 5418 | ||
@@ -5131,22 +5430,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
5131 | #else /* !CONFIG_MMU */ | 5430 | #else /* !CONFIG_MMU */ |
5132 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 5431 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
5133 | struct cgroup *cgroup, | 5432 | struct cgroup *cgroup, |
5134 | struct task_struct *p, | 5433 | struct task_struct *p) |
5135 | bool threadgroup) | ||
5136 | { | 5434 | { |
5137 | return 0; | 5435 | return 0; |
5138 | } | 5436 | } |
5139 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | 5437 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, |
5140 | struct cgroup *cgroup, | 5438 | struct cgroup *cgroup, |
5141 | struct task_struct *p, | 5439 | struct task_struct *p) |
5142 | bool threadgroup) | ||
5143 | { | 5440 | { |
5144 | } | 5441 | } |
5145 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 5442 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
5146 | struct cgroup *cont, | 5443 | struct cgroup *cont, |
5147 | struct cgroup *old_cont, | 5444 | struct cgroup *old_cont, |
5148 | struct task_struct *p, | 5445 | struct task_struct *p) |
5149 | bool threadgroup) | ||
5150 | { | 5446 | { |
5151 | } | 5447 | } |
5152 | #endif | 5448 | #endif |
@@ -5169,19 +5465,12 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5169 | static int __init enable_swap_account(char *s) | 5465 | static int __init enable_swap_account(char *s) |
5170 | { | 5466 | { |
5171 | /* consider enabled if no parameter or 1 is given */ | 5467 | /* consider enabled if no parameter or 1 is given */ |
5172 | if (!(*s) || !strcmp(s, "=1")) | 5468 | if (!strcmp(s, "1")) |
5173 | really_do_swap_account = 1; | 5469 | really_do_swap_account = 1; |
5174 | else if (!strcmp(s, "=0")) | 5470 | else if (!strcmp(s, "0")) |
5175 | really_do_swap_account = 0; | 5471 | really_do_swap_account = 0; |
5176 | return 1; | 5472 | return 1; |
5177 | } | 5473 | } |
5178 | __setup("swapaccount", enable_swap_account); | 5474 | __setup("swapaccount=", enable_swap_account); |
5179 | 5475 | ||
5180 | static int __init disable_swap_account(char *s) | ||
5181 | { | ||
5182 | printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); | ||
5183 | enable_swap_account("=0"); | ||
5184 | return 1; | ||
5185 | } | ||
5186 | __setup("noswapaccount", disable_swap_account); | ||
5187 | #endif | 5476 | #endif |
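The boot-parameter handling is tightened here: the __setup() key gains a trailing '=', so enable_swap_account() now receives only the text after it and compares against "1"/"0" instead of "=1"/"=0", and the long-deprecated noswapaccount alias is removed. On the kernel command line that leaves two accepted spellings:

	swapaccount=1    enable memcg swap accounting
	swapaccount=0    disable memcg swap accounting

One side effect worth noting: because the '=' is now part of the key, a bare "swapaccount" with no value no longer reaches the handler at all, even though the retained comment still mentions the no-parameter case.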
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2b9a5eef39e0..5c8f7e08928d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -239,7 +239,11 @@ void shake_page(struct page *p, int access) | |||
239 | if (access) { | 239 | if (access) { |
240 | int nr; | 240 | int nr; |
241 | do { | 241 | do { |
242 | nr = shrink_slab(1000, GFP_KERNEL, 1000); | 242 | struct shrink_control shrink = { |
243 | .gfp_mask = GFP_KERNEL, | ||
244 | }; | ||
245 | |||
246 | nr = shrink_slab(&shrink, 1000, 1000); | ||
243 | if (page_count(p) == 1) | 247 | if (page_count(p) == 1) |
244 | break; | 248 | break; |
245 | } while (nr > 10); | 249 | } while (nr > 10); |
@@ -429,7 +433,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
429 | */ | 433 | */ |
430 | 434 | ||
431 | read_lock(&tasklist_lock); | 435 | read_lock(&tasklist_lock); |
432 | spin_lock(&mapping->i_mmap_lock); | 436 | mutex_lock(&mapping->i_mmap_mutex); |
433 | for_each_process(tsk) { | 437 | for_each_process(tsk) { |
434 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 438 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
435 | 439 | ||
@@ -449,7 +453,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
449 | add_to_kill(tsk, page, vma, to_kill, tkc); | 453 | add_to_kill(tsk, page, vma, to_kill, tkc); |
450 | } | 454 | } |
451 | } | 455 | } |
452 | spin_unlock(&mapping->i_mmap_lock); | 456 | mutex_unlock(&mapping->i_mmap_mutex); |
453 | read_unlock(&tasklist_lock); | 457 | read_unlock(&tasklist_lock); |
454 | } | 458 | } |
455 | 459 | ||
@@ -1440,16 +1444,12 @@ int soft_offline_page(struct page *page, int flags) | |||
1440 | */ | 1444 | */ |
1441 | ret = invalidate_inode_page(page); | 1445 | ret = invalidate_inode_page(page); |
1442 | unlock_page(page); | 1446 | unlock_page(page); |
1443 | |||
1444 | /* | 1447 | /* |
1445 | * Drop count because page migration doesn't like raised | ||
1446 | * counts. The page could get re-allocated, but if it becomes | ||
1447 | * LRU the isolation will just fail. | ||
1448 | * RED-PEN would be better to keep it isolated here, but we | 1448 | * RED-PEN would be better to keep it isolated here, but we |
1449 | * would need to fix isolation locking first. | 1449 | * would need to fix isolation locking first. |
1450 | */ | 1450 | */ |
1451 | put_page(page); | ||
1452 | if (ret == 1) { | 1451 | if (ret == 1) { |
1452 | put_page(page); | ||
1453 | ret = 0; | 1453 | ret = 0; |
1454 | pr_info("soft_offline: %#lx: invalidated\n", pfn); | 1454 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
1455 | goto done; | 1455 | goto done; |
@@ -1461,6 +1461,11 @@ int soft_offline_page(struct page *page, int flags) | |||
1461 | * handles a large number of cases for us. | 1461 | * handles a large number of cases for us. |
1462 | */ | 1462 | */ |
1463 | ret = isolate_lru_page(page); | 1463 | ret = isolate_lru_page(page); |
1464 | /* | ||
1465 | * Drop the page reference that came from get_any_page(); a ||
1466 | * successful isolate_lru_page() already took another one. ||
1467 | */ | ||
1468 | put_page(page); | ||
1464 | if (!ret) { | 1469 | if (!ret) { |
1465 | LIST_HEAD(pagelist); | 1470 | LIST_HEAD(pagelist); |
1466 | 1471 | ||
diff --git a/mm/memory.c b/mm/memory.c index 61e66f026563..6953d3926e01 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | |||
182 | { | 182 | { |
183 | __sync_task_rss_stat(task, mm); | 183 | __sync_task_rss_stat(task, mm); |
184 | } | 184 | } |
185 | #else | 185 | #else /* SPLIT_RSS_COUNTING */ |
186 | 186 | ||
187 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | 187 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) |
188 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | 188 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) |
@@ -191,8 +191,205 @@ static void check_sync_rss_stat(struct task_struct *task) | |||
191 | { | 191 | { |
192 | } | 192 | } |
193 | 193 | ||
194 | #endif /* SPLIT_RSS_COUNTING */ | ||
195 | |||
196 | #ifdef HAVE_GENERIC_MMU_GATHER | ||
197 | |||
198 | static int tlb_next_batch(struct mmu_gather *tlb) | ||
199 | { | ||
200 | struct mmu_gather_batch *batch; | ||
201 | |||
202 | batch = tlb->active; | ||
203 | if (batch->next) { | ||
204 | tlb->active = batch->next; | ||
205 | return 1; | ||
206 | } | ||
207 | |||
208 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); | ||
209 | if (!batch) | ||
210 | return 0; | ||
211 | |||
212 | batch->next = NULL; | ||
213 | batch->nr = 0; | ||
214 | batch->max = MAX_GATHER_BATCH; | ||
215 | |||
216 | tlb->active->next = batch; | ||
217 | tlb->active = batch; | ||
218 | |||
219 | return 1; | ||
220 | } | ||
221 | |||
222 | /* tlb_gather_mmu | ||
223 | * Called to initialize an (on-stack) mmu_gather structure for page-table | ||
224 | * tear-down from @mm. The @fullmm argument is used when @mm is without | ||
225 | * users and we're going to destroy the full address space (exit/execve). | ||
226 | */ | ||
227 | void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) | ||
228 | { | ||
229 | tlb->mm = mm; | ||
230 | |||
231 | tlb->fullmm = fullmm; | ||
232 | tlb->need_flush = 0; | ||
233 | tlb->fast_mode = (num_possible_cpus() == 1); | ||
234 | tlb->local.next = NULL; | ||
235 | tlb->local.nr = 0; | ||
236 | tlb->local.max = ARRAY_SIZE(tlb->__pages); | ||
237 | tlb->active = &tlb->local; | ||
238 | |||
239 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | ||
240 | tlb->batch = NULL; | ||
241 | #endif | ||
242 | } | ||
243 | |||
244 | void tlb_flush_mmu(struct mmu_gather *tlb) | ||
245 | { | ||
246 | struct mmu_gather_batch *batch; | ||
247 | |||
248 | if (!tlb->need_flush) | ||
249 | return; | ||
250 | tlb->need_flush = 0; | ||
251 | tlb_flush(tlb); | ||
252 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | ||
253 | tlb_table_flush(tlb); | ||
194 | #endif | 254 | #endif |
195 | 255 | ||
256 | if (tlb_fast_mode(tlb)) | ||
257 | return; | ||
258 | |||
259 | for (batch = &tlb->local; batch; batch = batch->next) { | ||
260 | free_pages_and_swap_cache(batch->pages, batch->nr); | ||
261 | batch->nr = 0; | ||
262 | } | ||
263 | tlb->active = &tlb->local; | ||
264 | } | ||
265 | |||
266 | /* tlb_finish_mmu | ||
267 | * Called at the end of the shootdown operation to free up any resources | ||
268 | * that were required. | ||
269 | */ | ||
270 | void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | ||
271 | { | ||
272 | struct mmu_gather_batch *batch, *next; | ||
273 | |||
274 | tlb_flush_mmu(tlb); | ||
275 | |||
276 | /* keep the page table cache within bounds */ | ||
277 | check_pgt_cache(); | ||
278 | |||
279 | for (batch = tlb->local.next; batch; batch = next) { | ||
280 | next = batch->next; | ||
281 | free_pages((unsigned long)batch, 0); | ||
282 | } | ||
283 | tlb->local.next = NULL; | ||
284 | } | ||
285 | |||
286 | /* __tlb_remove_page | ||
287 | * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while | ||
288 | * handling the additional races in SMP caused by other CPUs caching valid | ||
289 | * mappings in their TLBs. Returns the number of free page slots left. | ||
290 | * When out of page slots we must call tlb_flush_mmu(). | ||
291 | */ | ||
292 | int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | ||
293 | { | ||
294 | struct mmu_gather_batch *batch; | ||
295 | |||
296 | tlb->need_flush = 1; | ||
297 | |||
298 | if (tlb_fast_mode(tlb)) { | ||
299 | free_page_and_swap_cache(page); | ||
300 | return 1; /* avoid calling tlb_flush_mmu() */ | ||
301 | } | ||
302 | |||
303 | batch = tlb->active; | ||
304 | batch->pages[batch->nr++] = page; | ||
305 | if (batch->nr == batch->max) { | ||
306 | if (!tlb_next_batch(tlb)) | ||
307 | return 0; | ||
308 | } | ||
309 | VM_BUG_ON(batch->nr > batch->max); | ||
310 | |||
311 | return batch->max - batch->nr; | ||
312 | } | ||
313 | |||
314 | #endif /* HAVE_GENERIC_MMU_GATHER */ | ||
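This block introduces the generic, batching mmu_gather that the unmap paths further down in this file are switched over to (see the zap_pte_range() and zap_page_range() hunks below). Condensed into a sketch, the calling convention looks like this; the helper and its page-array arguments are illustrative, not part of the patch:

	static void unmap_example(struct mm_struct *mm, struct page **pages, int nr,
				  unsigned long start, unsigned long end)
	{
		struct mmu_gather tlb;
		int i;

		tlb_gather_mmu(&tlb, mm, 0);		/* on-stack gather, not a full-mm teardown */
		for (i = 0; i < nr; i++) {
			/* returns the free slots left; 0 means the batch is full */
			if (!__tlb_remove_page(&tlb, pages[i]))
				tlb_flush_mmu(&tlb);	/* flush TLBs, free the batched pages */
		}
		tlb_finish_mmu(&tlb, start, end);	/* final flush and release of batch pages */
	}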
315 | |||
316 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | ||
317 | |||
318 | /* | ||
319 | * See the comment near struct mmu_table_batch. | ||
320 | */ | ||
321 | |||
322 | static void tlb_remove_table_smp_sync(void *arg) | ||
323 | { | ||
324 | /* Simply deliver the interrupt */ | ||
325 | } | ||
326 | |||
327 | static void tlb_remove_table_one(void *table) | ||
328 | { | ||
329 | /* | ||
330 | * This isn't an RCU grace period and hence the page-tables cannot be | ||
331 | * assumed to be actually RCU-freed. | ||
332 | * | ||
333 | * It is however sufficient for software page-table walkers that rely on | ||
334 | * IRQ disabling. See the comment near struct mmu_table_batch. | ||
335 | */ | ||
336 | smp_call_function(tlb_remove_table_smp_sync, NULL, 1); | ||
337 | __tlb_remove_table(table); | ||
338 | } | ||
339 | |||
340 | static void tlb_remove_table_rcu(struct rcu_head *head) | ||
341 | { | ||
342 | struct mmu_table_batch *batch; | ||
343 | int i; | ||
344 | |||
345 | batch = container_of(head, struct mmu_table_batch, rcu); | ||
346 | |||
347 | for (i = 0; i < batch->nr; i++) | ||
348 | __tlb_remove_table(batch->tables[i]); | ||
349 | |||
350 | free_page((unsigned long)batch); | ||
351 | } | ||
352 | |||
353 | void tlb_table_flush(struct mmu_gather *tlb) | ||
354 | { | ||
355 | struct mmu_table_batch **batch = &tlb->batch; | ||
356 | |||
357 | if (*batch) { | ||
358 | call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); | ||
359 | *batch = NULL; | ||
360 | } | ||
361 | } | ||
362 | |||
363 | void tlb_remove_table(struct mmu_gather *tlb, void *table) | ||
364 | { | ||
365 | struct mmu_table_batch **batch = &tlb->batch; | ||
366 | |||
367 | tlb->need_flush = 1; | ||
368 | |||
369 | /* | ||
370 | * When there are fewer than two users of this mm there cannot be a ||
371 | * concurrent page-table walk. | ||
372 | */ | ||
373 | if (atomic_read(&tlb->mm->mm_users) < 2) { | ||
374 | __tlb_remove_table(table); | ||
375 | return; | ||
376 | } | ||
377 | |||
378 | if (*batch == NULL) { | ||
379 | *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); | ||
380 | if (*batch == NULL) { | ||
381 | tlb_remove_table_one(table); | ||
382 | return; | ||
383 | } | ||
384 | (*batch)->nr = 0; | ||
385 | } | ||
386 | (*batch)->tables[(*batch)->nr++] = table; | ||
387 | if ((*batch)->nr == MAX_TABLE_BATCH) | ||
388 | tlb_table_flush(tlb); | ||
389 | } | ||
390 | |||
391 | #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ | ||
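CONFIG_HAVE_RCU_TABLE_FREE lets architectures whose software page-table walkers rely only on disabled interrupts defer the actual freeing of page-table pages. An architecture opting in is expected to route its page-table frees through tlb_remove_table() and to supply __tlb_remove_table() for the real free; the wiring below is an assumed minimal sketch, not something shown in this patch:

	/* arch side (sketch): hand a no-longer-referenced page-table page
	 * to the batching layer instead of freeing it directly */
	static inline void example_free_pte_page(struct mmu_gather *tlb,
						 struct page *pte_page)
	{
		tlb_remove_table(tlb, pte_page);	/* deferred via RCU callback or IPI sync */
	}

	/* ...and provide the real destructor, called once no walker can see it */
	void __tlb_remove_table(void *table)
	{
		struct page *page = table;	/* assumes page-table pages are what gets batched */

		__free_page(page);
	}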
392 | |||
196 | /* | 393 | /* |
197 | * If a p?d_bad entry is found while walking page tables, report | 394 | * If a p?d_bad entry is found while walking page tables, report |
198 | * the error, before resetting entry to p?d_none. Usually (but | 395 | * the error, before resetting entry to p?d_none. Usually (but |
@@ -533,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
533 | add_taint(TAINT_BAD_PAGE); | 730 | add_taint(TAINT_BAD_PAGE); |
534 | } | 731 | } |
535 | 732 | ||
536 | static inline int is_cow_mapping(unsigned int flags) | 733 | static inline int is_cow_mapping(vm_flags_t flags) |
537 | { | 734 | { |
538 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 735 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
539 | } | 736 | } |
@@ -909,26 +1106,24 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
909 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 1106 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
910 | struct vm_area_struct *vma, pmd_t *pmd, | 1107 | struct vm_area_struct *vma, pmd_t *pmd, |
911 | unsigned long addr, unsigned long end, | 1108 | unsigned long addr, unsigned long end, |
912 | long *zap_work, struct zap_details *details) | 1109 | struct zap_details *details) |
913 | { | 1110 | { |
914 | struct mm_struct *mm = tlb->mm; | 1111 | struct mm_struct *mm = tlb->mm; |
915 | pte_t *pte; | 1112 | int force_flush = 0; |
916 | spinlock_t *ptl; | ||
917 | int rss[NR_MM_COUNTERS]; | 1113 | int rss[NR_MM_COUNTERS]; |
1114 | spinlock_t *ptl; | ||
1115 | pte_t *pte; | ||
918 | 1116 | ||
1117 | again: | ||
919 | init_rss_vec(rss); | 1118 | init_rss_vec(rss); |
920 | |||
921 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 1119 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
922 | arch_enter_lazy_mmu_mode(); | 1120 | arch_enter_lazy_mmu_mode(); |
923 | do { | 1121 | do { |
924 | pte_t ptent = *pte; | 1122 | pte_t ptent = *pte; |
925 | if (pte_none(ptent)) { | 1123 | if (pte_none(ptent)) { |
926 | (*zap_work)--; | ||
927 | continue; | 1124 | continue; |
928 | } | 1125 | } |
929 | 1126 | ||
930 | (*zap_work) -= PAGE_SIZE; | ||
931 | |||
932 | if (pte_present(ptent)) { | 1127 | if (pte_present(ptent)) { |
933 | struct page *page; | 1128 | struct page *page; |
934 | 1129 | ||
@@ -974,7 +1169,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
974 | page_remove_rmap(page); | 1169 | page_remove_rmap(page); |
975 | if (unlikely(page_mapcount(page) < 0)) | 1170 | if (unlikely(page_mapcount(page) < 0)) |
976 | print_bad_pte(vma, addr, ptent, page); | 1171 | print_bad_pte(vma, addr, ptent, page); |
977 | tlb_remove_page(tlb, page); | 1172 | force_flush = !__tlb_remove_page(tlb, page); |
1173 | if (force_flush) | ||
1174 | break; | ||
978 | continue; | 1175 | continue; |
979 | } | 1176 | } |
980 | /* | 1177 | /* |
@@ -995,19 +1192,31 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
995 | print_bad_pte(vma, addr, ptent, NULL); | 1192 | print_bad_pte(vma, addr, ptent, NULL); |
996 | } | 1193 | } |
997 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 1194 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
998 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 1195 | } while (pte++, addr += PAGE_SIZE, addr != end); |
999 | 1196 | ||
1000 | add_mm_rss_vec(mm, rss); | 1197 | add_mm_rss_vec(mm, rss); |
1001 | arch_leave_lazy_mmu_mode(); | 1198 | arch_leave_lazy_mmu_mode(); |
1002 | pte_unmap_unlock(pte - 1, ptl); | 1199 | pte_unmap_unlock(pte - 1, ptl); |
1003 | 1200 | ||
1201 | /* | ||
1202 | * mmu_gather ran out of room to batch pages, we break out of | ||
1203 | * the PTE lock to avoid doing the potential expensive TLB invalidate | ||
1204 | * and page-free while holding it. | ||
1205 | */ | ||
1206 | if (force_flush) { | ||
1207 | force_flush = 0; | ||
1208 | tlb_flush_mmu(tlb); | ||
1209 | if (addr != end) | ||
1210 | goto again; | ||
1211 | } | ||
1212 | |||
1004 | return addr; | 1213 | return addr; |
1005 | } | 1214 | } |
1006 | 1215 | ||
1007 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | 1216 | static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, |
1008 | struct vm_area_struct *vma, pud_t *pud, | 1217 | struct vm_area_struct *vma, pud_t *pud, |
1009 | unsigned long addr, unsigned long end, | 1218 | unsigned long addr, unsigned long end, |
1010 | long *zap_work, struct zap_details *details) | 1219 | struct zap_details *details) |
1011 | { | 1220 | { |
1012 | pmd_t *pmd; | 1221 | pmd_t *pmd; |
1013 | unsigned long next; | 1222 | unsigned long next; |
@@ -1019,19 +1228,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1019 | if (next-addr != HPAGE_PMD_SIZE) { | 1228 | if (next-addr != HPAGE_PMD_SIZE) { |
1020 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1229 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); |
1021 | split_huge_page_pmd(vma->vm_mm, pmd); | 1230 | split_huge_page_pmd(vma->vm_mm, pmd); |
1022 | } else if (zap_huge_pmd(tlb, vma, pmd)) { | 1231 | } else if (zap_huge_pmd(tlb, vma, pmd)) |
1023 | (*zap_work)--; | ||
1024 | continue; | 1232 | continue; |
1025 | } | ||
1026 | /* fall through */ | 1233 | /* fall through */ |
1027 | } | 1234 | } |
1028 | if (pmd_none_or_clear_bad(pmd)) { | 1235 | if (pmd_none_or_clear_bad(pmd)) |
1029 | (*zap_work)--; | ||
1030 | continue; | 1236 | continue; |
1031 | } | 1237 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); |
1032 | next = zap_pte_range(tlb, vma, pmd, addr, next, | 1238 | cond_resched(); |
1033 | zap_work, details); | 1239 | } while (pmd++, addr = next, addr != end); |
1034 | } while (pmd++, addr = next, (addr != end && *zap_work > 0)); | ||
1035 | 1240 | ||
1036 | return addr; | 1241 | return addr; |
1037 | } | 1242 | } |
@@ -1039,7 +1244,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1039 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | 1244 | static inline unsigned long zap_pud_range(struct mmu_gather *tlb, |
1040 | struct vm_area_struct *vma, pgd_t *pgd, | 1245 | struct vm_area_struct *vma, pgd_t *pgd, |
1041 | unsigned long addr, unsigned long end, | 1246 | unsigned long addr, unsigned long end, |
1042 | long *zap_work, struct zap_details *details) | 1247 | struct zap_details *details) |
1043 | { | 1248 | { |
1044 | pud_t *pud; | 1249 | pud_t *pud; |
1045 | unsigned long next; | 1250 | unsigned long next; |
@@ -1047,13 +1252,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | |||
1047 | pud = pud_offset(pgd, addr); | 1252 | pud = pud_offset(pgd, addr); |
1048 | do { | 1253 | do { |
1049 | next = pud_addr_end(addr, end); | 1254 | next = pud_addr_end(addr, end); |
1050 | if (pud_none_or_clear_bad(pud)) { | 1255 | if (pud_none_or_clear_bad(pud)) |
1051 | (*zap_work)--; | ||
1052 | continue; | 1256 | continue; |
1053 | } | 1257 | next = zap_pmd_range(tlb, vma, pud, addr, next, details); |
1054 | next = zap_pmd_range(tlb, vma, pud, addr, next, | 1258 | } while (pud++, addr = next, addr != end); |
1055 | zap_work, details); | ||
1056 | } while (pud++, addr = next, (addr != end && *zap_work > 0)); | ||
1057 | 1259 | ||
1058 | return addr; | 1260 | return addr; |
1059 | } | 1261 | } |
@@ -1061,7 +1263,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, | |||
1061 | static unsigned long unmap_page_range(struct mmu_gather *tlb, | 1263 | static unsigned long unmap_page_range(struct mmu_gather *tlb, |
1062 | struct vm_area_struct *vma, | 1264 | struct vm_area_struct *vma, |
1063 | unsigned long addr, unsigned long end, | 1265 | unsigned long addr, unsigned long end, |
1064 | long *zap_work, struct zap_details *details) | 1266 | struct zap_details *details) |
1065 | { | 1267 | { |
1066 | pgd_t *pgd; | 1268 | pgd_t *pgd; |
1067 | unsigned long next; | 1269 | unsigned long next; |
@@ -1075,13 +1277,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1075 | pgd = pgd_offset(vma->vm_mm, addr); | 1277 | pgd = pgd_offset(vma->vm_mm, addr); |
1076 | do { | 1278 | do { |
1077 | next = pgd_addr_end(addr, end); | 1279 | next = pgd_addr_end(addr, end); |
1078 | if (pgd_none_or_clear_bad(pgd)) { | 1280 | if (pgd_none_or_clear_bad(pgd)) |
1079 | (*zap_work)--; | ||
1080 | continue; | 1281 | continue; |
1081 | } | 1282 | next = zap_pud_range(tlb, vma, pgd, addr, next, details); |
1082 | next = zap_pud_range(tlb, vma, pgd, addr, next, | 1283 | } while (pgd++, addr = next, addr != end); |
1083 | zap_work, details); | ||
1084 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | ||
1085 | tlb_end_vma(tlb, vma); | 1284 | tlb_end_vma(tlb, vma); |
1086 | mem_cgroup_uncharge_end(); | 1285 | mem_cgroup_uncharge_end(); |
1087 | 1286 | ||
@@ -1121,17 +1320,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1121 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | 1320 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() |
1122 | * drops the lock and schedules. | 1321 | * drops the lock and schedules. |
1123 | */ | 1322 | */ |
1124 | unsigned long unmap_vmas(struct mmu_gather **tlbp, | 1323 | unsigned long unmap_vmas(struct mmu_gather *tlb, |
1125 | struct vm_area_struct *vma, unsigned long start_addr, | 1324 | struct vm_area_struct *vma, unsigned long start_addr, |
1126 | unsigned long end_addr, unsigned long *nr_accounted, | 1325 | unsigned long end_addr, unsigned long *nr_accounted, |
1127 | struct zap_details *details) | 1326 | struct zap_details *details) |
1128 | { | 1327 | { |
1129 | long zap_work = ZAP_BLOCK_SIZE; | ||
1130 | unsigned long tlb_start = 0; /* For tlb_finish_mmu */ | ||
1131 | int tlb_start_valid = 0; | ||
1132 | unsigned long start = start_addr; | 1328 | unsigned long start = start_addr; |
1133 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | ||
1134 | int fullmm = (*tlbp)->fullmm; | ||
1135 | struct mm_struct *mm = vma->vm_mm; | 1329 | struct mm_struct *mm = vma->vm_mm; |
1136 | 1330 | ||
1137 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | 1331 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); |
@@ -1152,11 +1346,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
1152 | untrack_pfn_vma(vma, 0, 0); | 1346 | untrack_pfn_vma(vma, 0, 0); |
1153 | 1347 | ||
1154 | while (start != end) { | 1348 | while (start != end) { |
1155 | if (!tlb_start_valid) { | ||
1156 | tlb_start = start; | ||
1157 | tlb_start_valid = 1; | ||
1158 | } | ||
1159 | |||
1160 | if (unlikely(is_vm_hugetlb_page(vma))) { | 1349 | if (unlikely(is_vm_hugetlb_page(vma))) { |
1161 | /* | 1350 | /* |
1162 | * It is undesirable to test vma->vm_file as it | 1351 | * It is undesirable to test vma->vm_file as it |
@@ -1169,39 +1358,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
1169 | * Since no pte has actually been setup, it is | 1358 | * Since no pte has actually been setup, it is |
1170 | * safe to do nothing in this case. | 1359 | * safe to do nothing in this case. |
1171 | */ | 1360 | */ |
1172 | if (vma->vm_file) { | 1361 | if (vma->vm_file) |
1173 | unmap_hugepage_range(vma, start, end, NULL); | 1362 | unmap_hugepage_range(vma, start, end, NULL); |
1174 | zap_work -= (end - start) / | ||
1175 | pages_per_huge_page(hstate_vma(vma)); | ||
1176 | } | ||
1177 | 1363 | ||
1178 | start = end; | 1364 | start = end; |
1179 | } else | 1365 | } else |
1180 | start = unmap_page_range(*tlbp, vma, | 1366 | start = unmap_page_range(tlb, vma, start, end, details); |
1181 | start, end, &zap_work, details); | ||
1182 | |||
1183 | if (zap_work > 0) { | ||
1184 | BUG_ON(start != end); | ||
1185 | break; | ||
1186 | } | ||
1187 | |||
1188 | tlb_finish_mmu(*tlbp, tlb_start, start); | ||
1189 | |||
1190 | if (need_resched() || | ||
1191 | (i_mmap_lock && spin_needbreak(i_mmap_lock))) { | ||
1192 | if (i_mmap_lock) { | ||
1193 | *tlbp = NULL; | ||
1194 | goto out; | ||
1195 | } | ||
1196 | cond_resched(); | ||
1197 | } | ||
1198 | |||
1199 | *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); | ||
1200 | tlb_start_valid = 0; | ||
1201 | zap_work = ZAP_BLOCK_SIZE; | ||
1202 | } | 1367 | } |
1203 | } | 1368 | } |
1204 | out: | 1369 | |
1205 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | 1370 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); |
1206 | return start; /* which is now the end (or restart) address */ | 1371 | return start; /* which is now the end (or restart) address */ |
1207 | } | 1372 | } |
@@ -1217,16 +1382,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
1217 | unsigned long size, struct zap_details *details) | 1382 | unsigned long size, struct zap_details *details) |
1218 | { | 1383 | { |
1219 | struct mm_struct *mm = vma->vm_mm; | 1384 | struct mm_struct *mm = vma->vm_mm; |
1220 | struct mmu_gather *tlb; | 1385 | struct mmu_gather tlb; |
1221 | unsigned long end = address + size; | 1386 | unsigned long end = address + size; |
1222 | unsigned long nr_accounted = 0; | 1387 | unsigned long nr_accounted = 0; |
1223 | 1388 | ||
1224 | lru_add_drain(); | 1389 | lru_add_drain(); |
1225 | tlb = tlb_gather_mmu(mm, 0); | 1390 | tlb_gather_mmu(&tlb, mm, 0); |
1226 | update_hiwater_rss(mm); | 1391 | update_hiwater_rss(mm); |
1227 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); | 1392 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); |
1228 | if (tlb) | 1393 | tlb_finish_mmu(&tlb, address, end); |
1229 | tlb_finish_mmu(tlb, address, end); | ||
1230 | return end; | 1394 | return end; |
1231 | } | 1395 | } |
1232 | 1396 | ||
@@ -2535,96 +2699,11 @@ unwritable_page: | |||
2535 | return ret; | 2699 | return ret; |
2536 | } | 2700 | } |
2537 | 2701 | ||
2538 | /* | 2702 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
2539 | * Helper functions for unmap_mapping_range(). | ||
2540 | * | ||
2541 | * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ | ||
2542 | * | ||
2543 | * We have to restart searching the prio_tree whenever we drop the lock, | ||
2544 | * since the iterator is only valid while the lock is held, and anyway | ||
2545 | * a later vma might be split and reinserted earlier while lock dropped. | ||
2546 | * | ||
2547 | * The list of nonlinear vmas could be handled more efficiently, using | ||
2548 | * a placeholder, but handle it in the same way until a need is shown. | ||
2549 | * It is important to search the prio_tree before nonlinear list: a vma | ||
2550 | * may become nonlinear and be shifted from prio_tree to nonlinear list | ||
2551 | * while the lock is dropped; but never shifted from list to prio_tree. | ||
2552 | * | ||
2553 | * In order to make forward progress despite restarting the search, | ||
2554 | * vm_truncate_count is used to mark a vma as now dealt with, so we can | ||
2555 | * quickly skip it next time around. Since the prio_tree search only | ||
2556 | * shows us those vmas affected by unmapping the range in question, we | ||
2557 | * can't efficiently keep all vmas in step with mapping->truncate_count: | ||
2558 | * so instead reset them all whenever it wraps back to 0 (then go to 1). | ||
2559 | * mapping->truncate_count and vma->vm_truncate_count are protected by | ||
2560 | * i_mmap_lock. | ||
2561 | * | ||
2562 | * In order to make forward progress despite repeatedly restarting some | ||
2563 | * large vma, note the restart_addr from unmap_vmas when it breaks out: | ||
2564 | * and restart from that address when we reach that vma again. It might | ||
2565 | * have been split or merged, shrunk or extended, but never shifted: so | ||
2566 | * restart_addr remains valid so long as it remains in the vma's range. | ||
2567 | * unmap_mapping_range forces truncate_count to leap over page-aligned | ||
2568 | * values so we can save vma's restart_addr in its truncate_count field. | ||
2569 | */ | ||
2570 | #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) | ||
2571 | |||
2572 | static void reset_vma_truncate_counts(struct address_space *mapping) | ||
2573 | { | ||
2574 | struct vm_area_struct *vma; | ||
2575 | struct prio_tree_iter iter; | ||
2576 | |||
2577 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
2578 | vma->vm_truncate_count = 0; | ||
2579 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
2580 | vma->vm_truncate_count = 0; | ||
2581 | } | ||
2582 | |||
2583 | static int unmap_mapping_range_vma(struct vm_area_struct *vma, | ||
2584 | unsigned long start_addr, unsigned long end_addr, | 2703 | unsigned long start_addr, unsigned long end_addr, |
2585 | struct zap_details *details) | 2704 | struct zap_details *details) |
2586 | { | 2705 | { |
2587 | unsigned long restart_addr; | 2706 | zap_page_range(vma, start_addr, end_addr - start_addr, details); |
2588 | int need_break; | ||
2589 | |||
2590 | /* | ||
2591 | * files that support invalidating or truncating portions of the | ||
2592 | * file from under mmaped areas must have their ->fault function | ||
2593 | * return a locked page (and set VM_FAULT_LOCKED in the return). | ||
2594 | * This provides synchronisation against concurrent unmapping here. | ||
2595 | */ | ||
2596 | |||
2597 | again: | ||
2598 | restart_addr = vma->vm_truncate_count; | ||
2599 | if (is_restart_addr(restart_addr) && start_addr < restart_addr) { | ||
2600 | start_addr = restart_addr; | ||
2601 | if (start_addr >= end_addr) { | ||
2602 | /* Top of vma has been split off since last time */ | ||
2603 | vma->vm_truncate_count = details->truncate_count; | ||
2604 | return 0; | ||
2605 | } | ||
2606 | } | ||
2607 | |||
2608 | restart_addr = zap_page_range(vma, start_addr, | ||
2609 | end_addr - start_addr, details); | ||
2610 | need_break = need_resched() || spin_needbreak(details->i_mmap_lock); | ||
2611 | |||
2612 | if (restart_addr >= end_addr) { | ||
2613 | /* We have now completed this vma: mark it so */ | ||
2614 | vma->vm_truncate_count = details->truncate_count; | ||
2615 | if (!need_break) | ||
2616 | return 0; | ||
2617 | } else { | ||
2618 | /* Note restart_addr in vma's truncate_count field */ | ||
2619 | vma->vm_truncate_count = restart_addr; | ||
2620 | if (!need_break) | ||
2621 | goto again; | ||
2622 | } | ||
2623 | |||
2624 | spin_unlock(details->i_mmap_lock); | ||
2625 | cond_resched(); | ||
2626 | spin_lock(details->i_mmap_lock); | ||
2627 | return -EINTR; | ||
2628 | } | 2707 | } |
2629 | 2708 | ||
2630 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 2709 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, |
@@ -2634,12 +2713,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | |||
2634 | struct prio_tree_iter iter; | 2713 | struct prio_tree_iter iter; |
2635 | pgoff_t vba, vea, zba, zea; | 2714 | pgoff_t vba, vea, zba, zea; |
2636 | 2715 | ||
2637 | restart: | ||
2638 | vma_prio_tree_foreach(vma, &iter, root, | 2716 | vma_prio_tree_foreach(vma, &iter, root, |
2639 | details->first_index, details->last_index) { | 2717 | details->first_index, details->last_index) { |
2640 | /* Skip quickly over those we have already dealt with */ | ||
2641 | if (vma->vm_truncate_count == details->truncate_count) | ||
2642 | continue; | ||
2643 | 2718 | ||
2644 | vba = vma->vm_pgoff; | 2719 | vba = vma->vm_pgoff; |
2645 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; | 2720 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; |
@@ -2651,11 +2726,10 @@ restart: | |||
2651 | if (zea > vea) | 2726 | if (zea > vea) |
2652 | zea = vea; | 2727 | zea = vea; |
2653 | 2728 | ||
2654 | if (unmap_mapping_range_vma(vma, | 2729 | unmap_mapping_range_vma(vma, |
2655 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, | 2730 | ((zba - vba) << PAGE_SHIFT) + vma->vm_start, |
2656 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, | 2731 | ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, |
2657 | details) < 0) | 2732 | details); |
2658 | goto restart; | ||
2659 | } | 2733 | } |
2660 | } | 2734 | } |
2661 | 2735 | ||
@@ -2670,15 +2744,9 @@ static inline void unmap_mapping_range_list(struct list_head *head, | |||
2670 | * across *all* the pages in each nonlinear VMA, not just the pages | 2744 | * across *all* the pages in each nonlinear VMA, not just the pages |
2671 | * whose virtual address lies outside the file truncation point. | 2745 | * whose virtual address lies outside the file truncation point. |
2672 | */ | 2746 | */ |
2673 | restart: | ||
2674 | list_for_each_entry(vma, head, shared.vm_set.list) { | 2747 | list_for_each_entry(vma, head, shared.vm_set.list) { |
2675 | /* Skip quickly over those we have already dealt with */ | ||
2676 | if (vma->vm_truncate_count == details->truncate_count) | ||
2677 | continue; | ||
2678 | details->nonlinear_vma = vma; | 2748 | details->nonlinear_vma = vma; |
2679 | if (unmap_mapping_range_vma(vma, vma->vm_start, | 2749 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); |
2680 | vma->vm_end, details) < 0) | ||
2681 | goto restart; | ||
2682 | } | 2750 | } |
2683 | } | 2751 | } |
2684 | 2752 | ||
@@ -2717,26 +2785,14 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2717 | details.last_index = hba + hlen - 1; | 2785 | details.last_index = hba + hlen - 1; |
2718 | if (details.last_index < details.first_index) | 2786 | if (details.last_index < details.first_index) |
2719 | details.last_index = ULONG_MAX; | 2787 | details.last_index = ULONG_MAX; |
2720 | details.i_mmap_lock = &mapping->i_mmap_lock; | ||
2721 | 2788 | ||
2722 | mutex_lock(&mapping->unmap_mutex); | ||
2723 | spin_lock(&mapping->i_mmap_lock); | ||
2724 | |||
2725 | /* Protect against endless unmapping loops */ | ||
2726 | mapping->truncate_count++; | ||
2727 | if (unlikely(is_restart_addr(mapping->truncate_count))) { | ||
2728 | if (mapping->truncate_count == 0) | ||
2729 | reset_vma_truncate_counts(mapping); | ||
2730 | mapping->truncate_count++; | ||
2731 | } | ||
2732 | details.truncate_count = mapping->truncate_count; | ||
2733 | 2789 | ||
2790 | mutex_lock(&mapping->i_mmap_mutex); | ||
2734 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | 2791 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) |
2735 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2792 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2736 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2793 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2737 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2794 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
2738 | spin_unlock(&mapping->i_mmap_lock); | 2795 | mutex_unlock(&mapping->i_mmap_mutex); |
2739 | mutex_unlock(&mapping->unmap_mutex); | ||
2740 | } | 2796 | } |
2741 | EXPORT_SYMBOL(unmap_mapping_range); | 2797 | EXPORT_SYMBOL(unmap_mapping_range); |
2742 | 2798 | ||
@@ -2818,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2818 | /* Had to read the page from swap area: Major fault */ | 2874 | /* Had to read the page from swap area: Major fault */ |
2819 | ret = VM_FAULT_MAJOR; | 2875 | ret = VM_FAULT_MAJOR; |
2820 | count_vm_event(PGMAJFAULT); | 2876 | count_vm_event(PGMAJFAULT); |
2877 | mem_cgroup_count_vm_event(mm, PGMAJFAULT); | ||
2821 | } else if (PageHWPoison(page)) { | 2878 | } else if (PageHWPoison(page)) { |
2822 | /* | 2879 | /* |
2823 | * hwpoisoned dirty swapcache pages are kept for killing | 2880 | * hwpoisoned dirty swapcache pages are kept for killing |
@@ -2966,7 +3023,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2966 | if (prev && prev->vm_end == address) | 3023 | if (prev && prev->vm_end == address) |
2967 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; | 3024 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; |
2968 | 3025 | ||
2969 | expand_stack(vma, address - PAGE_SIZE); | 3026 | expand_downwards(vma, address - PAGE_SIZE); |
2970 | } | 3027 | } |
2971 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { | 3028 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { |
2972 | struct vm_area_struct *next = vma->vm_next; | 3029 | struct vm_area_struct *next = vma->vm_next; |
@@ -3357,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3357 | __set_current_state(TASK_RUNNING); | 3414 | __set_current_state(TASK_RUNNING); |
3358 | 3415 | ||
3359 | count_vm_event(PGFAULT); | 3416 | count_vm_event(PGFAULT); |
3417 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
3360 | 3418 | ||
3361 | /* do counter updates before entering really critical section. */ | 3419 | /* do counter updates before entering really critical section. */ |
3362 | check_sync_rss_stat(current); | 3420 | check_sync_rss_stat(current); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ca1d604f7cd..9f646374e32f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -374,10 +374,6 @@ void online_page(struct page *page) | |||
374 | totalhigh_pages++; | 374 | totalhigh_pages++; |
375 | #endif | 375 | #endif |
376 | 376 | ||
377 | #ifdef CONFIG_FLATMEM | ||
378 | max_mapnr = max(pfn, max_mapnr); | ||
379 | #endif | ||
380 | |||
381 | ClearPageReserved(page); | 377 | ClearPageReserved(page); |
382 | init_page_count(page); | 378 | init_page_count(page); |
383 | __free_page(page); | 379 | __free_page(page); |
@@ -400,7 +396,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
400 | } | 396 | } |
401 | 397 | ||
402 | 398 | ||
403 | int online_pages(unsigned long pfn, unsigned long nr_pages) | 399 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages) |
404 | { | 400 | { |
405 | unsigned long onlined_pages = 0; | 401 | unsigned long onlined_pages = 0; |
406 | struct zone *zone; | 402 | struct zone *zone; |
@@ -459,8 +455,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
459 | zone_pcp_update(zone); | 455 | zone_pcp_update(zone); |
460 | 456 | ||
461 | mutex_unlock(&zonelists_mutex); | 457 | mutex_unlock(&zonelists_mutex); |
462 | setup_per_zone_wmarks(); | 458 | |
463 | calculate_zone_inactive_ratio(zone); | 459 | init_per_zone_wmark_min(); |
460 | |||
464 | if (onlined_pages) { | 461 | if (onlined_pages) { |
465 | kswapd_run(zone_to_nid(zone)); | 462 | kswapd_run(zone_to_nid(zone)); |
466 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 463 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
@@ -705,7 +702,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
705 | if (!pfn_valid(pfn)) | 702 | if (!pfn_valid(pfn)) |
706 | continue; | 703 | continue; |
707 | page = pfn_to_page(pfn); | 704 | page = pfn_to_page(pfn); |
708 | if (!page_count(page)) | 705 | if (!get_page_unless_zero(page)) |
709 | continue; | 706 | continue; |
710 | /* | 707 | /* |
711 | * We can skip free pages. And we can only deal with pages on | 708 | * We can skip free pages. And we can only deal with pages on |
@@ -713,6 +710,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
713 | */ | 710 | */ |
714 | ret = isolate_lru_page(page); | 711 | ret = isolate_lru_page(page); |
715 | if (!ret) { /* Success */ | 712 | if (!ret) { /* Success */ |
713 | put_page(page); | ||
716 | list_add_tail(&page->lru, &source); | 714 | list_add_tail(&page->lru, &source); |
717 | move_pages--; | 715 | move_pages--; |
718 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 716 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
@@ -724,6 +722,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
724 | pfn); | 722 | pfn); |
725 | dump_page(page); | 723 | dump_page(page); |
726 | #endif | 724 | #endif |
725 | put_page(page); | ||
727 | /* Because we don't have big zone->lock, we should | 726 | /* Because we don't have big zone->lock, we should |
728 | check this again here. */ | 727 | check this again here. */ |
729 | if (page_count(page)) { | 728 | if (page_count(page)) { |
@@ -795,7 +794,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
795 | return offlined; | 794 | return offlined; |
796 | } | 795 | } |
797 | 796 | ||
798 | static int offline_pages(unsigned long start_pfn, | 797 | static int __ref offline_pages(unsigned long start_pfn, |
799 | unsigned long end_pfn, unsigned long timeout) | 798 | unsigned long end_pfn, unsigned long timeout) |
800 | { | 799 | { |
801 | unsigned long pfn, nr_pages, expire; | 800 | unsigned long pfn, nr_pages, expire; |
@@ -893,8 +892,8 @@ repeat: | |||
893 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 892 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
894 | totalram_pages -= offlined_pages; | 893 | totalram_pages -= offlined_pages; |
895 | 894 | ||
896 | setup_per_zone_wmarks(); | 895 | init_per_zone_wmark_min(); |
897 | calculate_zone_inactive_ratio(zone); | 896 | |
898 | if (!node_present_pages(node)) { | 897 | if (!node_present_pages(node)) { |
899 | node_clear_state(node, N_HIGH_MEMORY); | 898 | node_clear_state(node, N_HIGH_MEMORY); |
900 | kswapd_stop(node); | 899 | kswapd_stop(node); |
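The do_migrate_range() hunks above close a small race: each candidate page is now pinned with get_page_unless_zero() before it is examined, and that temporary reference is dropped again right after isolate_lru_page(), which takes its own reference when it succeeds. Pulled out of the surrounding code, the pattern is the following sketch (declarations and the failure bookkeeping are trimmed):

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!get_page_unless_zero(page))	/* free, or already on its way out */
			continue;
		ret = isolate_lru_page(page);		/* takes its own reference on success */
		put_page(page);				/* drop the temporary pin either way */
		if (!ret)
			list_add_tail(&page->lru, &source);
	}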
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 959a8b8c7350..e7fb9d25c54e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -99,7 +99,6 @@ | |||
99 | /* Internal flags */ | 99 | /* Internal flags */ |
100 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | 100 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ |
101 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 101 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
102 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | ||
103 | 102 | ||
104 | static struct kmem_cache *policy_cache; | 103 | static struct kmem_cache *policy_cache; |
105 | static struct kmem_cache *sn_cache; | 104 | static struct kmem_cache *sn_cache; |
@@ -457,7 +456,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | |||
457 | }, | 456 | }, |
458 | }; | 457 | }; |
459 | 458 | ||
460 | static void gather_stats(struct page *, void *, int pte_dirty); | ||
461 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 459 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
462 | unsigned long flags); | 460 | unsigned long flags); |
463 | 461 | ||
@@ -492,9 +490,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
492 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 490 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
493 | continue; | 491 | continue; |
494 | 492 | ||
495 | if (flags & MPOL_MF_STATS) | 493 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
496 | gather_stats(page, private, pte_dirty(*pte)); | ||
497 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
498 | migrate_page_add(page, private, flags); | 494 | migrate_page_add(page, private, flags); |
499 | else | 495 | else |
500 | break; | 496 | break; |
@@ -1489,7 +1485,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1489 | * freeing by another task. It is the caller's responsibility to free the | 1485 | * freeing by another task. It is the caller's responsibility to free the |
1490 | * extra reference for shared policies. | 1486 | * extra reference for shared policies. |
1491 | */ | 1487 | */ |
1492 | static struct mempolicy *get_vma_policy(struct task_struct *task, | 1488 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1493 | struct vm_area_struct *vma, unsigned long addr) | 1489 | struct vm_area_struct *vma, unsigned long addr) |
1494 | { | 1490 | { |
1495 | struct mempolicy *pol = task->mempolicy; | 1491 | struct mempolicy *pol = task->mempolicy; |
@@ -2529,159 +2525,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) | |||
2529 | } | 2525 | } |
2530 | return p - buffer; | 2526 | return p - buffer; |
2531 | } | 2527 | } |
2532 | |||
2533 | struct numa_maps { | ||
2534 | unsigned long pages; | ||
2535 | unsigned long anon; | ||
2536 | unsigned long active; | ||
2537 | unsigned long writeback; | ||
2538 | unsigned long mapcount_max; | ||
2539 | unsigned long dirty; | ||
2540 | unsigned long swapcache; | ||
2541 | unsigned long node[MAX_NUMNODES]; | ||
2542 | }; | ||
2543 | |||
2544 | static void gather_stats(struct page *page, void *private, int pte_dirty) | ||
2545 | { | ||
2546 | struct numa_maps *md = private; | ||
2547 | int count = page_mapcount(page); | ||
2548 | |||
2549 | md->pages++; | ||
2550 | if (pte_dirty || PageDirty(page)) | ||
2551 | md->dirty++; | ||
2552 | |||
2553 | if (PageSwapCache(page)) | ||
2554 | md->swapcache++; | ||
2555 | |||
2556 | if (PageActive(page) || PageUnevictable(page)) | ||
2557 | md->active++; | ||
2558 | |||
2559 | if (PageWriteback(page)) | ||
2560 | md->writeback++; | ||
2561 | |||
2562 | if (PageAnon(page)) | ||
2563 | md->anon++; | ||
2564 | |||
2565 | if (count > md->mapcount_max) | ||
2566 | md->mapcount_max = count; | ||
2567 | |||
2568 | md->node[page_to_nid(page)]++; | ||
2569 | } | ||
2570 | |||
2571 | #ifdef CONFIG_HUGETLB_PAGE | ||
2572 | static void check_huge_range(struct vm_area_struct *vma, | ||
2573 | unsigned long start, unsigned long end, | ||
2574 | struct numa_maps *md) | ||
2575 | { | ||
2576 | unsigned long addr; | ||
2577 | struct page *page; | ||
2578 | struct hstate *h = hstate_vma(vma); | ||
2579 | unsigned long sz = huge_page_size(h); | ||
2580 | |||
2581 | for (addr = start; addr < end; addr += sz) { | ||
2582 | pte_t *ptep = huge_pte_offset(vma->vm_mm, | ||
2583 | addr & huge_page_mask(h)); | ||
2584 | pte_t pte; | ||
2585 | |||
2586 | if (!ptep) | ||
2587 | continue; | ||
2588 | |||
2589 | pte = *ptep; | ||
2590 | if (pte_none(pte)) | ||
2591 | continue; | ||
2592 | |||
2593 | page = pte_page(pte); | ||
2594 | if (!page) | ||
2595 | continue; | ||
2596 | |||
2597 | gather_stats(page, md, pte_dirty(*ptep)); | ||
2598 | } | ||
2599 | } | ||
2600 | #else | ||
2601 | static inline void check_huge_range(struct vm_area_struct *vma, | ||
2602 | unsigned long start, unsigned long end, | ||
2603 | struct numa_maps *md) | ||
2604 | { | ||
2605 | } | ||
2606 | #endif | ||
2607 | |||
2608 | /* | ||
2609 | * Display pages allocated per node and memory policy via /proc. | ||
2610 | */ | ||
2611 | int show_numa_map(struct seq_file *m, void *v) | ||
2612 | { | ||
2613 | struct proc_maps_private *priv = m->private; | ||
2614 | struct vm_area_struct *vma = v; | ||
2615 | struct numa_maps *md; | ||
2616 | struct file *file = vma->vm_file; | ||
2617 | struct mm_struct *mm = vma->vm_mm; | ||
2618 | struct mempolicy *pol; | ||
2619 | int n; | ||
2620 | char buffer[50]; | ||
2621 | |||
2622 | if (!mm) | ||
2623 | return 0; | ||
2624 | |||
2625 | md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); | ||
2626 | if (!md) | ||
2627 | return 0; | ||
2628 | |||
2629 | pol = get_vma_policy(priv->task, vma, vma->vm_start); | ||
2630 | mpol_to_str(buffer, sizeof(buffer), pol, 0); | ||
2631 | mpol_cond_put(pol); | ||
2632 | |||
2633 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | ||
2634 | |||
2635 | if (file) { | ||
2636 | seq_printf(m, " file="); | ||
2637 | seq_path(m, &file->f_path, "\n\t= "); | ||
2638 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | ||
2639 | seq_printf(m, " heap"); | ||
2640 | } else if (vma->vm_start <= mm->start_stack && | ||
2641 | vma->vm_end >= mm->start_stack) { | ||
2642 | seq_printf(m, " stack"); | ||
2643 | } | ||
2644 | |||
2645 | if (is_vm_hugetlb_page(vma)) { | ||
2646 | check_huge_range(vma, vma->vm_start, vma->vm_end, md); | ||
2647 | seq_printf(m, " huge"); | ||
2648 | } else { | ||
2649 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | ||
2650 | &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); | ||
2651 | } | ||
2652 | |||
2653 | if (!md->pages) | ||
2654 | goto out; | ||
2655 | |||
2656 | if (md->anon) | ||
2657 | seq_printf(m," anon=%lu",md->anon); | ||
2658 | |||
2659 | if (md->dirty) | ||
2660 | seq_printf(m," dirty=%lu",md->dirty); | ||
2661 | |||
2662 | if (md->pages != md->anon && md->pages != md->dirty) | ||
2663 | seq_printf(m, " mapped=%lu", md->pages); | ||
2664 | |||
2665 | if (md->mapcount_max > 1) | ||
2666 | seq_printf(m, " mapmax=%lu", md->mapcount_max); | ||
2667 | |||
2668 | if (md->swapcache) | ||
2669 | seq_printf(m," swapcache=%lu", md->swapcache); | ||
2670 | |||
2671 | if (md->active < md->pages && !is_vm_hugetlb_page(vma)) | ||
2672 | seq_printf(m," active=%lu", md->active); | ||
2673 | |||
2674 | if (md->writeback) | ||
2675 | seq_printf(m," writeback=%lu", md->writeback); | ||
2676 | |||
2677 | for_each_node_state(n, N_HIGH_MEMORY) | ||
2678 | if (md->node[n]) | ||
2679 | seq_printf(m, " N%d=%lu", n, md->node[n]); | ||
2680 | out: | ||
2681 | seq_putc(m, '\n'); | ||
2682 | kfree(md); | ||
2683 | |||
2684 | if (m->count < m->size) | ||
2685 | m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; | ||
2686 | return 0; | ||
2687 | } | ||
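The mempolicy.c hunks above drop the static qualifier from get_vma_policy() and delete the whole /proc numa_maps machinery from this file: struct numa_maps, gather_stats(), check_huge_range() and show_numa_map(). The pairing suggests the reporting code is being relocated to the /proc side rather than removed outright, which is why get_vma_policy() now needs external linkage. As a rough standalone model of the per-page accounting that gather_stats() performed (the page_info fields below are invented stand-ins for the kernel's PageDirty()/PageAnon()/PageActive() tests, not real kernel API):

    /* Standalone model of the gather_stats() accounting removed above. */
    #include <stdio.h>

    #define MAX_NUMNODES 4

    struct page_info {
        int nid;          /* node the page lives on */
        int mapcount;     /* number of mappings */
        int dirty, anon, active, writeback, swapcache;
    };

    struct numa_maps {
        unsigned long pages, anon, active, writeback;
        unsigned long mapcount_max, dirty, swapcache;
        unsigned long node[MAX_NUMNODES];
    };

    static void gather_stats(const struct page_info *page, struct numa_maps *md,
                             int pte_dirty)
    {
        md->pages++;
        if (pte_dirty || page->dirty)
            md->dirty++;
        if (page->swapcache)
            md->swapcache++;
        if (page->active)
            md->active++;
        if (page->writeback)
            md->writeback++;
        if (page->anon)
            md->anon++;
        if ((unsigned long)page->mapcount > md->mapcount_max)
            md->mapcount_max = page->mapcount;
        md->node[page->nid]++;
    }

    int main(void)
    {
        struct numa_maps md = { 0 };
        struct page_info pages[] = {
            { .nid = 0, .mapcount = 2, .anon = 1 },
            { .nid = 1, .mapcount = 1, .dirty = 1 },
        };

        for (unsigned i = 0; i < sizeof(pages) / sizeof(pages[0]); i++)
            gather_stats(&pages[i], &md, 0);

        printf("pages=%lu anon=%lu dirty=%lu mapmax=%lu N0=%lu N1=%lu\n",
               md.pages, md.anon, md.dirty, md.mapcount_max,
               md.node[0], md.node[1]);
        return 0;
    }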
diff --git a/mm/migrate.c b/mm/migrate.c index 34132f8e9109..e4a5c912983d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -721,15 +721,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
721 | * Only page_lock_anon_vma() understands the subtleties of | 721 | * Only page_lock_anon_vma() understands the subtleties of |
722 | * getting a hold on an anon_vma from outside one of its mms. | 722 | * getting a hold on an anon_vma from outside one of its mms. |
723 | */ | 723 | */ |
724 | anon_vma = page_lock_anon_vma(page); | 724 | anon_vma = page_get_anon_vma(page); |
725 | if (anon_vma) { | 725 | if (anon_vma) { |
726 | /* | 726 | /* |
727 | * Take a reference count on the anon_vma if the | 727 | * Anon page |
728 | * page is mapped so that it is guaranteed to | ||
729 | * exist when the page is remapped later | ||
730 | */ | 728 | */ |
731 | get_anon_vma(anon_vma); | ||
732 | page_unlock_anon_vma(anon_vma); | ||
733 | } else if (PageSwapCache(page)) { | 729 | } else if (PageSwapCache(page)) { |
734 | /* | 730 | /* |
735 | * We cannot be sure that the anon_vma of an unmapped | 731 | * We cannot be sure that the anon_vma of an unmapped |
@@ -857,13 +853,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
857 | lock_page(hpage); | 853 | lock_page(hpage); |
858 | } | 854 | } |
859 | 855 | ||
860 | if (PageAnon(hpage)) { | 856 | if (PageAnon(hpage)) |
861 | anon_vma = page_lock_anon_vma(hpage); | 857 | anon_vma = page_get_anon_vma(hpage); |
862 | if (anon_vma) { | ||
863 | get_anon_vma(anon_vma); | ||
864 | page_unlock_anon_vma(anon_vma); | ||
865 | } | ||
866 | } | ||
867 | 858 | ||
868 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 859 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
869 | 860 | ||
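These migrate.c hunks collapse the earlier three-step sequence of page_lock_anon_vma(), get_anon_vma() and page_unlock_anon_vma() into a single page_get_anon_vma() call, so unmap_and_move() and its huge-page counterpart end up holding a reference to the anon_vma, rather than its lock, while they unmap, copy and remap the page. A standalone sketch of that shape (a pthread mutex and a plain integer refcount stand in for the kernel's anon_vma locking and refcounting; get_object() and put_object() are invented names):

    #include <pthread.h>
    #include <stdio.h>

    struct object {
        pthread_mutex_t lock;
        int refcount;
    };

    /* One helper takes the lock only long enough to pin the object with a
     * reference; callers then work without the lock held. */
    static struct object *get_object(struct object *obj)
    {
        pthread_mutex_lock(&obj->lock);
        obj->refcount++;
        pthread_mutex_unlock(&obj->lock);
        return obj;
    }

    static void put_object(struct object *obj)
    {
        pthread_mutex_lock(&obj->lock);
        obj->refcount--;
        pthread_mutex_unlock(&obj->lock);
    }

    static void migrate_one(struct object *anon)
    {
        struct object *held = anon ? get_object(anon) : NULL;

        /* ... unmap, copy and remap pages here without holding the lock;
         * the reference keeps "anon" alive for the whole operation ... */

        if (held)
            put_object(held);
    }

    int main(void)
    {
        struct object anon = { PTHREAD_MUTEX_INITIALIZER, 0 };

        migrate_one(&anon);
        printf("refcount after migrate: %d\n", anon.refcount);
        return 0;
    }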
diff --git a/mm/mlock.c b/mm/mlock.c index 516b2c2ddd5a..048260c4e02e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
307 | * For vmas that pass the filters, merge/split as appropriate. | 307 | * For vmas that pass the filters, merge/split as appropriate. |
308 | */ | 308 | */ |
309 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | 309 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, |
310 | unsigned long start, unsigned long end, unsigned int newflags) | 310 | unsigned long start, unsigned long end, vm_flags_t newflags) |
311 | { | 311 | { |
312 | struct mm_struct *mm = vma->vm_mm; | 312 | struct mm_struct *mm = vma->vm_mm; |
313 | pgoff_t pgoff; | 313 | pgoff_t pgoff; |
314 | int nr_pages; | 314 | int nr_pages; |
315 | int ret = 0; | 315 | int ret = 0; |
316 | int lock = newflags & VM_LOCKED; | 316 | int lock = !!(newflags & VM_LOCKED); |
317 | 317 | ||
318 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || | 318 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
319 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) | 319 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) |
@@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
385 | prev = vma; | 385 | prev = vma; |
386 | 386 | ||
387 | for (nstart = start ; ; ) { | 387 | for (nstart = start ; ; ) { |
388 | unsigned int newflags; | 388 | vm_flags_t newflags; |
389 | 389 | ||
390 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 390 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
391 | 391 | ||
@@ -524,7 +524,7 @@ static int do_mlockall(int flags) | |||
524 | goto out; | 524 | goto out; |
525 | 525 | ||
526 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { | 526 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { |
527 | unsigned int newflags; | 527 | vm_flags_t newflags; |
528 | 528 | ||
529 | newflags = vma->vm_flags | VM_LOCKED; | 529 | newflags = vma->vm_flags | VM_LOCKED; |
530 | if (!(flags & MCL_CURRENT)) | 530 | if (!(flags & MCL_CURRENT)) |
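The mlock.c changes are mostly type hygiene: flag variables switch from unsigned int to the dedicated vm_flags_t, and the derived lock flag is normalized with a double negation so it stays 0 or 1 even if VM_LOCKED ever occupies a bit that an int cannot hold. A small standalone illustration of why the !! matters (the typedef width and bit position below are invented for the example, not the kernel's actual values):

    #include <stdio.h>

    typedef unsigned long long vm_flags_t;   /* stand-in for the kernel typedef */

    #define VM_LOCKED_DEMO (1ULL << 40)      /* pretend VM_LOCKED sits above bit 31 */

    int main(void)
    {
        vm_flags_t newflags = VM_LOCKED_DEMO;

        /* Plain truncation to int drops the high bits, so the flag is lost. */
        int lock_truncated  = (int)(newflags & VM_LOCKED_DEMO);
        /* Double negation collapses any non-zero result to exactly 1. */
        int lock_normalized = !!(newflags & VM_LOCKED_DEMO);

        printf("truncated=%d normalized=%d\n", lock_truncated, lock_normalized);
        return 0;
    }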
@@ -84,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) | |||
84 | } | 84 | } |
85 | EXPORT_SYMBOL(vm_get_page_prot); | 85 | EXPORT_SYMBOL(vm_get_page_prot); |
86 | 86 | ||
87 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
88 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
90 | struct percpu_counter vm_committed_as; | 90 | /* |
91 | * Make sure vm_committed_as in one cacheline and not cacheline shared with | ||
92 | * other variables. It can be updated by several CPUs frequently. | ||
93 | */ | ||
94 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; | ||
91 | 95 | ||
92 | /* | 96 | /* |
93 | * Check that a process has enough memory to allocate a new virtual | 97 | * Check that a process has enough memory to allocate a new virtual |
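Marking vm_committed_as ____cacheline_aligned_in_smp gives this frequently-written counter a cache line of its own, so cross-CPU updates stop bouncing a line that also holds the neighbouring read-mostly sysctls. The same idea in standalone C11 (a 64-byte line size is assumed; the explicit tail padding is one way to keep later objects off the line as well):

    #include <stdalign.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CACHE_LINE 64                 /* assumed L1 line size */

    /* Read-mostly neighbours that should not share a line with a hot counter. */
    static int sysctl_a = 1, sysctl_b = 50;

    /* Hot, frequently-written counter: align it to a line boundary and pad to
     * a full line so no other object can be laid out next to it. */
    static struct {
        alignas(CACHE_LINE) long count;
        char pad[CACHE_LINE - sizeof(long)];
    } committed_as;

    int main(void)
    {
        committed_as.count += sysctl_a + sysctl_b;   /* touch everything once */
        printf("counter line offset: %zu, struct size: %zu\n",
               (size_t)((uintptr_t)&committed_as.count % CACHE_LINE),
               sizeof(committed_as));
        return 0;
    }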
@@ -190,7 +194,7 @@ error: | |||
190 | } | 194 | } |
191 | 195 | ||
192 | /* | 196 | /* |
193 | * Requires inode->i_mapping->i_mmap_lock | 197 | * Requires inode->i_mapping->i_mmap_mutex |
194 | */ | 198 | */ |
195 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, | 199 | static void __remove_shared_vm_struct(struct vm_area_struct *vma, |
196 | struct file *file, struct address_space *mapping) | 200 | struct file *file, struct address_space *mapping) |
@@ -218,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma) | |||
218 | 222 | ||
219 | if (file) { | 223 | if (file) { |
220 | struct address_space *mapping = file->f_mapping; | 224 | struct address_space *mapping = file->f_mapping; |
221 | spin_lock(&mapping->i_mmap_lock); | 225 | mutex_lock(&mapping->i_mmap_mutex); |
222 | __remove_shared_vm_struct(vma, file, mapping); | 226 | __remove_shared_vm_struct(vma, file, mapping); |
223 | spin_unlock(&mapping->i_mmap_lock); | 227 | mutex_unlock(&mapping->i_mmap_mutex); |
224 | } | 228 | } |
225 | } | 229 | } |
226 | 230 | ||
@@ -394,29 +398,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
394 | return vma; | 398 | return vma; |
395 | } | 399 | } |
396 | 400 | ||
397 | static inline void | ||
398 | __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | ||
399 | struct vm_area_struct *prev, struct rb_node *rb_parent) | ||
400 | { | ||
401 | struct vm_area_struct *next; | ||
402 | |||
403 | vma->vm_prev = prev; | ||
404 | if (prev) { | ||
405 | next = prev->vm_next; | ||
406 | prev->vm_next = vma; | ||
407 | } else { | ||
408 | mm->mmap = vma; | ||
409 | if (rb_parent) | ||
410 | next = rb_entry(rb_parent, | ||
411 | struct vm_area_struct, vm_rb); | ||
412 | else | ||
413 | next = NULL; | ||
414 | } | ||
415 | vma->vm_next = next; | ||
416 | if (next) | ||
417 | next->vm_prev = vma; | ||
418 | } | ||
419 | |||
420 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 401 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
421 | struct rb_node **rb_link, struct rb_node *rb_parent) | 402 | struct rb_node **rb_link, struct rb_node *rb_parent) |
422 | { | 403 | { |
@@ -464,16 +445,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
464 | if (vma->vm_file) | 445 | if (vma->vm_file) |
465 | mapping = vma->vm_file->f_mapping; | 446 | mapping = vma->vm_file->f_mapping; |
466 | 447 | ||
467 | if (mapping) { | 448 | if (mapping) |
468 | spin_lock(&mapping->i_mmap_lock); | 449 | mutex_lock(&mapping->i_mmap_mutex); |
469 | vma->vm_truncate_count = mapping->truncate_count; | ||
470 | } | ||
471 | 450 | ||
472 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 451 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
473 | __vma_link_file(vma); | 452 | __vma_link_file(vma); |
474 | 453 | ||
475 | if (mapping) | 454 | if (mapping) |
476 | spin_unlock(&mapping->i_mmap_lock); | 455 | mutex_unlock(&mapping->i_mmap_mutex); |
477 | 456 | ||
478 | mm->map_count++; | 457 | mm->map_count++; |
479 | validate_mm(mm); | 458 | validate_mm(mm); |
@@ -576,17 +555,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
576 | mapping = file->f_mapping; | 555 | mapping = file->f_mapping; |
577 | if (!(vma->vm_flags & VM_NONLINEAR)) | 556 | if (!(vma->vm_flags & VM_NONLINEAR)) |
578 | root = &mapping->i_mmap; | 557 | root = &mapping->i_mmap; |
579 | spin_lock(&mapping->i_mmap_lock); | 558 | mutex_lock(&mapping->i_mmap_mutex); |
580 | if (importer && | ||
581 | vma->vm_truncate_count != next->vm_truncate_count) { | ||
582 | /* | ||
583 | * unmap_mapping_range might be in progress: | ||
584 | * ensure that the expanding vma is rescanned. | ||
585 | */ | ||
586 | importer->vm_truncate_count = 0; | ||
587 | } | ||
588 | if (insert) { | 559 | if (insert) { |
589 | insert->vm_truncate_count = vma->vm_truncate_count; | ||
590 | /* | 560 | /* |
591 | * Put into prio_tree now, so instantiated pages | 561 | * Put into prio_tree now, so instantiated pages |
592 | * are visible to arm/parisc __flush_dcache_page | 562 | * are visible to arm/parisc __flush_dcache_page |
@@ -605,7 +575,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
605 | * lock may be shared between many sibling processes. Skipping | 575 | * lock may be shared between many sibling processes. Skipping |
606 | * the lock for brk adjustments makes a difference sometimes. | 576 | * the lock for brk adjustments makes a difference sometimes. |
607 | */ | 577 | */ |
608 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { | 578 | if (vma->anon_vma && (importer || start != vma->vm_start)) { |
609 | anon_vma = vma->anon_vma; | 579 | anon_vma = vma->anon_vma; |
610 | anon_vma_lock(anon_vma); | 580 | anon_vma_lock(anon_vma); |
611 | } | 581 | } |
@@ -652,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
652 | if (anon_vma) | 622 | if (anon_vma) |
653 | anon_vma_unlock(anon_vma); | 623 | anon_vma_unlock(anon_vma); |
654 | if (mapping) | 624 | if (mapping) |
655 | spin_unlock(&mapping->i_mmap_lock); | 625 | mutex_unlock(&mapping->i_mmap_mutex); |
656 | 626 | ||
657 | if (remove_next) { | 627 | if (remove_next) { |
658 | if (file) { | 628 | if (file) { |
@@ -699,9 +669,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, | |||
699 | } | 669 | } |
700 | 670 | ||
701 | static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, | 671 | static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, |
702 | struct anon_vma *anon_vma2) | 672 | struct anon_vma *anon_vma2, |
673 | struct vm_area_struct *vma) | ||
703 | { | 674 | { |
704 | return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); | 675 | /* |
676 | * The list_is_singular() test is to avoid merging VMA cloned from | ||
677 | * parents. This can improve scalability caused by anon_vma lock. | ||
678 | */ | ||
679 | if ((!anon_vma1 || !anon_vma2) && (!vma || | ||
680 | list_is_singular(&vma->anon_vma_chain))) | ||
681 | return 1; | ||
682 | return anon_vma1 == anon_vma2; | ||
705 | } | 683 | } |
706 | 684 | ||
707 | /* | 685 | /* |
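The reworked is_mergeable_anon_vma() only treats a missing anon_vma on one side as compatible when the candidate VMA's anon_vma_chain holds a single entry, i.e. when the anon_vma was not cloned from a parent at fork, which keeps a merge from quietly widening the set of processes serialized on one shared anon_vma lock. list_is_singular() is just an exactly-one-element test on the kernel's circular doubly linked list; a standalone rendition of the predicate (the minimal list type and vma_model are invented for the example):

    #include <stdbool.h>
    #include <stdio.h>

    /* Minimal circular doubly linked list, modelled on the kernel's list_head. */
    struct list_head {
        struct list_head *next, *prev;
    };

    static void list_init(struct list_head *h) { h->next = h->prev = h; }

    static void list_add(struct list_head *n, struct list_head *h)
    {
        n->next = h->next;
        n->prev = h;
        h->next->prev = n;
        h->next = n;
    }

    static bool list_is_singular(const struct list_head *h)
    {
        return h->next != h && h->next == h->prev;   /* exactly one entry */
    }

    struct anon_vma { int id; };

    struct vma_model {
        struct anon_vma *anon_vma;
        struct list_head anon_vma_chain;
    };

    /* Same shape as the patched is_mergeable_anon_vma(): a NULL on either side
     * is only forgiven for VMAs whose chain has a single (non-inherited) link. */
    static bool mergeable(struct anon_vma *a1, struct anon_vma *a2,
                          struct vma_model *vma)
    {
        if ((!a1 || !a2) && (!vma || list_is_singular(&vma->anon_vma_chain)))
            return true;
        return a1 == a2;
    }

    int main(void)
    {
        struct anon_vma av = { 1 };
        struct vma_model v = { &av };
        struct list_head link1, link2;

        list_init(&v.anon_vma_chain);
        list_add(&link1, &v.anon_vma_chain);
        printf("single link, NULL other side -> %d\n", mergeable(v.anon_vma, NULL, &v));

        list_add(&link2, &v.anon_vma_chain);   /* pretend a forked child added one */
        printf("two links,  NULL other side -> %d\n", mergeable(v.anon_vma, NULL, &v));
        return 0;
    }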
@@ -720,7 +698,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, | |||
720 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 698 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
721 | { | 699 | { |
722 | if (is_mergeable_vma(vma, file, vm_flags) && | 700 | if (is_mergeable_vma(vma, file, vm_flags) && |
723 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { | 701 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
724 | if (vma->vm_pgoff == vm_pgoff) | 702 | if (vma->vm_pgoff == vm_pgoff) |
725 | return 1; | 703 | return 1; |
726 | } | 704 | } |
@@ -739,7 +717,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | |||
739 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) | 717 | struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) |
740 | { | 718 | { |
741 | if (is_mergeable_vma(vma, file, vm_flags) && | 719 | if (is_mergeable_vma(vma, file, vm_flags) && |
742 | is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { | 720 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
743 | pgoff_t vm_pglen; | 721 | pgoff_t vm_pglen; |
744 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | 722 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; |
745 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) | 723 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) |
@@ -817,7 +795,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
817 | can_vma_merge_before(next, vm_flags, | 795 | can_vma_merge_before(next, vm_flags, |
818 | anon_vma, file, pgoff+pglen) && | 796 | anon_vma, file, pgoff+pglen) && |
819 | is_mergeable_anon_vma(prev->anon_vma, | 797 | is_mergeable_anon_vma(prev->anon_vma, |
820 | next->anon_vma)) { | 798 | next->anon_vma, NULL)) { |
821 | /* cases 1, 6 */ | 799 | /* cases 1, 6 */ |
822 | err = vma_adjust(prev, prev->vm_start, | 800 | err = vma_adjust(prev, prev->vm_start, |
823 | next->vm_end, prev->vm_pgoff, NULL); | 801 | next->vm_end, prev->vm_pgoff, NULL); |
@@ -982,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
982 | { | 960 | { |
983 | struct mm_struct * mm = current->mm; | 961 | struct mm_struct * mm = current->mm; |
984 | struct inode *inode; | 962 | struct inode *inode; |
985 | unsigned int vm_flags; | 963 | vm_flags_t vm_flags; |
986 | int error; | 964 | int error; |
987 | unsigned long reqprot = prot; | 965 | unsigned long reqprot = prot; |
988 | 966 | ||
@@ -1187,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | |||
1187 | */ | 1165 | */ |
1188 | int vma_wants_writenotify(struct vm_area_struct *vma) | 1166 | int vma_wants_writenotify(struct vm_area_struct *vma) |
1189 | { | 1167 | { |
1190 | unsigned int vm_flags = vma->vm_flags; | 1168 | vm_flags_t vm_flags = vma->vm_flags; |
1191 | 1169 | ||
1192 | /* If it was private or non-writable, the write bit is already clear */ | 1170 | /* If it was private or non-writable, the write bit is already clear */ |
1193 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) | 1171 | if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) |
@@ -1215,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1215 | * We account for memory if it's a private writeable mapping, | 1193 | * We account for memory if it's a private writeable mapping, |
1216 | * not hugepages and VM_NORESERVE wasn't set. | 1194 | * not hugepages and VM_NORESERVE wasn't set. |
1217 | */ | 1195 | */ |
1218 | static inline int accountable_mapping(struct file *file, unsigned int vm_flags) | 1196 | static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) |
1219 | { | 1197 | { |
1220 | /* | 1198 | /* |
1221 | * hugetlb has its own accounting separate from the core VM | 1199 | * hugetlb has its own accounting separate from the core VM |
@@ -1229,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags) | |||
1229 | 1207 | ||
1230 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1208 | unsigned long mmap_region(struct file *file, unsigned long addr, |
1231 | unsigned long len, unsigned long flags, | 1209 | unsigned long len, unsigned long flags, |
1232 | unsigned int vm_flags, unsigned long pgoff) | 1210 | vm_flags_t vm_flags, unsigned long pgoff) |
1233 | { | 1211 | { |
1234 | struct mm_struct *mm = current->mm; | 1212 | struct mm_struct *mm = current->mm; |
1235 | struct vm_area_struct *vma, *prev; | 1213 | struct vm_area_struct *vma, *prev; |
@@ -1785,7 +1763,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1785 | /* | 1763 | /* |
1786 | * vma is the first one with address < vma->vm_start. Have to extend vma. | 1764 | * vma is the first one with address < vma->vm_start. Have to extend vma. |
1787 | */ | 1765 | */ |
1788 | static int expand_downwards(struct vm_area_struct *vma, | 1766 | int expand_downwards(struct vm_area_struct *vma, |
1789 | unsigned long address) | 1767 | unsigned long address) |
1790 | { | 1768 | { |
1791 | int error; | 1769 | int error; |
@@ -1832,11 +1810,6 @@ static int expand_downwards(struct vm_area_struct *vma, | |||
1832 | return error; | 1810 | return error; |
1833 | } | 1811 | } |
1834 | 1812 | ||
1835 | int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address) | ||
1836 | { | ||
1837 | return expand_downwards(vma, address); | ||
1838 | } | ||
1839 | |||
1840 | #ifdef CONFIG_STACK_GROWSUP | 1813 | #ifdef CONFIG_STACK_GROWSUP |
1841 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | 1814 | int expand_stack(struct vm_area_struct *vma, unsigned long address) |
1842 | { | 1815 | { |
@@ -1919,17 +1892,17 @@ static void unmap_region(struct mm_struct *mm, | |||
1919 | unsigned long start, unsigned long end) | 1892 | unsigned long start, unsigned long end) |
1920 | { | 1893 | { |
1921 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | 1894 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; |
1922 | struct mmu_gather *tlb; | 1895 | struct mmu_gather tlb; |
1923 | unsigned long nr_accounted = 0; | 1896 | unsigned long nr_accounted = 0; |
1924 | 1897 | ||
1925 | lru_add_drain(); | 1898 | lru_add_drain(); |
1926 | tlb = tlb_gather_mmu(mm, 0); | 1899 | tlb_gather_mmu(&tlb, mm, 0); |
1927 | update_hiwater_rss(mm); | 1900 | update_hiwater_rss(mm); |
1928 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); | 1901 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); |
1929 | vm_unacct_memory(nr_accounted); | 1902 | vm_unacct_memory(nr_accounted); |
1930 | free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, | 1903 | free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, |
1931 | next? next->vm_start: 0); | 1904 | next ? next->vm_start : 0); |
1932 | tlb_finish_mmu(tlb, start, end); | 1905 | tlb_finish_mmu(&tlb, start, end); |
1933 | } | 1906 | } |
1934 | 1907 | ||
1935 | /* | 1908 | /* |
@@ -2271,7 +2244,7 @@ EXPORT_SYMBOL(do_brk); | |||
2271 | /* Release all mmaps. */ | 2244 | /* Release all mmaps. */ |
2272 | void exit_mmap(struct mm_struct *mm) | 2245 | void exit_mmap(struct mm_struct *mm) |
2273 | { | 2246 | { |
2274 | struct mmu_gather *tlb; | 2247 | struct mmu_gather tlb; |
2275 | struct vm_area_struct *vma; | 2248 | struct vm_area_struct *vma; |
2276 | unsigned long nr_accounted = 0; | 2249 | unsigned long nr_accounted = 0; |
2277 | unsigned long end; | 2250 | unsigned long end; |
@@ -2296,14 +2269,14 @@ void exit_mmap(struct mm_struct *mm) | |||
2296 | 2269 | ||
2297 | lru_add_drain(); | 2270 | lru_add_drain(); |
2298 | flush_cache_mm(mm); | 2271 | flush_cache_mm(mm); |
2299 | tlb = tlb_gather_mmu(mm, 1); | 2272 | tlb_gather_mmu(&tlb, mm, 1); |
2300 | /* update_hiwater_rss(mm) here? but nobody should be looking */ | 2273 | /* update_hiwater_rss(mm) here? but nobody should be looking */ |
2301 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2274 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
2302 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2275 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
2303 | vm_unacct_memory(nr_accounted); | 2276 | vm_unacct_memory(nr_accounted); |
2304 | 2277 | ||
2305 | free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); | 2278 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); |
2306 | tlb_finish_mmu(tlb, 0, end); | 2279 | tlb_finish_mmu(&tlb, 0, end); |
2307 | 2280 | ||
2308 | /* | 2281 | /* |
2309 | * Walk the list again, actually closing and freeing it, | 2282 | * Walk the list again, actually closing and freeing it, |
@@ -2317,7 +2290,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2317 | 2290 | ||
2318 | /* Insert vm structure into process list sorted by address | 2291 | /* Insert vm structure into process list sorted by address |
2319 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2292 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2320 | * then i_mmap_lock is taken here. | 2293 | * then i_mmap_mutex is taken here. |
2321 | */ | 2294 | */ |
2322 | int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | 2295 | int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) |
2323 | { | 2296 | { |
@@ -2529,15 +2502,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2529 | * The LSB of head.next can't change from under us | 2502 | * The LSB of head.next can't change from under us |
2530 | * because we hold the mm_all_locks_mutex. | 2503 | * because we hold the mm_all_locks_mutex. |
2531 | */ | 2504 | */ |
2532 | spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); | 2505 | mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); |
2533 | /* | 2506 | /* |
2534 | * We can safely modify head.next after taking the | 2507 | * We can safely modify head.next after taking the |
2535 | * anon_vma->root->lock. If some other vma in this mm shares | 2508 | * anon_vma->root->mutex. If some other vma in this mm shares |
2536 | * the same anon_vma we won't take it again. | 2509 | * the same anon_vma we won't take it again. |
2537 | * | 2510 | * |
2538 | * No need of atomic instructions here, head.next | 2511 | * No need of atomic instructions here, head.next |
2539 | * can't change from under us thanks to the | 2512 | * can't change from under us thanks to the |
2540 | * anon_vma->root->lock. | 2513 | * anon_vma->root->mutex. |
2541 | */ | 2514 | */ |
2542 | if (__test_and_set_bit(0, (unsigned long *) | 2515 | if (__test_and_set_bit(0, (unsigned long *) |
2543 | &anon_vma->root->head.next)) | 2516 | &anon_vma->root->head.next)) |
@@ -2559,7 +2532,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2559 | */ | 2532 | */ |
2560 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | 2533 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
2561 | BUG(); | 2534 | BUG(); |
2562 | spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); | 2535 | mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); |
2563 | } | 2536 | } |
2564 | } | 2537 | } |
2565 | 2538 | ||
@@ -2586,7 +2559,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2586 | * vma in this mm is backed by the same anon_vma or address_space. | 2559 | * vma in this mm is backed by the same anon_vma or address_space. |
2587 | * | 2560 | * |
2588 | * We can take all the locks in random order because the VM code | 2561 | * We can take all the locks in random order because the VM code |
2589 | * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never | 2562 | * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never |
2590 | * takes more than one of them in a row. Secondly we're protected | 2563 | * takes more than one of them in a row. Secondly we're protected |
2591 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 2564 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. |
2592 | * | 2565 | * |
@@ -2642,7 +2615,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2642 | * | 2615 | * |
2643 | * No need of atomic instructions here, head.next | 2616 | * No need of atomic instructions here, head.next |
2644 | * can't change from under us until we release the | 2617 | * can't change from under us until we release the |
2645 | * anon_vma->root->lock. | 2618 | * anon_vma->root->mutex. |
2646 | */ | 2619 | */ |
2647 | if (!__test_and_clear_bit(0, (unsigned long *) | 2620 | if (!__test_and_clear_bit(0, (unsigned long *) |
2648 | &anon_vma->root->head.next)) | 2621 | &anon_vma->root->head.next)) |
@@ -2658,7 +2631,7 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
2658 | * AS_MM_ALL_LOCKS can't change to 0 from under us | 2631 | * AS_MM_ALL_LOCKS can't change to 0 from under us |
2659 | * because we hold the mm_all_locks_mutex. | 2632 | * because we hold the mm_all_locks_mutex. |
2660 | */ | 2633 | */ |
2661 | spin_unlock(&mapping->i_mmap_lock); | 2634 | mutex_unlock(&mapping->i_mmap_mutex); |
2662 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | 2635 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, |
2663 | &mapping->flags)) | 2636 | &mapping->flags)) |
2664 | BUG(); | 2637 | BUG(); |
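The other theme running through the mmap.c hunks, besides the i_mmap_lock to i_mmap_mutex conversion, is the mmu_gather change: unmap_region() and exit_mmap() now declare struct mmu_gather on the stack and pass its address to tlb_gather_mmu(), unmap_vmas() and tlb_finish_mmu(), instead of receiving a pointer back from tlb_gather_mmu() to a shared object. The calling-convention change looks like this in miniature (standalone; the struct contents and helper bodies are invented, only the begin/collect/finish shape follows the patch):

    #include <stdio.h>

    struct mm;                              /* opaque for the example */

    struct mmu_gather_model {
        struct mm *mm;
        int fullmm;
        unsigned long freed;
    };

    /* New style: the caller owns the gather context, so nothing global has to
     * be reserved for the duration of the teardown. */
    static void tlb_gather(struct mmu_gather_model *tlb, struct mm *mm, int fullmm)
    {
        tlb->mm = mm;
        tlb->fullmm = fullmm;
        tlb->freed = 0;
    }

    static void tlb_collect(struct mmu_gather_model *tlb, unsigned long pages)
    {
        tlb->freed += pages;                /* stands in for unmap_vmas() work */
    }

    static void tlb_finish(struct mmu_gather_model *tlb)
    {
        printf("flushed %lu pages (fullmm=%d)\n", tlb->freed, tlb->fullmm);
    }

    static void unmap_region_model(struct mm *mm)
    {
        struct mmu_gather_model tlb;        /* on the stack, as in the patch */

        tlb_gather(&tlb, mm, 0);
        tlb_collect(&tlb, 32);
        tlb_finish(&tlb);
    }

    int main(void)
    {
        unmap_region_model(NULL);
        return 0;
    }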
diff --git a/mm/mremap.c b/mm/mremap.c index a7c1f9f9b941..506fa44403df 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -93,8 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
93 | * and we propagate stale pages into the dst afterward. | 93 | * and we propagate stale pages into the dst afterward. |
94 | */ | 94 | */ |
95 | mapping = vma->vm_file->f_mapping; | 95 | mapping = vma->vm_file->f_mapping; |
96 | spin_lock(&mapping->i_mmap_lock); | 96 | mutex_lock(&mapping->i_mmap_mutex); |
97 | new_vma->vm_truncate_count = 0; | ||
98 | } | 97 | } |
99 | 98 | ||
100 | /* | 99 | /* |
@@ -123,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
123 | pte_unmap(new_pte - 1); | 122 | pte_unmap(new_pte - 1); |
124 | pte_unmap_unlock(old_pte - 1, old_ptl); | 123 | pte_unmap_unlock(old_pte - 1, old_ptl); |
125 | if (mapping) | 124 | if (mapping) |
126 | spin_unlock(&mapping->i_mmap_lock); | 125 | mutex_unlock(&mapping->i_mmap_mutex); |
127 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); | 126 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); |
128 | } | 127 | } |
129 | 128 | ||
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 9109049f0bbc..6e93dc7f2586 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -307,30 +307,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
307 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 307 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
308 | unsigned long align, unsigned long goal) | 308 | unsigned long align, unsigned long goal) |
309 | { | 309 | { |
310 | #ifdef MAX_DMA32_PFN | ||
311 | unsigned long end_pfn; | ||
312 | |||
313 | if (WARN_ON_ONCE(slab_is_available())) | ||
314 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
315 | |||
316 | /* update goal according ...MAX_DMA32_PFN */ | ||
317 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
318 | |||
319 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | ||
320 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | ||
321 | void *ptr; | ||
322 | unsigned long new_goal; | ||
323 | |||
324 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | ||
325 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
326 | new_goal, -1ULL); | ||
327 | if (ptr) | ||
328 | return ptr; | ||
329 | } | ||
330 | #endif | ||
331 | |||
332 | return __alloc_bootmem_node(pgdat, size, align, goal); | 310 | return __alloc_bootmem_node(pgdat, size, align, goal); |
333 | |||
334 | } | 311 | } |
335 | 312 | ||
336 | #ifdef CONFIG_SPARSEMEM | 313 | #ifdef CONFIG_SPARSEMEM |
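With the DMA32 special case gone, __alloc_bootmem_node_high() becomes a plain pass-through to __alloc_bootmem_node(). The deleted branch was mostly page-frame arithmetic: MAX_DMA32_PFN is a page frame number marking the boundary of the 32-bit DMA zone, so shifting it left by PAGE_SHIFT turns it into a byte address usable as an allocation goal, and shifting an address right by PAGE_SHIFT goes back the other way. A tiny standalone check of those conversions (4 KiB pages and the 4 GiB boundary are assumptions of the example):

    #include <stdio.h>

    #define PAGE_SHIFT 12                                   /* 4 KiB pages assumed */
    #define MAX_DMA32_PFN_DEMO (1ULL << (32 - PAGE_SHIFT))  /* PFN at the 4 GiB mark */

    int main(void)
    {
        unsigned long long goal_addr = MAX_DMA32_PFN_DEMO << PAGE_SHIFT; /* PFN -> bytes */
        unsigned long long goal_pfn  = goal_addr >> PAGE_SHIFT;          /* bytes -> PFN */

        printf("pfn %llu -> addr %#llx -> pfn %llu\n",
               (unsigned long long)MAX_DMA32_PFN_DEMO, goal_addr, goal_pfn);
        return 0;
    }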
diff --git a/mm/nommu.c b/mm/nommu.c index c4c542c736a9..1fd0c51b10a6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -680,9 +680,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags) | |||
680 | */ | 680 | */ |
681 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | 681 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) |
682 | { | 682 | { |
683 | struct vm_area_struct *pvma, **pp, *next; | 683 | struct vm_area_struct *pvma, *prev; |
684 | struct address_space *mapping; | 684 | struct address_space *mapping; |
685 | struct rb_node **p, *parent; | 685 | struct rb_node **p, *parent, *rb_prev; |
686 | 686 | ||
687 | kenter(",%p", vma); | 687 | kenter(",%p", vma); |
688 | 688 | ||
@@ -703,7 +703,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
703 | } | 703 | } |
704 | 704 | ||
705 | /* add the VMA to the tree */ | 705 | /* add the VMA to the tree */ |
706 | parent = NULL; | 706 | parent = rb_prev = NULL; |
707 | p = &mm->mm_rb.rb_node; | 707 | p = &mm->mm_rb.rb_node; |
708 | while (*p) { | 708 | while (*p) { |
709 | parent = *p; | 709 | parent = *p; |
@@ -713,17 +713,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
713 | * (the latter is necessary as we may get identical VMAs) */ | 713 | * (the latter is necessary as we may get identical VMAs) */ |
714 | if (vma->vm_start < pvma->vm_start) | 714 | if (vma->vm_start < pvma->vm_start) |
715 | p = &(*p)->rb_left; | 715 | p = &(*p)->rb_left; |
716 | else if (vma->vm_start > pvma->vm_start) | 716 | else if (vma->vm_start > pvma->vm_start) { |
717 | rb_prev = parent; | ||
717 | p = &(*p)->rb_right; | 718 | p = &(*p)->rb_right; |
718 | else if (vma->vm_end < pvma->vm_end) | 719 | } else if (vma->vm_end < pvma->vm_end) |
719 | p = &(*p)->rb_left; | 720 | p = &(*p)->rb_left; |
720 | else if (vma->vm_end > pvma->vm_end) | 721 | else if (vma->vm_end > pvma->vm_end) { |
722 | rb_prev = parent; | ||
721 | p = &(*p)->rb_right; | 723 | p = &(*p)->rb_right; |
722 | else if (vma < pvma) | 724 | } else if (vma < pvma) |
723 | p = &(*p)->rb_left; | 725 | p = &(*p)->rb_left; |
724 | else if (vma > pvma) | 726 | else if (vma > pvma) { |
727 | rb_prev = parent; | ||
725 | p = &(*p)->rb_right; | 728 | p = &(*p)->rb_right; |
726 | else | 729 | } else |
727 | BUG(); | 730 | BUG(); |
728 | } | 731 | } |
729 | 732 | ||
@@ -731,20 +734,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
731 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); | 734 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); |
732 | 735 | ||
733 | /* add VMA to the VMA list also */ | 736 | /* add VMA to the VMA list also */ |
734 | for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { | 737 | prev = NULL; |
735 | if (pvma->vm_start > vma->vm_start) | 738 | if (rb_prev) |
736 | break; | 739 | prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); |
737 | if (pvma->vm_start < vma->vm_start) | ||
738 | continue; | ||
739 | if (pvma->vm_end < vma->vm_end) | ||
740 | break; | ||
741 | } | ||
742 | 740 | ||
743 | next = *pp; | 741 | __vma_link_list(mm, vma, prev, parent); |
744 | *pp = vma; | ||
745 | vma->vm_next = next; | ||
746 | if (next) | ||
747 | next->vm_prev = vma; | ||
748 | } | 742 | } |
749 | 743 | ||
750 | /* | 744 | /* |
@@ -752,7 +746,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
752 | */ | 746 | */ |
753 | static void delete_vma_from_mm(struct vm_area_struct *vma) | 747 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
754 | { | 748 | { |
755 | struct vm_area_struct **pp; | ||
756 | struct address_space *mapping; | 749 | struct address_space *mapping; |
757 | struct mm_struct *mm = vma->vm_mm; | 750 | struct mm_struct *mm = vma->vm_mm; |
758 | 751 | ||
@@ -775,12 +768,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
775 | 768 | ||
776 | /* remove from the MM's tree and list */ | 769 | /* remove from the MM's tree and list */ |
777 | rb_erase(&vma->vm_rb, &mm->mm_rb); | 770 | rb_erase(&vma->vm_rb, &mm->mm_rb); |
778 | for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { | 771 | |
779 | if (*pp == vma) { | 772 | if (vma->vm_prev) |
780 | *pp = vma->vm_next; | 773 | vma->vm_prev->vm_next = vma->vm_next; |
781 | break; | 774 | else |
782 | } | 775 | mm->mmap = vma->vm_next; |
783 | } | 776 | |
777 | if (vma->vm_next) | ||
778 | vma->vm_next->vm_prev = vma->vm_prev; | ||
784 | 779 | ||
785 | vma->vm_mm = NULL; | 780 | vma->vm_mm = NULL; |
786 | } | 781 | } |
@@ -809,17 +804,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | |||
809 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 804 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
810 | { | 805 | { |
811 | struct vm_area_struct *vma; | 806 | struct vm_area_struct *vma; |
812 | struct rb_node *n = mm->mm_rb.rb_node; | ||
813 | 807 | ||
814 | /* check the cache first */ | 808 | /* check the cache first */ |
815 | vma = mm->mmap_cache; | 809 | vma = mm->mmap_cache; |
816 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | 810 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) |
817 | return vma; | 811 | return vma; |
818 | 812 | ||
819 | /* trawl the tree (there may be multiple mappings in which addr | 813 | /* trawl the list (there may be multiple mappings in which addr |
820 | * resides) */ | 814 | * resides) */ |
821 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | 815 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
822 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
823 | if (vma->vm_start > addr) | 816 | if (vma->vm_start > addr) |
824 | return NULL; | 817 | return NULL; |
825 | if (vma->vm_end > addr) { | 818 | if (vma->vm_end > addr) { |
@@ -859,7 +852,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
859 | unsigned long len) | 852 | unsigned long len) |
860 | { | 853 | { |
861 | struct vm_area_struct *vma; | 854 | struct vm_area_struct *vma; |
862 | struct rb_node *n = mm->mm_rb.rb_node; | ||
863 | unsigned long end = addr + len; | 855 | unsigned long end = addr + len; |
864 | 856 | ||
865 | /* check the cache first */ | 857 | /* check the cache first */ |
@@ -867,10 +859,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
867 | if (vma && vma->vm_start == addr && vma->vm_end == end) | 859 | if (vma && vma->vm_start == addr && vma->vm_end == end) |
868 | return vma; | 860 | return vma; |
869 | 861 | ||
870 | /* trawl the tree (there may be multiple mappings in which addr | 862 | /* trawl the list (there may be multiple mappings in which addr |
871 | * resides) */ | 863 | * resides) */ |
872 | for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { | 864 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
873 | vma = rb_entry(n, struct vm_area_struct, vm_rb); | ||
874 | if (vma->vm_start < addr) | 865 | if (vma->vm_start < addr) |
875 | continue; | 866 | continue; |
876 | if (vma->vm_start > addr) | 867 | if (vma->vm_start > addr) |
@@ -1133,7 +1124,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1133 | unsigned long capabilities) | 1124 | unsigned long capabilities) |
1134 | { | 1125 | { |
1135 | struct page *pages; | 1126 | struct page *pages; |
1136 | unsigned long total, point, n, rlen; | 1127 | unsigned long total, point, n; |
1137 | void *base; | 1128 | void *base; |
1138 | int ret, order; | 1129 | int ret, order; |
1139 | 1130 | ||
@@ -1157,13 +1148,12 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1157 | * make a private copy of the data and map that instead */ | 1148 | * make a private copy of the data and map that instead */ |
1158 | } | 1149 | } |
1159 | 1150 | ||
1160 | rlen = PAGE_ALIGN(len); | ||
1161 | 1151 | ||
1162 | /* allocate some memory to hold the mapping | 1152 | /* allocate some memory to hold the mapping |
1163 | * - note that this may not return a page-aligned address if the object | 1153 | * - note that this may not return a page-aligned address if the object |
1164 | * we're allocating is smaller than a page | 1154 | * we're allocating is smaller than a page |
1165 | */ | 1155 | */ |
1166 | order = get_order(rlen); | 1156 | order = get_order(len); |
1167 | kdebug("alloc order %d for %lx", order, len); | 1157 | kdebug("alloc order %d for %lx", order, len); |
1168 | 1158 | ||
1169 | pages = alloc_pages(GFP_KERNEL, order); | 1159 | pages = alloc_pages(GFP_KERNEL, order); |
@@ -1173,7 +1163,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1173 | total = 1 << order; | 1163 | total = 1 << order; |
1174 | atomic_long_add(total, &mmap_pages_allocated); | 1164 | atomic_long_add(total, &mmap_pages_allocated); |
1175 | 1165 | ||
1176 | point = rlen >> PAGE_SHIFT; | 1166 | point = len >> PAGE_SHIFT; |
1177 | 1167 | ||
1178 | /* we allocated a power-of-2 sized page set, so we may want to trim off | 1168 | /* we allocated a power-of-2 sized page set, so we may want to trim off |
1179 | * the excess */ | 1169 | * the excess */ |
@@ -1195,7 +1185,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1195 | base = page_address(pages); | 1185 | base = page_address(pages); |
1196 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; | 1186 | region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; |
1197 | region->vm_start = (unsigned long) base; | 1187 | region->vm_start = (unsigned long) base; |
1198 | region->vm_end = region->vm_start + rlen; | 1188 | region->vm_end = region->vm_start + len; |
1199 | region->vm_top = region->vm_start + (total << PAGE_SHIFT); | 1189 | region->vm_top = region->vm_start + (total << PAGE_SHIFT); |
1200 | 1190 | ||
1201 | vma->vm_start = region->vm_start; | 1191 | vma->vm_start = region->vm_start; |
@@ -1211,22 +1201,22 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1211 | 1201 | ||
1212 | old_fs = get_fs(); | 1202 | old_fs = get_fs(); |
1213 | set_fs(KERNEL_DS); | 1203 | set_fs(KERNEL_DS); |
1214 | ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); | 1204 | ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); |
1215 | set_fs(old_fs); | 1205 | set_fs(old_fs); |
1216 | 1206 | ||
1217 | if (ret < 0) | 1207 | if (ret < 0) |
1218 | goto error_free; | 1208 | goto error_free; |
1219 | 1209 | ||
1220 | /* clear the last little bit */ | 1210 | /* clear the last little bit */ |
1221 | if (ret < rlen) | 1211 | if (ret < len) |
1222 | memset(base + ret, 0, rlen - ret); | 1212 | memset(base + ret, 0, len - ret); |
1223 | 1213 | ||
1224 | } | 1214 | } |
1225 | 1215 | ||
1226 | return 0; | 1216 | return 0; |
1227 | 1217 | ||
1228 | error_free: | 1218 | error_free: |
1229 | free_page_series(region->vm_start, region->vm_end); | 1219 | free_page_series(region->vm_start, region->vm_top); |
1230 | region->vm_start = vma->vm_start = 0; | 1220 | region->vm_start = vma->vm_start = 0; |
1231 | region->vm_end = vma->vm_end = 0; | 1221 | region->vm_end = vma->vm_end = 0; |
1232 | region->vm_top = 0; | 1222 | region->vm_top = 0; |
@@ -1235,7 +1225,7 @@ error_free: | |||
1235 | enomem: | 1225 | enomem: |
1236 | printk("Allocation of length %lu from process %d (%s) failed\n", | 1226 | printk("Allocation of length %lu from process %d (%s) failed\n", |
1237 | len, current->pid, current->comm); | 1227 | len, current->pid, current->comm); |
1238 | show_free_areas(); | 1228 | show_free_areas(0); |
1239 | return -ENOMEM; | 1229 | return -ENOMEM; |
1240 | } | 1230 | } |
1241 | 1231 | ||
@@ -1268,6 +1258,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1268 | 1258 | ||
1269 | /* we ignore the address hint */ | 1259 | /* we ignore the address hint */ |
1270 | addr = 0; | 1260 | addr = 0; |
1261 | len = PAGE_ALIGN(len); | ||
1271 | 1262 | ||
1272 | /* we've determined that we can make the mapping, now translate what we | 1263 | /* we've determined that we can make the mapping, now translate what we |
1273 | * now know into VMA flags */ | 1264 | * now know into VMA flags */ |
@@ -1385,15 +1376,15 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1385 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1376 | if (capabilities & BDI_CAP_MAP_DIRECT) { |
1386 | addr = file->f_op->get_unmapped_area(file, addr, len, | 1377 | addr = file->f_op->get_unmapped_area(file, addr, len, |
1387 | pgoff, flags); | 1378 | pgoff, flags); |
1388 | if (IS_ERR((void *) addr)) { | 1379 | if (IS_ERR_VALUE(addr)) { |
1389 | ret = addr; | 1380 | ret = addr; |
1390 | if (ret != (unsigned long) -ENOSYS) | 1381 | if (ret != -ENOSYS) |
1391 | goto error_just_free; | 1382 | goto error_just_free; |
1392 | 1383 | ||
1393 | /* the driver refused to tell us where to site | 1384 | /* the driver refused to tell us where to site |
1394 | * the mapping so we'll have to attempt to copy | 1385 | * the mapping so we'll have to attempt to copy |
1395 | * it */ | 1386 | * it */ |
1396 | ret = (unsigned long) -ENODEV; | 1387 | ret = -ENODEV; |
1397 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1388 | if (!(capabilities & BDI_CAP_MAP_COPY)) |
1398 | goto error_just_free; | 1389 | goto error_just_free; |
1399 | 1390 | ||
@@ -1468,14 +1459,14 @@ error_getting_vma: | |||
1468 | printk(KERN_WARNING "Allocation of vma for %lu byte allocation" | 1459 | printk(KERN_WARNING "Allocation of vma for %lu byte allocation" |
1469 | " from process %d failed\n", | 1460 | " from process %d failed\n", |
1470 | len, current->pid); | 1461 | len, current->pid); |
1471 | show_free_areas(); | 1462 | show_free_areas(0); |
1472 | return -ENOMEM; | 1463 | return -ENOMEM; |
1473 | 1464 | ||
1474 | error_getting_region: | 1465 | error_getting_region: |
1475 | printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" | 1466 | printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" |
1476 | " from process %d failed\n", | 1467 | " from process %d failed\n", |
1477 | len, current->pid); | 1468 | len, current->pid); |
1478 | show_free_areas(); | 1469 | show_free_areas(0); |
1479 | return -ENOMEM; | 1470 | return -ENOMEM; |
1480 | } | 1471 | } |
1481 | EXPORT_SYMBOL(do_mmap_pgoff); | 1472 | EXPORT_SYMBOL(do_mmap_pgoff); |
@@ -1644,15 +1635,17 @@ static int shrink_vma(struct mm_struct *mm, | |||
1644 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | 1635 | int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) |
1645 | { | 1636 | { |
1646 | struct vm_area_struct *vma; | 1637 | struct vm_area_struct *vma; |
1647 | struct rb_node *rb; | 1638 | unsigned long end; |
1648 | unsigned long end = start + len; | ||
1649 | int ret; | 1639 | int ret; |
1650 | 1640 | ||
1651 | kenter(",%lx,%zx", start, len); | 1641 | kenter(",%lx,%zx", start, len); |
1652 | 1642 | ||
1643 | len = PAGE_ALIGN(len); | ||
1653 | if (len == 0) | 1644 | if (len == 0) |
1654 | return -EINVAL; | 1645 | return -EINVAL; |
1655 | 1646 | ||
1647 | end = start + len; | ||
1648 | |||
1656 | /* find the first potentially overlapping VMA */ | 1649 | /* find the first potentially overlapping VMA */ |
1657 | vma = find_vma(mm, start); | 1650 | vma = find_vma(mm, start); |
1658 | if (!vma) { | 1651 | if (!vma) { |
@@ -1677,9 +1670,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1677 | } | 1670 | } |
1678 | if (end == vma->vm_end) | 1671 | if (end == vma->vm_end) |
1679 | goto erase_whole_vma; | 1672 | goto erase_whole_vma; |
1680 | rb = rb_next(&vma->vm_rb); | 1673 | vma = vma->vm_next; |
1681 | vma = rb_entry(rb, struct vm_area_struct, vm_rb); | 1674 | } while (vma); |
1682 | } while (rb); | ||
1683 | kleave(" = -EINVAL [split file]"); | 1675 | kleave(" = -EINVAL [split file]"); |
1684 | return -EINVAL; | 1676 | return -EINVAL; |
1685 | } else { | 1677 | } else { |
@@ -1773,6 +1765,8 @@ unsigned long do_mremap(unsigned long addr, | |||
1773 | struct vm_area_struct *vma; | 1765 | struct vm_area_struct *vma; |
1774 | 1766 | ||
1775 | /* insanity checks first */ | 1767 | /* insanity checks first */ |
1768 | old_len = PAGE_ALIGN(old_len); | ||
1769 | new_len = PAGE_ALIGN(new_len); | ||
1776 | if (old_len == 0 || new_len == 0) | 1770 | if (old_len == 0 || new_len == 0) |
1777 | return (unsigned long) -EINVAL; | 1771 | return (unsigned long) -EINVAL; |
1778 | 1772 | ||
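The nommu.c rework threads an rb_prev pointer through the rbtree descent so that, once the insertion point is known, add_vma_to_mm() can call __vma_link_list() (deleted above as a private mmap.c helper, so evidently shared now) instead of rescanning mm->mmap; delete_vma_from_mm() likewise replaces its linear search with a direct doubly linked unlink through vm_prev/vm_next, and find_vma()/find_vma_exact() switch from an rbtree walk to a list walk. The link and unlink steps are the classic pattern below (standalone; vma_model and mm_model are invented stand-ins for the kernel structures):

    #include <stddef.h>
    #include <stdio.h>

    struct vma_model {
        unsigned long vm_start;
        struct vma_model *vm_next, *vm_prev;
    };

    struct mm_model {
        struct vma_model *mmap;        /* head of the address-ordered list */
    };

    /* Link "vma" after "prev" (or at the head when prev is NULL), the job
     * the shared __vma_link_list() helper performs in the patch. */
    static void vma_link_list(struct mm_model *mm, struct vma_model *vma,
                              struct vma_model *prev)
    {
        struct vma_model *next;

        vma->vm_prev = prev;
        if (prev) {
            next = prev->vm_next;
            prev->vm_next = vma;
        } else {
            next = mm->mmap;
            mm->mmap = vma;
        }
        vma->vm_next = next;
        if (next)
            next->vm_prev = vma;
    }

    /* Direct unlink, as in the patched delete_vma_from_mm(). */
    static void vma_unlink_list(struct mm_model *mm, struct vma_model *vma)
    {
        if (vma->vm_prev)
            vma->vm_prev->vm_next = vma->vm_next;
        else
            mm->mmap = vma->vm_next;
        if (vma->vm_next)
            vma->vm_next->vm_prev = vma->vm_prev;
    }

    int main(void)
    {
        struct mm_model mm = { NULL };
        struct vma_model a = { 0x1000 }, b = { 0x2000 };

        vma_link_list(&mm, &a, NULL);
        vma_link_list(&mm, &b, &a);
        vma_unlink_list(&mm, &a);

        for (struct vma_model *v = mm.mmap; v; v = v->vm_next)
            printf("vma at %#lx\n", v->vm_start);
        return 0;
    }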
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f52e85c80e8d..e4b0991ca351 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -38,6 +38,33 @@ int sysctl_oom_kill_allocating_task; | |||
38 | int sysctl_oom_dump_tasks = 1; | 38 | int sysctl_oom_dump_tasks = 1; |
39 | static DEFINE_SPINLOCK(zone_scan_lock); | 39 | static DEFINE_SPINLOCK(zone_scan_lock); |
40 | 40 | ||
41 | /** | ||
42 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value | ||
43 | * @new_val: new oom_score_adj value | ||
44 | * | ||
45 | * Sets the oom_score_adj value for current to @new_val with proper | ||
46 | * synchronization and returns the old value. Usually used to temporarily | ||
47 | * set a value, save the old value in the caller, and then reinstate it later. | ||
48 | */ | ||
49 | int test_set_oom_score_adj(int new_val) | ||
50 | { | ||
51 | struct sighand_struct *sighand = current->sighand; | ||
52 | int old_val; | ||
53 | |||
54 | spin_lock_irq(&sighand->siglock); | ||
55 | old_val = current->signal->oom_score_adj; | ||
56 | if (new_val != old_val) { | ||
57 | if (new_val == OOM_SCORE_ADJ_MIN) | ||
58 | atomic_inc(&current->mm->oom_disable_count); ||
59 | else if (old_val == OOM_SCORE_ADJ_MIN) | ||
60 | atomic_dec(&current->mm->oom_disable_count); ||
61 | current->signal->oom_score_adj = new_val; | ||
62 | } | ||
63 | spin_unlock_irq(&sighand->siglock); | ||
64 | |||
65 | return old_val; | ||
66 | } | ||
67 | |||
41 | #ifdef CONFIG_NUMA | 68 | #ifdef CONFIG_NUMA |
42 | /** | 69 | /** |
43 | * has_intersects_mems_allowed() - check task eligibility for kill | 70 |
@@ -155,15 +182,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | |||
155 | } | 182 | } |
156 | 183 | ||
157 | /* | 184 | /* |
158 | * When the PF_OOM_ORIGIN bit is set, it indicates the task should have | ||
159 | * priority for oom killing. | ||
160 | */ | ||
161 | if (p->flags & PF_OOM_ORIGIN) { | ||
162 | task_unlock(p); | ||
163 | return 1000; | ||
164 | } | ||
165 | |||
166 | /* | ||
167 | * The memory controller may have a limit of 0 bytes, so avoid a divide | 185 | * The memory controller may have a limit of 0 bytes, so avoid a divide |
168 | * by zero, if necessary. | 186 | * by zero, if necessary. |
169 | */ | 187 | */ |
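test_set_oom_score_adj() packages the temporarily-override-then-restore idiom: take the siglock, swap in the new oom_score_adj, keep oom_disable_count consistent when the value crosses OOM_SCORE_ADJ_MIN in either direction, and hand back the previous value so the caller can reinstate it later. The intended calling pattern is the save/restore below (a standalone model using a pthread mutex and file-scope variables in place of the per-task fields):

    #include <pthread.h>
    #include <stdio.h>

    #define SCORE_MIN (-1000)             /* plays the role of OOM_SCORE_ADJ_MIN */

    static pthread_mutex_t score_lock = PTHREAD_MUTEX_INITIALIZER;
    static int oom_score_adj;             /* the per-task field in miniature */
    static int oom_disable_count;

    /* Set a new value under the lock and return the old one. */
    static int test_set_score(int new_val)
    {
        int old_val;

        pthread_mutex_lock(&score_lock);
        old_val = oom_score_adj;
        if (new_val != old_val) {
            if (new_val == SCORE_MIN)
                oom_disable_count++;
            else if (old_val == SCORE_MIN)
                oom_disable_count--;
            oom_score_adj = new_val;
        }
        pthread_mutex_unlock(&score_lock);
        return old_val;
    }

    int main(void)
    {
        /* Caller pattern: disable OOM killing around a critical step, restore. */
        int saved = test_set_score(SCORE_MIN);

        printf("during: adj=%d disable_count=%d\n", oom_score_adj, oom_disable_count);

        test_set_score(saved);
        printf("after:  adj=%d disable_count=%d\n", oom_score_adj, oom_disable_count);
        return 0;
    }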
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f8bce264df6..a4e1db3f1981 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/pagevec.h> | 30 | #include <linux/pagevec.h> |
31 | #include <linux/blkdev.h> | 31 | #include <linux/blkdev.h> |
32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/ratelimit.h> | ||
33 | #include <linux/oom.h> | 34 | #include <linux/oom.h> |
34 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
35 | #include <linux/topology.h> | 36 | #include <linux/topology.h> |
@@ -39,6 +40,7 @@ | |||
39 | #include <linux/memory_hotplug.h> | 40 | #include <linux/memory_hotplug.h> |
40 | #include <linux/nodemask.h> | 41 | #include <linux/nodemask.h> |
41 | #include <linux/vmalloc.h> | 42 | #include <linux/vmalloc.h> |
43 | #include <linux/vmstat.h> | ||
42 | #include <linux/mempolicy.h> | 44 | #include <linux/mempolicy.h> |
43 | #include <linux/stop_machine.h> | 45 | #include <linux/stop_machine.h> |
44 | #include <linux/sort.h> | 46 | #include <linux/sort.h> |
@@ -54,6 +56,7 @@ | |||
54 | #include <trace/events/kmem.h> | 56 | #include <trace/events/kmem.h> |
55 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
56 | #include <linux/memcontrol.h> | 58 | #include <linux/memcontrol.h> |
59 | #include <linux/prefetch.h> | ||
57 | 60 | ||
58 | #include <asm/tlbflush.h> | 61 | #include <asm/tlbflush.h> |
59 | #include <asm/div64.h> | 62 | #include <asm/div64.h> |
@@ -1734,6 +1737,45 @@ static inline bool should_suppress_show_mem(void) | |||
1734 | return ret; | 1737 | return ret; |
1735 | } | 1738 | } |
1736 | 1739 | ||
1740 | static DEFINE_RATELIMIT_STATE(nopage_rs, | ||
1741 | DEFAULT_RATELIMIT_INTERVAL, | ||
1742 | DEFAULT_RATELIMIT_BURST); | ||
1743 | |||
1744 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | ||
1745 | { | ||
1746 | va_list args; | ||
1747 | unsigned int filter = SHOW_MEM_FILTER_NODES; | ||
1748 | |||
1749 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | ||
1750 | return; | ||
1751 | |||
1752 | /* | ||
1753 | * This documents exceptions given to allocations in certain | ||
1754 | * contexts that are allowed to allocate outside current's set | ||
1755 | * of allowed nodes. | ||
1756 | */ | ||
1757 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
1758 | if (test_thread_flag(TIF_MEMDIE) || | ||
1759 | (current->flags & (PF_MEMALLOC | PF_EXITING))) | ||
1760 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
1761 | if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) | ||
1762 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
1763 | |||
1764 | if (fmt) { | ||
1765 | printk(KERN_WARNING); | ||
1766 | va_start(args, fmt); | ||
1767 | vprintk(fmt, args); | ||
1768 | va_end(args); | ||
1769 | } | ||
1770 | |||
1771 | pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", | ||
1772 | current->comm, order, gfp_mask); | ||
1773 | |||
1774 | dump_stack(); | ||
1775 | if (!should_suppress_show_mem()) | ||
1776 | show_mem(filter); | ||
1777 | } | ||
1778 | |||
1737 | static inline int | 1779 | static inline int |
1738 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | 1780 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1739 | unsigned long pages_reclaimed) | 1781 | unsigned long pages_reclaimed) |
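warn_alloc_failed() centralizes the page-allocation-failure reporting that used to be open-coded at the nopage: label: it honours __GFP_NOWARN, rate-limits through DEFINE_RATELIMIT_STATE() rather than the coarser printk_ratelimit(), prepends an optional caller-supplied format, and then prints the order/gfp_mask line, a stack dump and the filtered show_mem(). The varargs-plus-rate-limit skeleton looks roughly like this outside the kernel (standalone sketch with a crude time-window limiter; the interval and burst numbers are made up):

    #include <stdarg.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    /* Crude stand-in for the kernel's ratelimit state: at most "burst"
     * messages per "interval" seconds. */
    struct ratelimit {
        time_t window_start;
        int interval;                 /* seconds  */
        int burst;                    /* messages */
        int printed;
    };

    static bool ratelimit_ok(struct ratelimit *rs)
    {
        time_t now = time(NULL);

        if (now - rs->window_start >= rs->interval) {
            rs->window_start = now;
            rs->printed = 0;
        }
        return rs->printed++ < rs->burst;
    }

    static struct ratelimit nopage_rs = { 0, 5, 10, 0 };

    /* Shape of the new helper: optional prefix format, then the fixed report. */
    static void warn_alloc_failed_model(int order, unsigned int gfp_mask,
                                        const char *fmt, ...)
    {
        if (!ratelimit_ok(&nopage_rs))
            return;

        if (fmt) {
            va_list args;
            va_start(args, fmt);
            vfprintf(stderr, fmt, args);
            va_end(args);
        }
        fprintf(stderr, "page allocation failure: order:%d, mode:0x%x\n",
                order, gfp_mask);
        /* dump_stack() / show_mem(filter) would follow in the kernel */
    }

    int main(void)
    {
        for (int i = 0; i < 12; i++)
            warn_alloc_failed_model(2, 0x20, "attempt %d: ", i);
        return 0;
    }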
@@ -2064,6 +2106,7 @@ restart: | |||
2064 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | 2106 | first_zones_zonelist(zonelist, high_zoneidx, NULL, |
2065 | &preferred_zone); | 2107 | &preferred_zone); |
2066 | 2108 | ||
2109 | rebalance: | ||
2067 | /* This is the last chance, in general, before the goto nopage. */ | 2110 | /* This is the last chance, in general, before the goto nopage. */ |
2068 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2111 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2069 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2112 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
@@ -2071,7 +2114,6 @@ restart: | |||
2071 | if (page) | 2114 | if (page) |
2072 | goto got_pg; | 2115 | goto got_pg; |
2073 | 2116 | ||
2074 | rebalance: | ||
2075 | /* Allocate without watermarks if the context allows */ | 2117 | /* Allocate without watermarks if the context allows */ |
2076 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2118 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2077 | page = __alloc_pages_high_priority(gfp_mask, order, | 2119 | page = __alloc_pages_high_priority(gfp_mask, order, |
@@ -2105,7 +2147,7 @@ rebalance: | |||
2105 | sync_migration); | 2147 | sync_migration); |
2106 | if (page) | 2148 | if (page) |
2107 | goto got_pg; | 2149 | goto got_pg; |
2108 | sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); | 2150 | sync_migration = true; |
2109 | 2151 | ||
2110 | /* Try direct reclaim and then allocating */ | 2152 | /* Try direct reclaim and then allocating */ |
2111 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2153 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
@@ -2176,27 +2218,7 @@ rebalance: | |||
2176 | } | 2218 | } |
2177 | 2219 | ||
2178 | nopage: | 2220 | nopage: |
2179 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 2221 | warn_alloc_failed(gfp_mask, order, NULL); |
2180 | unsigned int filter = SHOW_MEM_FILTER_NODES; | ||
2181 | |||
2182 | /* | ||
2183 | * This documents exceptions given to allocations in certain | ||
2184 | * contexts that are allowed to allocate outside current's set | ||
2185 | * of allowed nodes. | ||
2186 | */ | ||
2187 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
2188 | if (test_thread_flag(TIF_MEMDIE) || | ||
2189 | (current->flags & (PF_MEMALLOC | PF_EXITING))) | ||
2190 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
2191 | if (in_interrupt() || !wait) | ||
2192 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
2193 | |||
2194 | pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n", | ||
2195 | current->comm, order, gfp_mask); | ||
2196 | dump_stack(); | ||
2197 | if (!should_suppress_show_mem()) | ||
2198 | show_mem(filter); | ||
2199 | } | ||
2200 | return page; | 2222 | return page; |
2201 | got_pg: | 2223 | got_pg: |
2202 | if (kmemcheck_enabled) | 2224 | if (kmemcheck_enabled) |
@@ -2225,6 +2247,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2225 | 2247 | ||
2226 | if (should_fail_alloc_page(gfp_mask, order)) | 2248 | if (should_fail_alloc_page(gfp_mask, order)) |
2227 | return NULL; | 2249 | return NULL; |
2250 | #ifndef CONFIG_ZONE_DMA | ||
2251 | if (WARN_ON_ONCE(gfp_mask & __GFP_DMA)) | ||
2252 | return NULL; | ||
2253 | #endif | ||
2228 | 2254 | ||
2229 | /* | 2255 | /* |
2230 | * Check the zones suitable for the gfp_mask contain at least one | 2256 | * Check the zones suitable for the gfp_mask contain at least one |
@@ -2472,10 +2498,10 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2472 | #endif | 2498 | #endif |
2473 | 2499 | ||
2474 | /* | 2500 | /* |
2475 | * Determine whether the zone's node should be displayed or not, depending on | 2501 | * Determine whether the node should be displayed or not, depending on whether |
2476 | * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). | 2502 | * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). |
2477 | */ | 2503 | */ |
2478 | static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) | 2504 | bool skip_free_areas_node(unsigned int flags, int nid) |
2479 | { | 2505 | { |
2480 | bool ret = false; | 2506 | bool ret = false; |
2481 | 2507 | ||
@@ -2483,8 +2509,7 @@ static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) | |||
2483 | goto out; | 2509 | goto out; |
2484 | 2510 | ||
2485 | get_mems_allowed(); | 2511 | get_mems_allowed(); |
2486 | ret = !node_isset(zone->zone_pgdat->node_id, | 2512 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2487 | cpuset_current_mems_allowed); | ||
2488 | put_mems_allowed(); | 2513 | put_mems_allowed(); |
2489 | out: | 2514 | out: |
2490 | return ret; | 2515 | return ret; |
@@ -2499,13 +2524,13 @@ out: | |||
2499 | * Suppresses nodes that are not allowed by current's cpuset if | 2524 | * Suppresses nodes that are not allowed by current's cpuset if |
2500 | * SHOW_MEM_FILTER_NODES is passed. | 2525 | * SHOW_MEM_FILTER_NODES is passed. |
2501 | */ | 2526 | */ |
2502 | void __show_free_areas(unsigned int filter) | 2527 | void show_free_areas(unsigned int filter) |
2503 | { | 2528 | { |
2504 | int cpu; | 2529 | int cpu; |
2505 | struct zone *zone; | 2530 | struct zone *zone; |
2506 | 2531 | ||
2507 | for_each_populated_zone(zone) { | 2532 | for_each_populated_zone(zone) { |
2508 | if (skip_free_areas_zone(filter, zone)) | 2533 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2509 | continue; | 2534 | continue; |
2510 | show_node(zone); | 2535 | show_node(zone); |
2511 | printk("%s per-cpu:\n", zone->name); | 2536 | printk("%s per-cpu:\n", zone->name); |
@@ -2548,7 +2573,7 @@ void __show_free_areas(unsigned int filter) | |||
2548 | for_each_populated_zone(zone) { | 2573 | for_each_populated_zone(zone) { |
2549 | int i; | 2574 | int i; |
2550 | 2575 | ||
2551 | if (skip_free_areas_zone(filter, zone)) | 2576 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2552 | continue; | 2577 | continue; |
2553 | show_node(zone); | 2578 | show_node(zone); |
2554 | printk("%s" | 2579 | printk("%s" |
@@ -2617,7 +2642,7 @@ void __show_free_areas(unsigned int filter) | |||
2617 | for_each_populated_zone(zone) { | 2642 | for_each_populated_zone(zone) { |
2618 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 2643 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
2619 | 2644 | ||
2620 | if (skip_free_areas_zone(filter, zone)) | 2645 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2621 | continue; | 2646 | continue; |
2622 | show_node(zone); | 2647 | show_node(zone); |
2623 | printk("%s: ", zone->name); | 2648 | printk("%s: ", zone->name); |
@@ -2638,11 +2663,6 @@ void __show_free_areas(unsigned int filter) | |||
2638 | show_swap_cache_info(); | 2663 | show_swap_cache_info(); |
2639 | } | 2664 | } |
2640 | 2665 | ||
2641 | void show_free_areas(void) | ||
2642 | { | ||
2643 | __show_free_areas(0); | ||
2644 | } | ||
2645 | |||
2646 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | 2666 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) |
2647 | { | 2667 | { |
2648 | zoneref->zone = zone; | 2668 | zoneref->zone = zone; |
@@ -3313,6 +3333,20 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
3313 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 3333 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
3314 | 3334 | ||
3315 | /* | 3335 | /* |
3336 | * Check if a pageblock contains reserved pages | ||
3337 | */ | ||
3338 | static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) | ||
3339 | { | ||
3340 | unsigned long pfn; | ||
3341 | |||
3342 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
3343 | if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) | ||
3344 | return 1; | ||
3345 | } | ||
3346 | return 0; | ||
3347 | } | ||
3348 | |||
3349 | /* | ||
3316 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 3350 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
3317 | * of blocks reserved is based on min_wmark_pages(zone). The memory within | 3351 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
3318 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes | 3352 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
@@ -3321,7 +3355,7 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
3321 | */ | 3355 | */ |
3322 | static void setup_zone_migrate_reserve(struct zone *zone) | 3356 | static void setup_zone_migrate_reserve(struct zone *zone) |
3323 | { | 3357 | { |
3324 | unsigned long start_pfn, pfn, end_pfn; | 3358 | unsigned long start_pfn, pfn, end_pfn, block_end_pfn; |
3325 | struct page *page; | 3359 | struct page *page; |
3326 | unsigned long block_migratetype; | 3360 | unsigned long block_migratetype; |
3327 | int reserve; | 3361 | int reserve; |
@@ -3351,7 +3385,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3351 | continue; | 3385 | continue; |
3352 | 3386 | ||
3353 | /* Blocks with reserved pages will never free, skip them. */ | 3387 | /* Blocks with reserved pages will never free, skip them. */ |
3354 | if (PageReserved(page)) | 3388 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); |
3389 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3355 | continue; | 3390 | continue; |
3356 | 3391 | ||
3357 | block_migratetype = get_pageblock_migratetype(page); | 3392 | block_migratetype = get_pageblock_migratetype(page); |
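
pageblock_is_reserved() replaces the old single-page PageReserved() test with a scan of the whole block, so a MIGRATE_RESERVE candidate is skipped if any page in it is reserved (or not a valid pfn). A minimal userspace model of that check, with a plain bool array standing in for struct page flags and a made-up block size:

/* Userspace model: treat a block as unusable if any page in it is reserved. */
#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_BLOCK 8

static bool block_is_reserved(const bool *reserved, unsigned long start,
                              unsigned long end)
{
        for (unsigned long pfn = start; pfn < end; pfn++) {
                if (reserved[pfn])
                        return true;    /* one reserved page taints the block */
        }
        return false;
}

int main(void)
{
        bool reserved[2 * PAGES_PER_BLOCK] = { 0 };

        reserved[3] = true;     /* block 0 contains a reserved page */

        for (int b = 0; b < 2; b++)
                printf("block %d reserved: %d\n", b,
                       block_is_reserved(reserved, b * PAGES_PER_BLOCK,
                                         (b + 1) * PAGES_PER_BLOCK));
        return 0;
}
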
@@ -3540,7 +3575,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3540 | pcp->batch = PAGE_SHIFT * 8; | 3575 | pcp->batch = PAGE_SHIFT * 8; |
3541 | } | 3576 | } |
3542 | 3577 | ||
3543 | static __meminit void setup_zone_pageset(struct zone *zone) | 3578 | static void setup_zone_pageset(struct zone *zone) |
3544 | { | 3579 | { |
3545 | int cpu; | 3580 | int cpu; |
3546 | 3581 | ||
@@ -4288,10 +4323,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4288 | zone->zone_pgdat = pgdat; | 4323 | zone->zone_pgdat = pgdat; |
4289 | 4324 | ||
4290 | zone_pcp_init(zone); | 4325 | zone_pcp_init(zone); |
4291 | for_each_lru(l) { | 4326 | for_each_lru(l) |
4292 | INIT_LIST_HEAD(&zone->lru[l].list); | 4327 | INIT_LIST_HEAD(&zone->lru[l].list); |
4293 | zone->reclaim_stat.nr_saved_scan[l] = 0; | ||
4294 | } | ||
4295 | zone->reclaim_stat.recent_rotated[0] = 0; | 4328 | zone->reclaim_stat.recent_rotated[0] = 0; |
4296 | zone->reclaim_stat.recent_rotated[1] = 0; | 4329 | zone->reclaim_stat.recent_rotated[1] = 0; |
4297 | zone->reclaim_stat.recent_scanned[0] = 0; | 4330 | zone->reclaim_stat.recent_scanned[0] = 0; |
@@ -5099,7 +5132,7 @@ void setup_per_zone_wmarks(void) | |||
5099 | * 1TB 101 10GB | 5132 | * 1TB 101 10GB |
5100 | * 10TB 320 32GB | 5133 | * 10TB 320 32GB |
5101 | */ | 5134 | */ |
5102 | void calculate_zone_inactive_ratio(struct zone *zone) | 5135 | static void __meminit calculate_zone_inactive_ratio(struct zone *zone) |
5103 | { | 5136 | { |
5104 | unsigned int gb, ratio; | 5137 | unsigned int gb, ratio; |
5105 | 5138 | ||
@@ -5113,7 +5146,7 @@ void calculate_zone_inactive_ratio(struct zone *zone) | |||
5113 | zone->inactive_ratio = ratio; | 5146 | zone->inactive_ratio = ratio; |
5114 | } | 5147 | } |
5115 | 5148 | ||
5116 | static void __init setup_per_zone_inactive_ratio(void) | 5149 | static void __meminit setup_per_zone_inactive_ratio(void) |
5117 | { | 5150 | { |
5118 | struct zone *zone; | 5151 | struct zone *zone; |
5119 | 5152 | ||
@@ -5145,7 +5178,7 @@ static void __init setup_per_zone_inactive_ratio(void) | |||
5145 | * 8192MB: 11584k | 5178 | * 8192MB: 11584k |
5146 | * 16384MB: 16384k | 5179 | * 16384MB: 16384k |
5147 | */ | 5180 | */ |
5148 | static int __init init_per_zone_wmark_min(void) | 5181 | int __meminit init_per_zone_wmark_min(void) |
5149 | { | 5182 | { |
5150 | unsigned long lowmem_kbytes; | 5183 | unsigned long lowmem_kbytes; |
5151 | 5184 | ||
@@ -5157,6 +5190,7 @@ static int __init init_per_zone_wmark_min(void) | |||
5157 | if (min_free_kbytes > 65536) | 5190 | if (min_free_kbytes > 65536) |
5158 | min_free_kbytes = 65536; | 5191 | min_free_kbytes = 65536; |
5159 | setup_per_zone_wmarks(); | 5192 | setup_per_zone_wmarks(); |
5193 | refresh_zone_stat_thresholds(); | ||
5160 | setup_per_zone_lowmem_reserve(); | 5194 | setup_per_zone_lowmem_reserve(); |
5161 | setup_per_zone_inactive_ratio(); | 5195 | setup_per_zone_inactive_ratio(); |
5162 | return 0; | 5196 | return 0; |
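
init_per_zone_wmark_min() sizes min_free_kbytes from the amount of low memory and, per this hunk, now also refreshes the per-cpu stat thresholds before the lowmem reserves are recomputed. The scaling behind the table in the comment above is, to the best of our recollection, roughly sqrt(lowmem_kbytes * 16) clamped to a 128..65536 range (only the upper clamp is visible in the hunk). A quick userspace check of that heuristic, using floating-point sqrt() in place of the kernel's integer square root:

/* Rough check of the min_free_kbytes scaling; matches the table above to
 * within rounding. Userspace sketch only, not kernel code. */
#include <math.h>
#include <stdio.h>

static unsigned long min_free_kbytes_for(unsigned long lowmem_kbytes)
{
        unsigned long v = (unsigned long)sqrt((double)lowmem_kbytes * 16);

        if (v < 128)
                v = 128;
        if (v > 65536)
                v = 65536;
        return v;
}

int main(void)
{
        unsigned long mb[] = { 16, 512, 8192, 16384 };

        for (int i = 0; i < 4; i++)
                printf("%6luMB lowmem -> min_free_kbytes ~ %luk\n",
                       mb[i], min_free_kbytes_for(mb[i] * 1024));
        return 0;
}
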
@@ -5507,10 +5541,8 @@ int set_migratetype_isolate(struct page *page) | |||
5507 | struct memory_isolate_notify arg; | 5541 | struct memory_isolate_notify arg; |
5508 | int notifier_ret; | 5542 | int notifier_ret; |
5509 | int ret = -EBUSY; | 5543 | int ret = -EBUSY; |
5510 | int zone_idx; | ||
5511 | 5544 | ||
5512 | zone = page_zone(page); | 5545 | zone = page_zone(page); |
5513 | zone_idx = zone_idx(zone); | ||
5514 | 5546 | ||
5515 | spin_lock_irqsave(&zone->lock, flags); | 5547 | spin_lock_irqsave(&zone->lock, flags); |
5516 | 5548 | ||
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 2daadc322ba6..74ccff61d1be 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -130,7 +130,7 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) | |||
130 | return page; | 130 | return page; |
131 | } | 131 | } |
132 | 132 | ||
133 | static void *__init_refok alloc_page_cgroup(size_t size, int nid) | 133 | static void *__meminit alloc_page_cgroup(size_t size, int nid) |
134 | { | 134 | { |
135 | void *addr = NULL; | 135 | void *addr = NULL; |
136 | 136 | ||
@@ -162,7 +162,7 @@ static void free_page_cgroup(void *addr) | |||
162 | } | 162 | } |
163 | #endif | 163 | #endif |
164 | 164 | ||
165 | static int __init_refok init_section_page_cgroup(unsigned long pfn) | 165 | static int __meminit init_section_page_cgroup(unsigned long pfn) |
166 | { | 166 | { |
167 | struct page_cgroup *base, *pc; | 167 | struct page_cgroup *base, *pc; |
168 | struct mem_section *section; | 168 | struct mem_section *section; |
@@ -475,7 +475,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
475 | if (!do_swap_account) | 475 | if (!do_swap_account) |
476 | return 0; | 476 | return 0; |
477 | 477 | ||
478 | length = ((max_pages/SC_PER_PAGE) + 1); | 478 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); |
479 | array_size = length * sizeof(void *); | 479 | array_size = length * sizeof(void *); |
480 | 480 | ||
481 | array = vmalloc(array_size); | 481 | array = vmalloc(array_size); |
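
The length calculation switches from `(max_pages/SC_PER_PAGE) + 1`, which always adds an extra page even when max_pages divides evenly, to DIV_ROUND_UP(), which only rounds up when there is a remainder. The difference is easy to demonstrate (2048 below is just a stand-in for SC_PER_PAGE):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long per_page = 2048;           /* stand-in for SC_PER_PAGE */
        unsigned long pages[] = { 2048, 2049, 4096 };

        for (int i = 0; i < 3; i++)
                printf("max_pages=%lu old=%lu new=%lu\n", pages[i],
                       pages[i] / per_page + 1,           /* old: always adds one */
                       DIV_ROUND_UP(pages[i], per_page)); /* new: exact ceiling */
        return 0;
}
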
@@ -492,8 +492,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
492 | /* memory shortage */ | 492 | /* memory shortage */ |
493 | ctrl->map = NULL; | 493 | ctrl->map = NULL; |
494 | ctrl->length = 0; | 494 | ctrl->length = 0; |
495 | vfree(array); | ||
496 | mutex_unlock(&swap_cgroup_mutex); | 495 | mutex_unlock(&swap_cgroup_mutex); |
496 | vfree(array); | ||
497 | goto nomem; | 497 | goto nomem; |
498 | } | 498 | } |
499 | mutex_unlock(&swap_cgroup_mutex); | 499 | mutex_unlock(&swap_cgroup_mutex); |
@@ -508,7 +508,8 @@ nomem: | |||
508 | 508 | ||
509 | void swap_cgroup_swapoff(int type) | 509 | void swap_cgroup_swapoff(int type) |
510 | { | 510 | { |
511 | int i; | 511 | struct page **map; |
512 | unsigned long i, length; | ||
512 | struct swap_cgroup_ctrl *ctrl; | 513 | struct swap_cgroup_ctrl *ctrl; |
513 | 514 | ||
514 | if (!do_swap_account) | 515 | if (!do_swap_account) |
@@ -516,17 +517,20 @@ void swap_cgroup_swapoff(int type) | |||
516 | 517 | ||
517 | mutex_lock(&swap_cgroup_mutex); | 518 | mutex_lock(&swap_cgroup_mutex); |
518 | ctrl = &swap_cgroup_ctrl[type]; | 519 | ctrl = &swap_cgroup_ctrl[type]; |
519 | if (ctrl->map) { | 520 | map = ctrl->map; |
520 | for (i = 0; i < ctrl->length; i++) { | 521 | length = ctrl->length; |
521 | struct page *page = ctrl->map[i]; | 522 | ctrl->map = NULL; |
523 | ctrl->length = 0; | ||
524 | mutex_unlock(&swap_cgroup_mutex); | ||
525 | |||
526 | if (map) { | ||
527 | for (i = 0; i < length; i++) { | ||
528 | struct page *page = map[i]; | ||
522 | if (page) | 529 | if (page) |
523 | __free_page(page); | 530 | __free_page(page); |
524 | } | 531 | } |
525 | vfree(ctrl->map); | 532 | vfree(map); |
526 | ctrl->map = NULL; | ||
527 | ctrl->length = 0; | ||
528 | } | 533 | } |
529 | mutex_unlock(&swap_cgroup_mutex); | ||
530 | } | 534 | } |
531 | 535 | ||
532 | #endif | 536 | #endif |
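
swap_cgroup_swapoff() now detaches the map and length while holding swap_cgroup_mutex, drops the mutex, and only then frees the pages and the array, so the slow teardown no longer runs with the lock held. The same shape in plain pthreads, with all names ours:

/* Detach a structure under a lock, free it after unlocking (sketch). */
#include <pthread.h>
#include <stdlib.h>

struct ctrl {
        pthread_mutex_t lock;
        void **map;
        unsigned long length;
};

static void ctrl_teardown(struct ctrl *c)
{
        void **map;
        unsigned long i, length;

        pthread_mutex_lock(&c->lock);
        map = c->map;                   /* detach while holding the lock */
        length = c->length;
        c->map = NULL;
        c->length = 0;
        pthread_mutex_unlock(&c->lock);

        if (map) {                      /* slow freeing happens unlocked */
                for (i = 0; i < length; i++)
                        free(map[i]);
                free(map);
        }
}

int main(void)
{
        struct ctrl c = { .lock = PTHREAD_MUTEX_INITIALIZER };

        c.length = 4;
        c.map = calloc(c.length, sizeof(void *));
        for (unsigned long i = 0; i < c.length; i++)
                c.map[i] = malloc(16);
        ctrl_teardown(&c);
        return 0;
}
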
diff --git a/mm/percpu.c b/mm/percpu.c index a160db39b810..bf80e55dbed7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1215,8 +1215,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1215 | PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); | 1215 | PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); |
1216 | #ifdef CONFIG_SMP | 1216 | #ifdef CONFIG_SMP |
1217 | PCPU_SETUP_BUG_ON(!ai->static_size); | 1217 | PCPU_SETUP_BUG_ON(!ai->static_size); |
1218 | PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); | ||
1218 | #endif | 1219 | #endif |
1219 | PCPU_SETUP_BUG_ON(!base_addr); | 1220 | PCPU_SETUP_BUG_ON(!base_addr); |
1221 | PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); | ||
1220 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); | 1222 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); |
1221 | PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); | 1223 | PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); |
1222 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); | 1224 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); |
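
The added PCPU_SETUP_BUG_ON() lines assert that __per_cpu_start and base_addr are page aligned using the usual `addr & ~PAGE_MASK` test, which is non-zero exactly when any low-order offset bits are set. For reference, a tiny userspace version assuming a 4 KiB page:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static int page_aligned(uintptr_t addr)
{
        return (addr & ~PAGE_MASK) == 0;   /* low 12 bits must be clear */
}

int main(void)
{
        printf("%d %d\n", page_aligned(0x200000), page_aligned(0x200010));
        return 0;
}
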
@@ -1645,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1645 | /* warn if maximum distance is further than 75% of vmalloc space */ | 1647 | /* warn if maximum distance is further than 75% of vmalloc space */ |
1646 | if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { | 1648 | if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { |
1647 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " | 1649 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " |
1648 | "space 0x%lx\n", | 1650 | "space 0x%lx\n", max_distance, |
1649 | max_distance, VMALLOC_END - VMALLOC_START); | 1651 | (unsigned long)(VMALLOC_END - VMALLOC_START)); |
1650 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | 1652 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK |
1651 | /* and fail if we have fallback */ | 1653 | /* and fail if we have fallback */ |
1652 | rc = -EINVAL; | 1654 | rc = -EINVAL; |
diff --git a/mm/prio_tree.c b/mm/prio_tree.c index 603ae98d9694..799dcfd7cd8c 100644 --- a/mm/prio_tree.c +++ b/mm/prio_tree.c | |||
@@ -13,6 +13,7 @@ | |||
13 | 13 | ||
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | #include <linux/prio_tree.h> | 15 | #include <linux/prio_tree.h> |
16 | #include <linux/prefetch.h> | ||
16 | 17 | ||
17 | /* | 18 | /* |
18 | * See lib/prio_tree.c for details on the general radix priority search tree | 19 | * See lib/prio_tree.c for details on the general radix priority search tree |
diff --git a/mm/readahead.c b/mm/readahead.c index 2c0cc489e288..867f9dd82dcd 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -180,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
180 | if (page) | 180 | if (page) |
181 | continue; | 181 | continue; |
182 | 182 | ||
183 | page = page_cache_alloc_cold(mapping); | 183 | page = page_cache_alloc_readahead(mapping); |
184 | if (!page) | 184 | if (!page) |
185 | break; | 185 | break; |
186 | page->index = page_offset; | 186 | page->index = page_offset; |
@@ -24,8 +24,8 @@ | |||
24 | * inode->i_alloc_sem (vmtruncate_range) | 24 | * inode->i_alloc_sem (vmtruncate_range) |
25 | * mm->mmap_sem | 25 | * mm->mmap_sem |
26 | * page->flags PG_locked (lock_page) | 26 | * page->flags PG_locked (lock_page) |
27 | * mapping->i_mmap_lock | 27 | * mapping->i_mmap_mutex |
28 | * anon_vma->lock | 28 | * anon_vma->mutex |
29 | * mm->page_table_lock or pte_lock | 29 | * mm->page_table_lock or pte_lock |
30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
31 | * swap_lock (in swap_duplicate, swap_info_get) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
@@ -40,7 +40,7 @@ | |||
40 | * | 40 | * |
41 | * (code doesn't rely on that order so it could be switched around) | 41 | * (code doesn't rely on that order so it could be switched around) |
42 | * ->tasklist_lock | 42 | * ->tasklist_lock |
43 | * anon_vma->lock (memory_failure, collect_procs_anon) | 43 | * anon_vma->mutex (memory_failure, collect_procs_anon) |
44 | * pte map lock | 44 | * pte map lock |
45 | */ | 45 | */ |
46 | 46 | ||
@@ -86,6 +86,29 @@ static inline struct anon_vma *anon_vma_alloc(void) | |||
86 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 86 | static inline void anon_vma_free(struct anon_vma *anon_vma) |
87 | { | 87 | { |
88 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 88 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
89 | |||
90 | /* | ||
91 | * Synchronize against page_lock_anon_vma() such that | ||
92 | * we can safely hold the lock without the anon_vma getting | ||
93 | * freed. | ||
94 | * | ||
95 | * Relies on the full mb implied by the atomic_dec_and_test() from | ||
96 | * put_anon_vma() against the acquire barrier implied by | ||
97 | * mutex_trylock() from page_lock_anon_vma(). This orders: | ||
98 | * | ||
99 | * page_lock_anon_vma() VS put_anon_vma() | ||
100 | * mutex_trylock() atomic_dec_and_test() | ||
101 | * LOCK MB | ||
102 | * atomic_read() mutex_is_locked() | ||
103 | * | ||
104 | * LOCK should suffice since the actual taking of the lock must | ||
105 | * happen _before_ what follows. | ||
106 | */ | ||
107 | if (mutex_is_locked(&anon_vma->root->mutex)) { | ||
108 | anon_vma_lock(anon_vma); | ||
109 | anon_vma_unlock(anon_vma); | ||
110 | } | ||
111 | |||
89 | kmem_cache_free(anon_vma_cachep, anon_vma); | 112 | kmem_cache_free(anon_vma_cachep, anon_vma); |
90 | } | 113 | } |
91 | 114 | ||
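
The new block in anon_vma_free() waits for any thread still inside page_lock_anon_vma() by briefly taking and releasing the mutex when it is observed held, and only then lets the object be freed. A stripped-down pthread illustration of that "drain the lock before freeing" idea; the names are ours, and none of the memory-barrier reasoning spelled out in the comment above is modelled here:

#include <pthread.h>
#include <stdlib.h>

struct obj {
        pthread_mutex_t lock;
};

static void obj_free(struct obj *o)
{
        /*
         * If another thread still holds the lock, wait for it to finish by
         * taking and dropping the lock ourselves before freeing. (The kernel
         * only bothers when it observes the mutex held; we do it always.)
         */
        pthread_mutex_lock(&o->lock);
        pthread_mutex_unlock(&o->lock);

        pthread_mutex_destroy(&o->lock);
        free(o);
}

int main(void)
{
        struct obj *o = malloc(sizeof(*o));

        pthread_mutex_init(&o->lock, NULL);
        obj_free(o);
        return 0;
}
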
@@ -307,7 +330,7 @@ static void anon_vma_ctor(void *data) | |||
307 | { | 330 | { |
308 | struct anon_vma *anon_vma = data; | 331 | struct anon_vma *anon_vma = data; |
309 | 332 | ||
310 | spin_lock_init(&anon_vma->lock); | 333 | mutex_init(&anon_vma->mutex); |
311 | atomic_set(&anon_vma->refcount, 0); | 334 | atomic_set(&anon_vma->refcount, 0); |
312 | INIT_LIST_HEAD(&anon_vma->head); | 335 | INIT_LIST_HEAD(&anon_vma->head); |
313 | } | 336 | } |
@@ -320,12 +343,26 @@ void __init anon_vma_init(void) | |||
320 | } | 343 | } |
321 | 344 | ||
322 | /* | 345 | /* |
323 | * Getting a lock on a stable anon_vma from a page off the LRU is | 346 | * Getting a lock on a stable anon_vma from a page off the LRU is tricky! |
324 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 347 | * |
348 | * Since there is no serialization whatsoever against page_remove_rmap() | ||
349 | * the best this function can do is return a locked anon_vma that might | ||
350 | * have been relevant to this page. | ||
351 | * | ||
352 | * The page might have been remapped to a different anon_vma or the anon_vma | ||
353 | * returned may already be freed (and even reused). | ||
354 | * | ||
355 | * All users of this function must be very careful when walking the anon_vma | ||
356 | * chain and verify that the page in question is indeed mapped in it | ||
357 | * [ something equivalent to page_mapped_in_vma() ]. | ||
358 | * | ||
359 | * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() | ||
360 | * that the anon_vma pointer from page->mapping is valid if there is a | ||
361 | * mapcount, we can dereference the anon_vma after observing those. | ||
325 | */ | 362 | */ |
326 | struct anon_vma *__page_lock_anon_vma(struct page *page) | 363 | struct anon_vma *page_get_anon_vma(struct page *page) |
327 | { | 364 | { |
328 | struct anon_vma *anon_vma, *root_anon_vma; | 365 | struct anon_vma *anon_vma = NULL; |
329 | unsigned long anon_mapping; | 366 | unsigned long anon_mapping; |
330 | 367 | ||
331 | rcu_read_lock(); | 368 | rcu_read_lock(); |
@@ -336,32 +373,97 @@ struct anon_vma *__page_lock_anon_vma(struct page *page) | |||
336 | goto out; | 373 | goto out; |
337 | 374 | ||
338 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 375 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
339 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 376 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { |
340 | spin_lock(&root_anon_vma->lock); | 377 | anon_vma = NULL; |
378 | goto out; | ||
379 | } | ||
341 | 380 | ||
342 | /* | 381 | /* |
343 | * If this page is still mapped, then its anon_vma cannot have been | 382 | * If this page is still mapped, then its anon_vma cannot have been |
344 | * freed. But if it has been unmapped, we have no security against | 383 | * freed. But if it has been unmapped, we have no security against the |
345 | * the anon_vma structure being freed and reused (for another anon_vma: | 384 | * anon_vma structure being freed and reused (for another anon_vma: |
346 | * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot | 385 | * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() |
347 | * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting | 386 | * above cannot corrupt). |
348 | * anon_vma->root before page_unlock_anon_vma() is called to unlock. | ||
349 | */ | 387 | */ |
350 | if (page_mapped(page)) | 388 | if (!page_mapped(page)) { |
351 | return anon_vma; | 389 | put_anon_vma(anon_vma); |
390 | anon_vma = NULL; | ||
391 | } | ||
392 | out: | ||
393 | rcu_read_unlock(); | ||
394 | |||
395 | return anon_vma; | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Similar to page_get_anon_vma() except it locks the anon_vma. | ||
400 | * | ||
401 | * It's a little more complex as it tries to keep the fast path to a single | ||
402 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | ||
403 | * reference like with page_get_anon_vma() and then block on the mutex. | ||
404 | */ | ||
405 | struct anon_vma *page_lock_anon_vma(struct page *page) | ||
406 | { | ||
407 | struct anon_vma *anon_vma = NULL; | ||
408 | unsigned long anon_mapping; | ||
409 | |||
410 | rcu_read_lock(); | ||
411 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | ||
412 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | ||
413 | goto out; | ||
414 | if (!page_mapped(page)) | ||
415 | goto out; | ||
416 | |||
417 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | ||
418 | if (mutex_trylock(&anon_vma->root->mutex)) { | ||
419 | /* | ||
420 | * If we observe a !0 refcount, then holding the lock ensures | ||
421 | * the anon_vma will not go away, see __put_anon_vma(). | ||
422 | */ | ||
423 | if (!atomic_read(&anon_vma->refcount)) { | ||
424 | anon_vma_unlock(anon_vma); | ||
425 | anon_vma = NULL; | ||
426 | } | ||
427 | goto out; | ||
428 | } | ||
429 | |||
430 | /* trylock failed, we have to sleep */ | ||
431 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { | ||
432 | anon_vma = NULL; | ||
433 | goto out; | ||
434 | } | ||
435 | |||
436 | if (!page_mapped(page)) { | ||
437 | put_anon_vma(anon_vma); | ||
438 | anon_vma = NULL; | ||
439 | goto out; | ||
440 | } | ||
441 | |||
442 | /* we pinned the anon_vma, it's safe to sleep */ | ||
443 | rcu_read_unlock(); | ||
444 | anon_vma_lock(anon_vma); | ||
445 | |||
446 | if (atomic_dec_and_test(&anon_vma->refcount)) { | ||
447 | /* | ||
448 | * Oops, we held the last refcount, release the lock | ||
449 | * and bail -- can't simply use put_anon_vma() because | ||
450 | * we'll deadlock on the anon_vma_lock() recursion. | ||
451 | */ | ||
452 | anon_vma_unlock(anon_vma); | ||
453 | __put_anon_vma(anon_vma); | ||
454 | anon_vma = NULL; | ||
455 | } | ||
456 | |||
457 | return anon_vma; | ||
352 | 458 | ||
353 | spin_unlock(&root_anon_vma->lock); | ||
354 | out: | 459 | out: |
355 | rcu_read_unlock(); | 460 | rcu_read_unlock(); |
356 | return NULL; | 461 | return anon_vma; |
357 | } | 462 | } |
358 | 463 | ||
359 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 464 | void page_unlock_anon_vma(struct anon_vma *anon_vma) |
360 | __releases(&anon_vma->root->lock) | ||
361 | __releases(RCU) | ||
362 | { | 465 | { |
363 | anon_vma_unlock(anon_vma); | 466 | anon_vma_unlock(anon_vma); |
364 | rcu_read_unlock(); | ||
365 | } | 467 | } |
366 | 468 | ||
367 | /* | 469 | /* |
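
page_get_anon_vma() pins the object with atomic_inc_not_zero(): the reference is taken only if the count has not already reached zero, which is what makes it safe against an anon_vma that is concurrently being freed (and possibly reused under SLAB_DESTROY_BY_RCU). The core of that primitive expressed with C11 atomics in userspace, helper name ours:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the object is still live (refcount != 0). */
static bool inc_not_zero(atomic_int *refcount)
{
        int old = atomic_load(refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                        return true;    /* pinned */
                /* old was reloaded by the failed CAS; retry */
        }
        return false;                   /* already on its way to being freed */
}

int main(void)
{
        atomic_int live = 1, dying = 0;

        printf("live:  %d\n", inc_not_zero(&live));   /* 1: pinned */
        printf("dying: %d\n", inc_not_zero(&dying));  /* 0: refused */
        return 0;
}
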
@@ -646,14 +748,14 @@ static int page_referenced_file(struct page *page, | |||
646 | * The page lock not only makes sure that page->mapping cannot | 748 | * The page lock not only makes sure that page->mapping cannot |
647 | * suddenly be NULLified by truncation, it makes sure that the | 749 | * suddenly be NULLified by truncation, it makes sure that the |
648 | * structure at mapping cannot be freed and reused yet, | 750 | * structure at mapping cannot be freed and reused yet, |
649 | * so we can safely take mapping->i_mmap_lock. | 751 | * so we can safely take mapping->i_mmap_mutex. |
650 | */ | 752 | */ |
651 | BUG_ON(!PageLocked(page)); | 753 | BUG_ON(!PageLocked(page)); |
652 | 754 | ||
653 | spin_lock(&mapping->i_mmap_lock); | 755 | mutex_lock(&mapping->i_mmap_mutex); |
654 | 756 | ||
655 | /* | 757 | /* |
656 | * i_mmap_lock does not stabilize mapcount at all, but mapcount | 758 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount |
657 | * is more likely to be accurate if we note it after spinning. | 759 | * is more likely to be accurate if we note it after spinning. |
658 | */ | 760 | */ |
659 | mapcount = page_mapcount(page); | 761 | mapcount = page_mapcount(page); |
@@ -675,7 +777,7 @@ static int page_referenced_file(struct page *page, | |||
675 | break; | 777 | break; |
676 | } | 778 | } |
677 | 779 | ||
678 | spin_unlock(&mapping->i_mmap_lock); | 780 | mutex_unlock(&mapping->i_mmap_mutex); |
679 | return referenced; | 781 | return referenced; |
680 | } | 782 | } |
681 | 783 | ||
@@ -719,7 +821,7 @@ int page_referenced(struct page *page, | |||
719 | unlock_page(page); | 821 | unlock_page(page); |
720 | } | 822 | } |
721 | out: | 823 | out: |
722 | if (page_test_and_clear_young(page)) | 824 | if (page_test_and_clear_young(page_to_pfn(page))) |
723 | referenced++; | 825 | referenced++; |
724 | 826 | ||
725 | return referenced; | 827 | return referenced; |
@@ -762,7 +864,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
762 | 864 | ||
763 | BUG_ON(PageAnon(page)); | 865 | BUG_ON(PageAnon(page)); |
764 | 866 | ||
765 | spin_lock(&mapping->i_mmap_lock); | 867 | mutex_lock(&mapping->i_mmap_mutex); |
766 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 868 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
767 | if (vma->vm_flags & VM_SHARED) { | 869 | if (vma->vm_flags & VM_SHARED) { |
768 | unsigned long address = vma_address(page, vma); | 870 | unsigned long address = vma_address(page, vma); |
@@ -771,7 +873,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
771 | ret += page_mkclean_one(page, vma, address); | 873 | ret += page_mkclean_one(page, vma, address); |
772 | } | 874 | } |
773 | } | 875 | } |
774 | spin_unlock(&mapping->i_mmap_lock); | 876 | mutex_unlock(&mapping->i_mmap_mutex); |
775 | return ret; | 877 | return ret; |
776 | } | 878 | } |
777 | 879 | ||
@@ -785,10 +887,8 @@ int page_mkclean(struct page *page) | |||
785 | struct address_space *mapping = page_mapping(page); | 887 | struct address_space *mapping = page_mapping(page); |
786 | if (mapping) { | 888 | if (mapping) { |
787 | ret = page_mkclean_file(mapping, page); | 889 | ret = page_mkclean_file(mapping, page); |
788 | if (page_test_dirty(page)) { | 890 | if (page_test_and_clear_dirty(page_to_pfn(page), 1)) |
789 | page_clear_dirty(page, 1); | ||
790 | ret = 1; | 891 | ret = 1; |
791 | } | ||
792 | } | 892 | } |
793 | } | 893 | } |
794 | 894 | ||
@@ -981,10 +1081,9 @@ void page_remove_rmap(struct page *page) | |||
981 | * not if it's in swapcache - there might be another pte slot | 1081 | * not if it's in swapcache - there might be another pte slot |
982 | * containing the swap entry, but page not yet written to swap. | 1082 | * containing the swap entry, but page not yet written to swap. |
983 | */ | 1083 | */ |
984 | if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { | 1084 | if ((!PageAnon(page) || PageSwapCache(page)) && |
985 | page_clear_dirty(page, 1); | 1085 | page_test_and_clear_dirty(page_to_pfn(page), 1)) |
986 | set_page_dirty(page); | 1086 | set_page_dirty(page); |
987 | } | ||
988 | /* | 1087 | /* |
989 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | 1088 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED |
990 | * and not charged by memcg for now. | 1089 | * and not charged by memcg for now. |
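
page_mkclean() and page_remove_rmap() now use a single page_test_and_clear_dirty() call instead of the separate test/clear pair, closing the window in which the bit could change between the two steps. The general test-and-clear shape with C11 atomics looks like this; DIRTY_BIT and the helper name are illustrative, not the kernel's:

#include <stdatomic.h>
#include <stdio.h>

#define DIRTY_BIT 0x1u

/* Atomically clear the dirty bit and report whether it was set. */
static int test_and_clear_dirty(atomic_uint *flags)
{
        unsigned int old = atomic_fetch_and(flags, ~DIRTY_BIT);

        return (old & DIRTY_BIT) != 0;
}

int main(void)
{
        atomic_uint flags = DIRTY_BIT;

        printf("first:  %d\n", test_and_clear_dirty(&flags));  /* 1 */
        printf("second: %d\n", test_and_clear_dirty(&flags));  /* 0 */
        return 0;
}
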
@@ -1122,7 +1221,7 @@ out_mlock: | |||
1122 | /* | 1221 | /* |
1123 | * We need mmap_sem locking; otherwise the VM_LOCKED check gives an | 1222 | * We need mmap_sem locking; otherwise the VM_LOCKED check gives an
1124 | * unstable, racy result. Plus, we can't wait here because | 1223 | * unstable, racy result. Plus, we can't wait here because
1125 | * we now hold anon_vma->lock or mapping->i_mmap_lock. | 1224 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. |
1126 | * if trylock failed, the page remains on the evictable lru and later | 1225 | * if trylock failed, the page remains on the evictable lru and later
1127 | * vmscan could retry to move the page to unevictable lru if the | 1226 | * vmscan could retry to move the page to unevictable lru if the |
1128 | * page is actually mlocked. | 1227 | * page is actually mlocked. |
@@ -1348,7 +1447,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1348 | unsigned long max_nl_size = 0; | 1447 | unsigned long max_nl_size = 0; |
1349 | unsigned int mapcount; | 1448 | unsigned int mapcount; |
1350 | 1449 | ||
1351 | spin_lock(&mapping->i_mmap_lock); | 1450 | mutex_lock(&mapping->i_mmap_mutex); |
1352 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1451 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1353 | unsigned long address = vma_address(page, vma); | 1452 | unsigned long address = vma_address(page, vma); |
1354 | if (address == -EFAULT) | 1453 | if (address == -EFAULT) |
@@ -1394,7 +1493,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1394 | mapcount = page_mapcount(page); | 1493 | mapcount = page_mapcount(page); |
1395 | if (!mapcount) | 1494 | if (!mapcount) |
1396 | goto out; | 1495 | goto out; |
1397 | cond_resched_lock(&mapping->i_mmap_lock); | 1496 | cond_resched(); |
1398 | 1497 | ||
1399 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 1498 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
1400 | if (max_nl_cursor == 0) | 1499 | if (max_nl_cursor == 0) |
@@ -1416,7 +1515,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1416 | } | 1515 | } |
1417 | vma->vm_private_data = (void *) max_nl_cursor; | 1516 | vma->vm_private_data = (void *) max_nl_cursor; |
1418 | } | 1517 | } |
1419 | cond_resched_lock(&mapping->i_mmap_lock); | 1518 | cond_resched(); |
1420 | max_nl_cursor += CLUSTER_SIZE; | 1519 | max_nl_cursor += CLUSTER_SIZE; |
1421 | } while (max_nl_cursor <= max_nl_size); | 1520 | } while (max_nl_cursor <= max_nl_size); |
1422 | 1521 | ||
@@ -1428,7 +1527,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1428 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 1527 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) |
1429 | vma->vm_private_data = NULL; | 1528 | vma->vm_private_data = NULL; |
1430 | out: | 1529 | out: |
1431 | spin_unlock(&mapping->i_mmap_lock); | 1530 | mutex_unlock(&mapping->i_mmap_mutex); |
1432 | return ret; | 1531 | return ret; |
1433 | } | 1532 | } |
1434 | 1533 | ||
@@ -1547,7 +1646,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1547 | 1646 | ||
1548 | if (!mapping) | 1647 | if (!mapping) |
1549 | return ret; | 1648 | return ret; |
1550 | spin_lock(&mapping->i_mmap_lock); | 1649 | mutex_lock(&mapping->i_mmap_mutex); |
1551 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1650 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1552 | unsigned long address = vma_address(page, vma); | 1651 | unsigned long address = vma_address(page, vma); |
1553 | if (address == -EFAULT) | 1652 | if (address == -EFAULT) |
@@ -1561,7 +1660,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1561 | * never contain migration ptes. Decide what to do about this | 1660 | * never contain migration ptes. Decide what to do about this |
1562 | * limitation to linear when we need rmap_walk() on nonlinear. | 1661 | * limitation to linear when we need rmap_walk() on nonlinear. |
1563 | */ | 1662 | */ |
1564 | spin_unlock(&mapping->i_mmap_lock); | 1663 | mutex_unlock(&mapping->i_mmap_mutex); |
1565 | return ret; | 1664 | return ret; |
1566 | } | 1665 | } |
1567 | 1666 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index dfc7069102ee..1acfb2687bfa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -99,6 +99,13 @@ static struct vfsmount *shm_mnt; | |||
99 | /* Pretend that each entry is of this size in directory's i_size */ | 99 | /* Pretend that each entry is of this size in directory's i_size */ |
100 | #define BOGO_DIRENT_SIZE 20 | 100 | #define BOGO_DIRENT_SIZE 20 |
101 | 101 | ||
102 | struct shmem_xattr { | ||
103 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ | ||
104 | char *name; /* xattr name */ | ||
105 | size_t size; | ||
106 | char value[0]; | ||
107 | }; | ||
108 | |||
102 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 109 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ |
103 | enum sgp_type { | 110 | enum sgp_type { |
104 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 111 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
@@ -822,6 +829,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
822 | static void shmem_evict_inode(struct inode *inode) | 829 | static void shmem_evict_inode(struct inode *inode) |
823 | { | 830 | { |
824 | struct shmem_inode_info *info = SHMEM_I(inode); | 831 | struct shmem_inode_info *info = SHMEM_I(inode); |
832 | struct shmem_xattr *xattr, *nxattr; | ||
825 | 833 | ||
826 | if (inode->i_mapping->a_ops == &shmem_aops) { | 834 | if (inode->i_mapping->a_ops == &shmem_aops) { |
827 | truncate_inode_pages(inode->i_mapping, 0); | 835 | truncate_inode_pages(inode->i_mapping, 0); |
@@ -834,6 +842,11 @@ static void shmem_evict_inode(struct inode *inode) | |||
834 | mutex_unlock(&shmem_swaplist_mutex); | 842 | mutex_unlock(&shmem_swaplist_mutex); |
835 | } | 843 | } |
836 | } | 844 | } |
845 | |||
846 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { | ||
847 | kfree(xattr->name); | ||
848 | kfree(xattr); | ||
849 | } | ||
837 | BUG_ON(inode->i_blocks); | 850 | BUG_ON(inode->i_blocks); |
838 | shmem_free_inode(inode->i_sb); | 851 | shmem_free_inode(inode->i_sb); |
839 | end_writeback(inode); | 852 | end_writeback(inode); |
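
shmem_evict_inode() tears down the new xattr_list with list_for_each_entry_safe(), which caches the next pointer before the current entry is freed. The same idiom on a plain singly linked list in userspace (names ours):

#include <stdlib.h>
#include <string.h>

struct xattr_node {
        struct xattr_node *next;
        char *name;
};

/* Free every node: grab 'next' before the current node is destroyed. */
static void free_all(struct xattr_node *head)
{
        struct xattr_node *cur = head, *next;

        while (cur) {
                next = cur->next;       /* the "safe" part of the walk */
                free(cur->name);
                free(cur);
                cur = next;
        }
}

int main(void)
{
        struct xattr_node *head = NULL;

        for (int i = 0; i < 3; i++) {
                struct xattr_node *n = malloc(sizeof(*n));
                n->name = strdup("user.demo");
                n->next = head;
                head = n;
        }
        free_all(head);
        return 0;
}
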
@@ -916,11 +929,12 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
916 | if (size > ENTRIES_PER_PAGE) | 929 | if (size > ENTRIES_PER_PAGE) |
917 | size = ENTRIES_PER_PAGE; | 930 | size = ENTRIES_PER_PAGE; |
918 | offset = shmem_find_swp(entry, ptr, ptr+size); | 931 | offset = shmem_find_swp(entry, ptr, ptr+size); |
932 | shmem_swp_unmap(ptr); | ||
919 | if (offset >= 0) { | 933 | if (offset >= 0) { |
920 | shmem_dir_unmap(dir); | 934 | shmem_dir_unmap(dir); |
935 | ptr = shmem_swp_map(subdir); | ||
921 | goto found; | 936 | goto found; |
922 | } | 937 | } |
923 | shmem_swp_unmap(ptr); | ||
924 | } | 938 | } |
925 | } | 939 | } |
926 | lost1: | 940 | lost1: |
@@ -1291,12 +1305,10 @@ repeat: | |||
1291 | swappage = lookup_swap_cache(swap); | 1305 | swappage = lookup_swap_cache(swap); |
1292 | if (!swappage) { | 1306 | if (!swappage) { |
1293 | shmem_swp_unmap(entry); | 1307 | shmem_swp_unmap(entry); |
1308 | spin_unlock(&info->lock); | ||
1294 | /* here we actually do the io */ | 1309 | /* here we actually do the io */ |
1295 | if (type && !(*type & VM_FAULT_MAJOR)) { | 1310 | if (type) |
1296 | __count_vm_event(PGMAJFAULT); | ||
1297 | *type |= VM_FAULT_MAJOR; | 1311 | *type |= VM_FAULT_MAJOR; |
1298 | } | ||
1299 | spin_unlock(&info->lock); | ||
1300 | swappage = shmem_swapin(swap, gfp, info, idx); | 1312 | swappage = shmem_swapin(swap, gfp, info, idx); |
1301 | if (!swappage) { | 1313 | if (!swappage) { |
1302 | spin_lock(&info->lock); | 1314 | spin_lock(&info->lock); |
@@ -1535,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1535 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1547 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1536 | if (error) | 1548 | if (error) |
1537 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1549 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1538 | 1550 | if (ret & VM_FAULT_MAJOR) { | |
1551 | count_vm_event(PGMAJFAULT); | ||
1552 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | ||
1553 | } | ||
1539 | return ret | VM_FAULT_LOCKED; | 1554 | return ret | VM_FAULT_LOCKED; |
1540 | } | 1555 | } |
1541 | 1556 | ||
@@ -1614,6 +1629,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1614 | spin_lock_init(&info->lock); | 1629 | spin_lock_init(&info->lock); |
1615 | info->flags = flags & VM_NORESERVE; | 1630 | info->flags = flags & VM_NORESERVE; |
1616 | INIT_LIST_HEAD(&info->swaplist); | 1631 | INIT_LIST_HEAD(&info->swaplist); |
1632 | INIT_LIST_HEAD(&info->xattr_list); | ||
1617 | cache_no_acl(inode); | 1633 | cache_no_acl(inode); |
1618 | 1634 | ||
1619 | switch (mode & S_IFMT) { | 1635 | switch (mode & S_IFMT) { |
@@ -2013,9 +2029,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2013 | 2029 | ||
2014 | info = SHMEM_I(inode); | 2030 | info = SHMEM_I(inode); |
2015 | inode->i_size = len-1; | 2031 | inode->i_size = len-1; |
2016 | if (len <= (char *)inode - (char *)info) { | 2032 | if (len <= SHMEM_SYMLINK_INLINE_LEN) { |
2017 | /* do it inline */ | 2033 | /* do it inline */ |
2018 | memcpy(info, symname, len); | 2034 | memcpy(info->inline_symlink, symname, len); |
2019 | inode->i_op = &shmem_symlink_inline_operations; | 2035 | inode->i_op = &shmem_symlink_inline_operations; |
2020 | } else { | 2036 | } else { |
2021 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); | 2037 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); |
@@ -2041,7 +2057,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2041 | 2057 | ||
2042 | static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) | 2058 | static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) |
2043 | { | 2059 | { |
2044 | nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); | 2060 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); |
2045 | return NULL; | 2061 | return NULL; |
2046 | } | 2062 | } |
2047 | 2063 | ||
@@ -2065,63 +2081,253 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co | |||
2065 | } | 2081 | } |
2066 | } | 2082 | } |
2067 | 2083 | ||
2068 | static const struct inode_operations shmem_symlink_inline_operations = { | 2084 | #ifdef CONFIG_TMPFS_XATTR |
2069 | .readlink = generic_readlink, | ||
2070 | .follow_link = shmem_follow_link_inline, | ||
2071 | }; | ||
2072 | |||
2073 | static const struct inode_operations shmem_symlink_inode_operations = { | ||
2074 | .readlink = generic_readlink, | ||
2075 | .follow_link = shmem_follow_link, | ||
2076 | .put_link = shmem_put_link, | ||
2077 | }; | ||
2078 | |||
2079 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2080 | /* | 2085 | /* |
2081 | * Superblocks without xattr inode operations will get security.* xattr | 2086 | * Superblocks without xattr inode operations may get some security.* xattr |
2082 | * support from the VFS "for free". As soon as we have any other xattrs | 2087 | * support from the LSM "for free". As soon as we have any other xattrs |
2083 | * like ACLs, we also need to implement the security.* handlers at | 2088 | * like ACLs, we also need to implement the security.* handlers at |
2084 | * filesystem level, though. | 2089 | * filesystem level, though. |
2085 | */ | 2090 | */ |
2086 | 2091 | ||
2087 | static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, | 2092 | static int shmem_xattr_get(struct dentry *dentry, const char *name, |
2088 | size_t list_len, const char *name, | 2093 | void *buffer, size_t size) |
2089 | size_t name_len, int handler_flags) | ||
2090 | { | 2094 | { |
2091 | return security_inode_listsecurity(dentry->d_inode, list, list_len); | 2095 | struct shmem_inode_info *info; |
2092 | } | 2096 | struct shmem_xattr *xattr; |
2097 | int ret = -ENODATA; | ||
2093 | 2098 | ||
2094 | static int shmem_xattr_security_get(struct dentry *dentry, const char *name, | 2099 | info = SHMEM_I(dentry->d_inode); |
2095 | void *buffer, size_t size, int handler_flags) | 2100 | |
2096 | { | 2101 | spin_lock(&info->lock); |
2097 | if (strcmp(name, "") == 0) | 2102 | list_for_each_entry(xattr, &info->xattr_list, list) { |
2098 | return -EINVAL; | 2103 | if (strcmp(name, xattr->name)) |
2099 | return xattr_getsecurity(dentry->d_inode, name, buffer, size); | 2104 | continue; |
2105 | |||
2106 | ret = xattr->size; | ||
2107 | if (buffer) { | ||
2108 | if (size < xattr->size) | ||
2109 | ret = -ERANGE; | ||
2110 | else | ||
2111 | memcpy(buffer, xattr->value, xattr->size); | ||
2112 | } | ||
2113 | break; | ||
2114 | } | ||
2115 | spin_unlock(&info->lock); | ||
2116 | return ret; | ||
2100 | } | 2117 | } |
2101 | 2118 | ||
2102 | static int shmem_xattr_security_set(struct dentry *dentry, const char *name, | 2119 | static int shmem_xattr_set(struct dentry *dentry, const char *name, |
2103 | const void *value, size_t size, int flags, int handler_flags) | 2120 | const void *value, size_t size, int flags) |
2104 | { | 2121 | { |
2105 | if (strcmp(name, "") == 0) | 2122 | struct inode *inode = dentry->d_inode; |
2106 | return -EINVAL; | 2123 | struct shmem_inode_info *info = SHMEM_I(inode); |
2107 | return security_inode_setsecurity(dentry->d_inode, name, value, | 2124 | struct shmem_xattr *xattr; |
2108 | size, flags); | 2125 | struct shmem_xattr *new_xattr = NULL; |
2126 | size_t len; | ||
2127 | int err = 0; | ||
2128 | |||
2129 | /* value == NULL means remove */ | ||
2130 | if (value) { | ||
2131 | /* wrap around? */ | ||
2132 | len = sizeof(*new_xattr) + size; | ||
2133 | if (len <= sizeof(*new_xattr)) | ||
2134 | return -ENOMEM; | ||
2135 | |||
2136 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
2137 | if (!new_xattr) | ||
2138 | return -ENOMEM; | ||
2139 | |||
2140 | new_xattr->name = kstrdup(name, GFP_KERNEL); | ||
2141 | if (!new_xattr->name) { | ||
2142 | kfree(new_xattr); | ||
2143 | return -ENOMEM; | ||
2144 | } | ||
2145 | |||
2146 | new_xattr->size = size; | ||
2147 | memcpy(new_xattr->value, value, size); | ||
2148 | } | ||
2149 | |||
2150 | spin_lock(&info->lock); | ||
2151 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2152 | if (!strcmp(name, xattr->name)) { | ||
2153 | if (flags & XATTR_CREATE) { | ||
2154 | xattr = new_xattr; | ||
2155 | err = -EEXIST; | ||
2156 | } else if (new_xattr) { | ||
2157 | list_replace(&xattr->list, &new_xattr->list); | ||
2158 | } else { | ||
2159 | list_del(&xattr->list); | ||
2160 | } | ||
2161 | goto out; | ||
2162 | } | ||
2163 | } | ||
2164 | if (flags & XATTR_REPLACE) { | ||
2165 | xattr = new_xattr; | ||
2166 | err = -ENODATA; | ||
2167 | } else { | ||
2168 | list_add(&new_xattr->list, &info->xattr_list); | ||
2169 | xattr = NULL; | ||
2170 | } | ||
2171 | out: | ||
2172 | spin_unlock(&info->lock); | ||
2173 | if (xattr) | ||
2174 | kfree(xattr->name); | ||
2175 | kfree(xattr); | ||
2176 | return err; | ||
2109 | } | 2177 | } |
2110 | 2178 | ||
2111 | static const struct xattr_handler shmem_xattr_security_handler = { | ||
2112 | .prefix = XATTR_SECURITY_PREFIX, | ||
2113 | .list = shmem_xattr_security_list, | ||
2114 | .get = shmem_xattr_security_get, | ||
2115 | .set = shmem_xattr_security_set, | ||
2116 | }; | ||
2117 | 2179 | ||
2118 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 2180 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2181 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2119 | &generic_acl_access_handler, | 2182 | &generic_acl_access_handler, |
2120 | &generic_acl_default_handler, | 2183 | &generic_acl_default_handler, |
2121 | &shmem_xattr_security_handler, | 2184 | #endif |
2122 | NULL | 2185 | NULL |
2123 | }; | 2186 | }; |
2187 | |||
2188 | static int shmem_xattr_validate(const char *name) | ||
2189 | { | ||
2190 | struct { const char *prefix; size_t len; } arr[] = { | ||
2191 | { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, | ||
2192 | { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } | ||
2193 | }; | ||
2194 | int i; | ||
2195 | |||
2196 | for (i = 0; i < ARRAY_SIZE(arr); i++) { | ||
2197 | size_t preflen = arr[i].len; | ||
2198 | if (strncmp(name, arr[i].prefix, preflen) == 0) { | ||
2199 | if (!name[preflen]) | ||
2200 | return -EINVAL; | ||
2201 | return 0; | ||
2202 | } | ||
2203 | } | ||
2204 | return -EOPNOTSUPP; | ||
2205 | } | ||
2206 | |||
2207 | static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, | ||
2208 | void *buffer, size_t size) | ||
2209 | { | ||
2210 | int err; | ||
2211 | |||
2212 | /* | ||
2213 | * If this is a request for a synthetic attribute in the system.* | ||
2214 | * namespace use the generic infrastructure to resolve a handler | ||
2215 | * for it via sb->s_xattr. | ||
2216 | */ | ||
2217 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
2218 | return generic_getxattr(dentry, name, buffer, size); | ||
2219 | |||
2220 | err = shmem_xattr_validate(name); | ||
2221 | if (err) | ||
2222 | return err; | ||
2223 | |||
2224 | return shmem_xattr_get(dentry, name, buffer, size); | ||
2225 | } | ||
2226 | |||
2227 | static int shmem_setxattr(struct dentry *dentry, const char *name, | ||
2228 | const void *value, size_t size, int flags) | ||
2229 | { | ||
2230 | int err; | ||
2231 | |||
2232 | /* | ||
2233 | * If this is a request for a synthetic attribute in the system.* | ||
2234 | * namespace use the generic infrastructure to resolve a handler | ||
2235 | * for it via sb->s_xattr. | ||
2236 | */ | ||
2237 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
2238 | return generic_setxattr(dentry, name, value, size, flags); | ||
2239 | |||
2240 | err = shmem_xattr_validate(name); | ||
2241 | if (err) | ||
2242 | return err; | ||
2243 | |||
2244 | if (size == 0) | ||
2245 | value = ""; /* empty EA, do not remove */ | ||
2246 | |||
2247 | return shmem_xattr_set(dentry, name, value, size, flags); | ||
2248 | |||
2249 | } | ||
2250 | |||
2251 | static int shmem_removexattr(struct dentry *dentry, const char *name) | ||
2252 | { | ||
2253 | int err; | ||
2254 | |||
2255 | /* | ||
2256 | * If this is a request for a synthetic attribute in the system.* | ||
2257 | * namespace use the generic infrastructure to resolve a handler | ||
2258 | * for it via sb->s_xattr. | ||
2259 | */ | ||
2260 | if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
2261 | return generic_removexattr(dentry, name); | ||
2262 | |||
2263 | err = shmem_xattr_validate(name); | ||
2264 | if (err) | ||
2265 | return err; | ||
2266 | |||
2267 | return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); | ||
2268 | } | ||
2269 | |||
2270 | static bool xattr_is_trusted(const char *name) | ||
2271 | { | ||
2272 | return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); | ||
2273 | } | ||
2274 | |||
2275 | static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | ||
2276 | { | ||
2277 | bool trusted = capable(CAP_SYS_ADMIN); | ||
2278 | struct shmem_xattr *xattr; | ||
2279 | struct shmem_inode_info *info; | ||
2280 | size_t used = 0; | ||
2281 | |||
2282 | info = SHMEM_I(dentry->d_inode); | ||
2283 | |||
2284 | spin_lock(&info->lock); | ||
2285 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2286 | size_t len; | ||
2287 | |||
2288 | /* skip "trusted." attributes for unprivileged callers */ | ||
2289 | if (!trusted && xattr_is_trusted(xattr->name)) | ||
2290 | continue; | ||
2291 | |||
2292 | len = strlen(xattr->name) + 1; | ||
2293 | used += len; | ||
2294 | if (buffer) { | ||
2295 | if (size < used) { | ||
2296 | used = -ERANGE; | ||
2297 | break; | ||
2298 | } | ||
2299 | memcpy(buffer, xattr->name, len); | ||
2300 | buffer += len; | ||
2301 | } | ||
2302 | } | ||
2303 | spin_unlock(&info->lock); | ||
2304 | |||
2305 | return used; | ||
2306 | } | ||
2307 | #endif /* CONFIG_TMPFS_XATTR */ | ||
2308 | |||
2309 | static const struct inode_operations shmem_symlink_inline_operations = { | ||
2310 | .readlink = generic_readlink, | ||
2311 | .follow_link = shmem_follow_link_inline, | ||
2312 | #ifdef CONFIG_TMPFS_XATTR | ||
2313 | .setxattr = shmem_setxattr, | ||
2314 | .getxattr = shmem_getxattr, | ||
2315 | .listxattr = shmem_listxattr, | ||
2316 | .removexattr = shmem_removexattr, | ||
2124 | #endif | 2317 | #endif |
2318 | }; | ||
2319 | |||
2320 | static const struct inode_operations shmem_symlink_inode_operations = { | ||
2321 | .readlink = generic_readlink, | ||
2322 | .follow_link = shmem_follow_link, | ||
2323 | .put_link = shmem_put_link, | ||
2324 | #ifdef CONFIG_TMPFS_XATTR | ||
2325 | .setxattr = shmem_setxattr, | ||
2326 | .getxattr = shmem_getxattr, | ||
2327 | .listxattr = shmem_listxattr, | ||
2328 | .removexattr = shmem_removexattr, | ||
2329 | #endif | ||
2330 | }; | ||
2125 | 2331 | ||
2126 | static struct dentry *shmem_get_parent(struct dentry *child) | 2332 | static struct dentry *shmem_get_parent(struct dentry *child) |
2127 | { | 2333 | { |
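
The shmem xattr code above keeps each inode's attributes on a simple linked list under info->lock, with XATTR_CREATE and XATTR_REPLACE deciding whether an existing name may be overwritten or must already exist. A compact, single-threaded userspace model of those set/get semantics; the flag values, helper names and list layout here are ours, not the kernel's, and teardown is omitted for brevity:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define X_CREATE  1     /* fail if the name already exists */
#define X_REPLACE 2     /* fail if the name does not exist */

struct xattr { struct xattr *next; char *name; char *value; };

static struct xattr *list;

static int xattr_set(const char *name, const char *value, int flags)
{
        for (struct xattr *x = list; x; x = x->next) {
                if (strcmp(x->name, name))
                        continue;
                if (flags & X_CREATE)
                        return -EEXIST;         /* already there */
                free(x->value);
                x->value = strdup(value);
                return 0;
        }
        if (flags & X_REPLACE)
                return -ENODATA;                /* nothing to replace */
        struct xattr *x = malloc(sizeof(*x));
        x->name = strdup(name);
        x->value = strdup(value);
        x->next = list;
        list = x;
        return 0;
}

static const char *xattr_get(const char *name)
{
        for (struct xattr *x = list; x; x = x->next)
                if (!strcmp(x->name, name))
                        return x->value;
        return NULL;
}

int main(void)
{
        xattr_set("trusted.demo", "one", 0);
        printf("create again: %d\n", xattr_set("trusted.demo", "two", X_CREATE));
        xattr_set("trusted.demo", "two", X_REPLACE);
        printf("value: %s\n", xattr_get("trusted.demo"));
        return 0;
}
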
@@ -2401,8 +2607,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2401 | sb->s_magic = TMPFS_MAGIC; | 2607 | sb->s_magic = TMPFS_MAGIC; |
2402 | sb->s_op = &shmem_ops; | 2608 | sb->s_op = &shmem_ops; |
2403 | sb->s_time_gran = 1; | 2609 | sb->s_time_gran = 1; |
2404 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2610 | #ifdef CONFIG_TMPFS_XATTR |
2405 | sb->s_xattr = shmem_xattr_handlers; | 2611 | sb->s_xattr = shmem_xattr_handlers; |
2612 | #endif | ||
2613 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2406 | sb->s_flags |= MS_POSIXACL; | 2614 | sb->s_flags |= MS_POSIXACL; |
2407 | #endif | 2615 | #endif |
2408 | 2616 | ||
@@ -2500,11 +2708,13 @@ static const struct file_operations shmem_file_operations = { | |||
2500 | static const struct inode_operations shmem_inode_operations = { | 2708 | static const struct inode_operations shmem_inode_operations = { |
2501 | .setattr = shmem_notify_change, | 2709 | .setattr = shmem_notify_change, |
2502 | .truncate_range = shmem_truncate_range, | 2710 | .truncate_range = shmem_truncate_range, |
2711 | #ifdef CONFIG_TMPFS_XATTR | ||
2712 | .setxattr = shmem_setxattr, | ||
2713 | .getxattr = shmem_getxattr, | ||
2714 | .listxattr = shmem_listxattr, | ||
2715 | .removexattr = shmem_removexattr, | ||
2716 | #endif | ||
2503 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2717 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2504 | .setxattr = generic_setxattr, | ||
2505 | .getxattr = generic_getxattr, | ||
2506 | .listxattr = generic_listxattr, | ||
2507 | .removexattr = generic_removexattr, | ||
2508 | .check_acl = generic_check_acl, | 2718 | .check_acl = generic_check_acl, |
2509 | #endif | 2719 | #endif |
2510 | 2720 | ||
@@ -2522,23 +2732,27 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2522 | .mknod = shmem_mknod, | 2732 | .mknod = shmem_mknod, |
2523 | .rename = shmem_rename, | 2733 | .rename = shmem_rename, |
2524 | #endif | 2734 | #endif |
2735 | #ifdef CONFIG_TMPFS_XATTR | ||
2736 | .setxattr = shmem_setxattr, | ||
2737 | .getxattr = shmem_getxattr, | ||
2738 | .listxattr = shmem_listxattr, | ||
2739 | .removexattr = shmem_removexattr, | ||
2740 | #endif | ||
2525 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2741 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2526 | .setattr = shmem_notify_change, | 2742 | .setattr = shmem_notify_change, |
2527 | .setxattr = generic_setxattr, | ||
2528 | .getxattr = generic_getxattr, | ||
2529 | .listxattr = generic_listxattr, | ||
2530 | .removexattr = generic_removexattr, | ||
2531 | .check_acl = generic_check_acl, | 2743 | .check_acl = generic_check_acl, |
2532 | #endif | 2744 | #endif |
2533 | }; | 2745 | }; |
2534 | 2746 | ||
2535 | static const struct inode_operations shmem_special_inode_operations = { | 2747 | static const struct inode_operations shmem_special_inode_operations = { |
2748 | #ifdef CONFIG_TMPFS_XATTR | ||
2749 | .setxattr = shmem_setxattr, | ||
2750 | .getxattr = shmem_getxattr, | ||
2751 | .listxattr = shmem_listxattr, | ||
2752 | .removexattr = shmem_removexattr, | ||
2753 | #endif | ||
2536 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2754 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2537 | .setattr = shmem_notify_change, | 2755 | .setattr = shmem_notify_change, |
2538 | .setxattr = generic_setxattr, | ||
2539 | .getxattr = generic_getxattr, | ||
2540 | .listxattr = generic_listxattr, | ||
2541 | .removexattr = generic_removexattr, | ||
2542 | .check_acl = generic_check_acl, | 2756 | .check_acl = generic_check_acl, |
2543 | #endif | 2757 | #endif |
2544 | }; | 2758 | }; |
@@ -115,6 +115,7 @@ | |||
115 | #include <linux/debugobjects.h> | 115 | #include <linux/debugobjects.h> |
116 | #include <linux/kmemcheck.h> | 116 | #include <linux/kmemcheck.h> |
117 | #include <linux/memory.h> | 117 | #include <linux/memory.h> |
118 | #include <linux/prefetch.h> | ||
118 | 119 | ||
119 | #include <asm/cacheflush.h> | 120 | #include <asm/cacheflush.h> |
120 | #include <asm/tlbflush.h> | 121 | #include <asm/tlbflush.h> |
@@ -261,6 +261,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) | |||
261 | return *(void **)(object + s->offset); | 261 | return *(void **)(object + s->offset); |
262 | } | 262 | } |
263 | 263 | ||
264 | static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) | ||
265 | { | ||
266 | void *p; | ||
267 | |||
268 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
269 | probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); | ||
270 | #else | ||
271 | p = get_freepointer(s, object); | ||
272 | #endif | ||
273 | return p; | ||
274 | } | ||
275 | |||
264 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | 276 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) |
265 | { | 277 | { |
266 | *(void **)(object + s->offset) = fp; | 278 | *(void **)(object + s->offset) = fp; |
@@ -271,10 +283,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | |||
271 | for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ | 283 | for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ |
272 | __p += (__s)->size) | 284 | __p += (__s)->size) |
273 | 285 | ||
274 | /* Scan freelist */ | ||
275 | #define for_each_free_object(__p, __s, __free) \ | ||
276 | for (__p = (__free); __p; __p = get_freepointer((__s), __p)) | ||
277 | |||
278 | /* Determine object index from a given position */ | 286 | /* Determine object index from a given position */ |
279 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | 287 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) |
280 | { | 288 | { |
@@ -332,6 +340,21 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
332 | 340 | ||
333 | #ifdef CONFIG_SLUB_DEBUG | 341 | #ifdef CONFIG_SLUB_DEBUG |
334 | /* | 342 | /* |
343 | * Determine a map of objects in use on a page. | ||
344 | * | ||
345 | * Slab lock or node listlock must be held to guarantee that the page does | ||
346 | * not vanish from under us. | ||
347 | */ | ||
348 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | ||
349 | { | ||
350 | void *p; | ||
351 | void *addr = page_address(page); | ||
352 | |||
353 | for (p = page->freelist; p; p = get_freepointer(s, p)) | ||
354 | set_bit(slab_index(p, s, addr), map); | ||
355 | } | ||
356 | |||
357 | /* | ||
335 | * Debug settings: | 358 | * Debug settings: |
336 | */ | 359 | */ |
337 | #ifdef CONFIG_SLUB_DEBUG_ON | 360 | #ifdef CONFIG_SLUB_DEBUG_ON |
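
get_map() derives an "object in use" bitmap by walking the slab's freelist and marking every free slot; whatever is left unmarked is allocated. A userspace sketch of the same derivation over an array of objects whose first word acts as the free pointer (this layout is ours, not SLUB's):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_OBJECTS 8

struct object { struct object *free_next; };

int main(void)
{
        struct object slab[NR_OBJECTS];
        bool free_map[NR_OBJECTS];
        struct object *freelist = NULL;

        memset(free_map, 0, sizeof(free_map));

        /* Put objects 1, 3 and 6 on the freelist. */
        int free_idx[] = { 1, 3, 6 };
        for (int i = 0; i < 3; i++) {
                slab[free_idx[i]].free_next = freelist;
                freelist = &slab[free_idx[i]];
        }

        /* get_map()-style walk: mark every object reachable from the freelist. */
        for (struct object *p = freelist; p; p = p->free_next)
                free_map[p - slab] = true;

        for (int i = 0; i < NR_OBJECTS; i++)
                printf("object %d: %s\n", i, free_map[i] ? "free" : "in use");
        return 0;
}
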
@@ -1487,7 +1510,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1487 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; | 1510 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; |
1488 | 1511 | ||
1489 | page = get_partial_node(get_node(s, searchnode)); | 1512 | page = get_partial_node(get_node(s, searchnode)); |
1490 | if (page || node != -1) | 1513 | if (page || node != NUMA_NO_NODE) |
1491 | return page; | 1514 | return page; |
1492 | 1515 | ||
1493 | return get_any_partial(s, flags); | 1516 | return get_any_partial(s, flags); |
@@ -1540,7 +1563,6 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1540 | } | 1563 | } |
1541 | } | 1564 | } |
1542 | 1565 | ||
1543 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1544 | #ifdef CONFIG_PREEMPT | 1566 | #ifdef CONFIG_PREEMPT |
1545 | /* | 1567 | /* |
1546 | * Calculate the next globally unique transaction for disambiguation | 1568 |
@@ -1600,17 +1622,12 @@ static inline void note_cmpxchg_failure(const char *n, | |||
1600 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); | 1622 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); |
1601 | } | 1623 | } |
1602 | 1624 | ||
1603 | #endif | ||
1604 | |||
1605 | void init_kmem_cache_cpus(struct kmem_cache *s) | 1625 | void init_kmem_cache_cpus(struct kmem_cache *s) |
1606 | { | 1626 | { |
1607 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1608 | int cpu; | 1627 | int cpu; |
1609 | 1628 | ||
1610 | for_each_possible_cpu(cpu) | 1629 | for_each_possible_cpu(cpu) |
1611 | per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); | 1630 | per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); |
1612 | #endif | ||
1613 | |||
1614 | } | 1631 | } |
1615 | /* | 1632 | /* |
1616 | * Remove the cpu slab | 1633 | * Remove the cpu slab |
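With the CONFIG_CMPXCHG_LOCAL conditionals gone, every configuration now initializes a per-cpu transaction id. The sketch below follows the scheme described in the comment above: each cpu starts from its own id and advances in fixed steps, so ids from different cpus can never collide; the step constant is an assumption for illustration.

#include <stdio.h>

#define TID_STEP 4   /* a power of two >= the number of cpus (4 here) */

static unsigned long init_tid(int cpu)         { return cpu; }
static unsigned long next_tid(unsigned long t) { return t + TID_STEP; }

int main(void)
{
	unsigned long tid0 = init_tid(0), tid3 = init_tid(3);

	for (int i = 0; i < 3; i++) {   /* three operations on each cpu */
		tid0 = next_tid(tid0);
		tid3 = next_tid(tid3);
	}
	/* tid % TID_STEP always recovers the owning cpu */
	printf("cpu0 tid=%lu (cpu %lu), cpu3 tid=%lu (cpu %lu)\n",
	       tid0, tid0 % TID_STEP, tid3, tid3 % TID_STEP);
	return 0;
}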
@@ -1643,9 +1660,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1643 | page->inuse--; | 1660 | page->inuse--; |
1644 | } | 1661 | } |
1645 | c->page = NULL; | 1662 | c->page = NULL; |
1646 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1647 | c->tid = next_tid(c->tid); | 1663 | c->tid = next_tid(c->tid); |
1648 | #endif | ||
1649 | unfreeze_slab(s, page, tail); | 1664 | unfreeze_slab(s, page, tail); |
1650 | } | 1665 | } |
1651 | 1666 | ||
@@ -1779,8 +1794,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1779 | unsigned long addr, struct kmem_cache_cpu *c) | 1794 | unsigned long addr, struct kmem_cache_cpu *c) |
1780 | { | 1795 | { |
1781 | void **object; | 1796 | void **object; |
1782 | struct page *new; | 1797 | struct page *page; |
1783 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1784 | unsigned long flags; | 1798 | unsigned long flags; |
1785 | 1799 | ||
1786 | local_irq_save(flags); | 1800 | local_irq_save(flags); |
@@ -1792,37 +1806,34 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1792 | */ | 1806 | */ |
1793 | c = this_cpu_ptr(s->cpu_slab); | 1807 | c = this_cpu_ptr(s->cpu_slab); |
1794 | #endif | 1808 | #endif |
1795 | #endif | ||
1796 | 1809 | ||
1797 | /* We handle __GFP_ZERO in the caller */ | 1810 | /* We handle __GFP_ZERO in the caller */ |
1798 | gfpflags &= ~__GFP_ZERO; | 1811 | gfpflags &= ~__GFP_ZERO; |
1799 | 1812 | ||
1800 | if (!c->page) | 1813 | page = c->page; |
1814 | if (!page) | ||
1801 | goto new_slab; | 1815 | goto new_slab; |
1802 | 1816 | ||
1803 | slab_lock(c->page); | 1817 | slab_lock(page); |
1804 | if (unlikely(!node_match(c, node))) | 1818 | if (unlikely(!node_match(c, node))) |
1805 | goto another_slab; | 1819 | goto another_slab; |
1806 | 1820 | ||
1807 | stat(s, ALLOC_REFILL); | 1821 | stat(s, ALLOC_REFILL); |
1808 | 1822 | ||
1809 | load_freelist: | 1823 | load_freelist: |
1810 | object = c->page->freelist; | 1824 | object = page->freelist; |
1811 | if (unlikely(!object)) | 1825 | if (unlikely(!object)) |
1812 | goto another_slab; | 1826 | goto another_slab; |
1813 | if (kmem_cache_debug(s)) | 1827 | if (kmem_cache_debug(s)) |
1814 | goto debug; | 1828 | goto debug; |
1815 | 1829 | ||
1816 | c->freelist = get_freepointer(s, object); | 1830 | c->freelist = get_freepointer(s, object); |
1817 | c->page->inuse = c->page->objects; | 1831 | page->inuse = page->objects; |
1818 | c->page->freelist = NULL; | 1832 | page->freelist = NULL; |
1819 | c->node = page_to_nid(c->page); | 1833 | |
1820 | unlock_out: | 1834 | slab_unlock(page); |
1821 | slab_unlock(c->page); | ||
1822 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1823 | c->tid = next_tid(c->tid); | 1835 | c->tid = next_tid(c->tid); |
1824 | local_irq_restore(flags); | 1836 | local_irq_restore(flags); |
1825 | #endif | ||
1826 | stat(s, ALLOC_SLOWPATH); | 1837 | stat(s, ALLOC_SLOWPATH); |
1827 | return object; | 1838 | return object; |
1828 | 1839 | ||
@@ -1830,10 +1841,11 @@ another_slab: | |||
1830 | deactivate_slab(s, c); | 1841 | deactivate_slab(s, c); |
1831 | 1842 | ||
1832 | new_slab: | 1843 | new_slab: |
1833 | new = get_partial(s, gfpflags, node); | 1844 | page = get_partial(s, gfpflags, node); |
1834 | if (new) { | 1845 | if (page) { |
1835 | c->page = new; | ||
1836 | stat(s, ALLOC_FROM_PARTIAL); | 1846 | stat(s, ALLOC_FROM_PARTIAL); |
1847 | c->node = page_to_nid(page); | ||
1848 | c->page = page; | ||
1837 | goto load_freelist; | 1849 | goto load_freelist; |
1838 | } | 1850 | } |
1839 | 1851 | ||
@@ -1841,35 +1853,38 @@ new_slab: | |||
1841 | if (gfpflags & __GFP_WAIT) | 1853 | if (gfpflags & __GFP_WAIT) |
1842 | local_irq_enable(); | 1854 | local_irq_enable(); |
1843 | 1855 | ||
1844 | new = new_slab(s, gfpflags, node); | 1856 | page = new_slab(s, gfpflags, node); |
1845 | 1857 | ||
1846 | if (gfpflags & __GFP_WAIT) | 1858 | if (gfpflags & __GFP_WAIT) |
1847 | local_irq_disable(); | 1859 | local_irq_disable(); |
1848 | 1860 | ||
1849 | if (new) { | 1861 | if (page) { |
1850 | c = __this_cpu_ptr(s->cpu_slab); | 1862 | c = __this_cpu_ptr(s->cpu_slab); |
1851 | stat(s, ALLOC_SLAB); | 1863 | stat(s, ALLOC_SLAB); |
1852 | if (c->page) | 1864 | if (c->page) |
1853 | flush_slab(s, c); | 1865 | flush_slab(s, c); |
1854 | slab_lock(new); | 1866 | |
1855 | __SetPageSlubFrozen(new); | 1867 | slab_lock(page); |
1856 | c->page = new; | 1868 | __SetPageSlubFrozen(page); |
1869 | c->node = page_to_nid(page); | ||
1870 | c->page = page; | ||
1857 | goto load_freelist; | 1871 | goto load_freelist; |
1858 | } | 1872 | } |
1859 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 1873 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) |
1860 | slab_out_of_memory(s, gfpflags, node); | 1874 | slab_out_of_memory(s, gfpflags, node); |
1861 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1862 | local_irq_restore(flags); | 1875 | local_irq_restore(flags); |
1863 | #endif | ||
1864 | return NULL; | 1876 | return NULL; |
1865 | debug: | 1877 | debug: |
1866 | if (!alloc_debug_processing(s, c->page, object, addr)) | 1878 | if (!alloc_debug_processing(s, page, object, addr)) |
1867 | goto another_slab; | 1879 | goto another_slab; |
1868 | 1880 | ||
1869 | c->page->inuse++; | 1881 | page->inuse++; |
1870 | c->page->freelist = get_freepointer(s, object); | 1882 | page->freelist = get_freepointer(s, object); |
1883 | deactivate_slab(s, c); | ||
1884 | c->page = NULL; | ||
1871 | c->node = NUMA_NO_NODE; | 1885 | c->node = NUMA_NO_NODE; |
1872 | goto unlock_out; | 1886 | local_irq_restore(flags); |
1887 | return object; | ||
1873 | } | 1888 | } |
1874 | 1889 | ||
1875 | /* | 1890 | /* |
@@ -1887,20 +1902,12 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1887 | { | 1902 | { |
1888 | void **object; | 1903 | void **object; |
1889 | struct kmem_cache_cpu *c; | 1904 | struct kmem_cache_cpu *c; |
1890 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1891 | unsigned long tid; | 1905 | unsigned long tid; |
1892 | #else | ||
1893 | unsigned long flags; | ||
1894 | #endif | ||
1895 | 1906 | ||
1896 | if (slab_pre_alloc_hook(s, gfpflags)) | 1907 | if (slab_pre_alloc_hook(s, gfpflags)) |
1897 | return NULL; | 1908 | return NULL; |
1898 | 1909 | ||
1899 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
1900 | local_irq_save(flags); | ||
1901 | #else | ||
1902 | redo: | 1910 | redo: |
1903 | #endif | ||
1904 | 1911 | ||
1905 | /* | 1912 | /* |
1906 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is | 1913 | * Must read kmem_cache cpu data via this cpu ptr. Preemption is |
@@ -1910,7 +1917,6 @@ redo: | |||
1910 | */ | 1917 | */ |
1911 | c = __this_cpu_ptr(s->cpu_slab); | 1918 | c = __this_cpu_ptr(s->cpu_slab); |
1912 | 1919 | ||
1913 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1914 | /* | 1920 | /* |
1915 | * The transaction ids are globally unique per cpu and per operation on | 1921 | * The transaction ids are globally unique per cpu and per operation on |
1916 | * a per cpu queue. Thus they can guarantee that the cmpxchg_double | 1922 |
@@ -1919,7 +1925,6 @@ redo: | |||
1919 | */ | 1925 | */ |
1920 | tid = c->tid; | 1926 | tid = c->tid; |
1921 | barrier(); | 1927 | barrier(); |
1922 | #endif | ||
1923 | 1928 | ||
1924 | object = c->freelist; | 1929 | object = c->freelist; |
1925 | if (unlikely(!object || !node_match(c, node))) | 1930 | if (unlikely(!object || !node_match(c, node))) |
@@ -1927,7 +1932,6 @@ redo: | |||
1927 | object = __slab_alloc(s, gfpflags, node, addr, c); | 1932 | object = __slab_alloc(s, gfpflags, node, addr, c); |
1928 | 1933 | ||
1929 | else { | 1934 | else { |
1930 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
1931 | /* | 1935 | /* |
1932 | * The cmpxchg will only match if there was no additional | 1936 | * The cmpxchg will only match if there was no additional |
1933 | * operation and if we are on the right processor. | 1937 | * operation and if we are on the right processor. |
@@ -1943,21 +1947,14 @@ redo: | |||
1943 | if (unlikely(!irqsafe_cpu_cmpxchg_double( | 1947 | if (unlikely(!irqsafe_cpu_cmpxchg_double( |
1944 | s->cpu_slab->freelist, s->cpu_slab->tid, | 1948 | s->cpu_slab->freelist, s->cpu_slab->tid, |
1945 | object, tid, | 1949 | object, tid, |
1946 | get_freepointer(s, object), next_tid(tid)))) { | 1950 | get_freepointer_safe(s, object), next_tid(tid)))) { |
1947 | 1951 | ||
1948 | note_cmpxchg_failure("slab_alloc", s, tid); | 1952 | note_cmpxchg_failure("slab_alloc", s, tid); |
1949 | goto redo; | 1953 | goto redo; |
1950 | } | 1954 | } |
1951 | #else | ||
1952 | c->freelist = get_freepointer(s, object); | ||
1953 | #endif | ||
1954 | stat(s, ALLOC_FASTPATH); | 1955 | stat(s, ALLOC_FASTPATH); |
1955 | } | 1956 | } |
1956 | 1957 | ||
1957 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
1958 | local_irq_restore(flags); | ||
1959 | #endif | ||
1960 | |||
1961 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 1958 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
1962 | memset(object, 0, s->objsize); | 1959 | memset(object, 0, s->objsize); |
1963 | 1960 | ||
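The now-unconditional fastpath pairs the per-cpu freelist head with the transaction id and replaces both in a single cmpxchg_double, retrying whenever either has changed. Below is a stand-alone C11 analogue of that compare-both-then-swap pattern; it is a simplified sketch, not the kernel code: the next-free pointer is passed in instead of being read from the object, and a plain +1 stands in for next_tid().

#include <stdatomic.h>
#include <stdio.h>

struct cpu_slab {
	void *freelist;
	unsigned long tid;
};

static _Atomic struct cpu_slab slab;

static void *fast_alloc(void *next_free)
{
	for (;;) {
		struct cpu_slab old = atomic_load(&slab);
		void *object = old.freelist;

		if (!object)
			return NULL;   /* would fall back to the slow path */

		struct cpu_slab new = { next_free, old.tid + 1 };
		/* Succeeds only if neither the freelist nor the tid changed. */
		if (atomic_compare_exchange_strong(&slab, &old, new))
			return object;
		/* note_cmpxchg_failure(); retry */
	}
}

int main(void)
{
	int a, b;
	struct cpu_slab init = { &a, 0 };

	atomic_store(&slab, init);
	printf("allocated %p\n", fast_alloc(&b));   /* hands out &a */
	printf("allocated %p\n", fast_alloc(NULL)); /* hands out &b */
	return 0;
}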
@@ -2034,18 +2031,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2034 | { | 2031 | { |
2035 | void *prior; | 2032 | void *prior; |
2036 | void **object = (void *)x; | 2033 | void **object = (void *)x; |
2037 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2038 | unsigned long flags; | 2034 | unsigned long flags; |
2039 | 2035 | ||
2040 | local_irq_save(flags); | 2036 | local_irq_save(flags); |
2041 | #endif | ||
2042 | slab_lock(page); | 2037 | slab_lock(page); |
2043 | stat(s, FREE_SLOWPATH); | 2038 | stat(s, FREE_SLOWPATH); |
2044 | 2039 | ||
2045 | if (kmem_cache_debug(s)) | 2040 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) |
2046 | goto debug; | 2041 | goto out_unlock; |
2047 | 2042 | ||
2048 | checks_ok: | ||
2049 | prior = page->freelist; | 2043 | prior = page->freelist; |
2050 | set_freepointer(s, object, prior); | 2044 | set_freepointer(s, object, prior); |
2051 | page->freelist = object; | 2045 | page->freelist = object; |
@@ -2070,9 +2064,7 @@ checks_ok: | |||
2070 | 2064 | ||
2071 | out_unlock: | 2065 | out_unlock: |
2072 | slab_unlock(page); | 2066 | slab_unlock(page); |
2073 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2074 | local_irq_restore(flags); | 2067 | local_irq_restore(flags); |
2075 | #endif | ||
2076 | return; | 2068 | return; |
2077 | 2069 | ||
2078 | slab_empty: | 2070 | slab_empty: |
@@ -2084,17 +2076,9 @@ slab_empty: | |||
2084 | stat(s, FREE_REMOVE_PARTIAL); | 2076 | stat(s, FREE_REMOVE_PARTIAL); |
2085 | } | 2077 | } |
2086 | slab_unlock(page); | 2078 | slab_unlock(page); |
2087 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2088 | local_irq_restore(flags); | 2079 | local_irq_restore(flags); |
2089 | #endif | ||
2090 | stat(s, FREE_SLAB); | 2080 | stat(s, FREE_SLAB); |
2091 | discard_slab(s, page); | 2081 | discard_slab(s, page); |
2092 | return; | ||
2093 | |||
2094 | debug: | ||
2095 | if (!free_debug_processing(s, page, x, addr)) | ||
2096 | goto out_unlock; | ||
2097 | goto checks_ok; | ||
2098 | } | 2082 | } |
2099 | 2083 | ||
2100 | /* | 2084 | /* |
@@ -2113,20 +2097,11 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
2113 | { | 2097 | { |
2114 | void **object = (void *)x; | 2098 | void **object = (void *)x; |
2115 | struct kmem_cache_cpu *c; | 2099 | struct kmem_cache_cpu *c; |
2116 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2117 | unsigned long tid; | 2100 | unsigned long tid; |
2118 | #else | ||
2119 | unsigned long flags; | ||
2120 | #endif | ||
2121 | 2101 | ||
2122 | slab_free_hook(s, x); | 2102 | slab_free_hook(s, x); |
2123 | 2103 | ||
2124 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
2125 | local_irq_save(flags); | ||
2126 | |||
2127 | #else | ||
2128 | redo: | 2104 | redo: |
2129 | #endif | ||
2130 | 2105 | ||
2131 | /* | 2106 | /* |
2132 | * Determine the current cpu's per cpu slab. | 2107 |
@@ -2136,15 +2111,12 @@ redo: | |||
2136 | */ | 2111 | */ |
2137 | c = __this_cpu_ptr(s->cpu_slab); | 2112 | c = __this_cpu_ptr(s->cpu_slab); |
2138 | 2113 | ||
2139 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2140 | tid = c->tid; | 2114 | tid = c->tid; |
2141 | barrier(); | 2115 | barrier(); |
2142 | #endif | ||
2143 | 2116 | ||
2144 | if (likely(page == c->page && c->node != NUMA_NO_NODE)) { | 2117 | if (likely(page == c->page)) { |
2145 | set_freepointer(s, object, c->freelist); | 2118 | set_freepointer(s, object, c->freelist); |
2146 | 2119 | ||
2147 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2148 | if (unlikely(!irqsafe_cpu_cmpxchg_double( | 2120 | if (unlikely(!irqsafe_cpu_cmpxchg_double( |
2149 | s->cpu_slab->freelist, s->cpu_slab->tid, | 2121 | s->cpu_slab->freelist, s->cpu_slab->tid, |
2150 | c->freelist, tid, | 2122 | c->freelist, tid, |
@@ -2153,16 +2125,10 @@ redo: | |||
2153 | note_cmpxchg_failure("slab_free", s, tid); | 2125 | note_cmpxchg_failure("slab_free", s, tid); |
2154 | goto redo; | 2126 | goto redo; |
2155 | } | 2127 | } |
2156 | #else | ||
2157 | c->freelist = object; | ||
2158 | #endif | ||
2159 | stat(s, FREE_FASTPATH); | 2128 | stat(s, FREE_FASTPATH); |
2160 | } else | 2129 | } else |
2161 | __slab_free(s, page, x, addr); | 2130 | __slab_free(s, page, x, addr); |
2162 | 2131 | ||
2163 | #ifndef CONFIG_CMPXCHG_LOCAL | ||
2164 | local_irq_restore(flags); | ||
2165 | #endif | ||
2166 | } | 2132 | } |
2167 | 2133 | ||
2168 | void kmem_cache_free(struct kmem_cache *s, void *x) | 2134 | void kmem_cache_free(struct kmem_cache *s, void *x) |
@@ -2673,9 +2639,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
2673 | return; | 2639 | return; |
2674 | slab_err(s, page, "%s", text); | 2640 | slab_err(s, page, "%s", text); |
2675 | slab_lock(page); | 2641 | slab_lock(page); |
2676 | for_each_free_object(p, s, page->freelist) | ||
2677 | set_bit(slab_index(p, s, addr), map); | ||
2678 | 2642 | ||
2643 | get_map(s, page, map); | ||
2679 | for_each_object(p, s, addr, page->objects) { | 2644 | for_each_object(p, s, addr, page->objects) { |
2680 | 2645 | ||
2681 | if (!test_bit(slab_index(p, s, addr), map)) { | 2646 | if (!test_bit(slab_index(p, s, addr), map)) { |
@@ -3203,7 +3168,7 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) | |||
3203 | list_for_each_entry(p, &n->partial, lru) | 3168 | list_for_each_entry(p, &n->partial, lru) |
3204 | p->slab = s; | 3169 | p->slab = s; |
3205 | 3170 | ||
3206 | #ifdef CONFIG_SLAB_DEBUG | 3171 | #ifdef CONFIG_SLUB_DEBUG |
3207 | list_for_each_entry(p, &n->full, lru) | 3172 | list_for_each_entry(p, &n->full, lru) |
3208 | p->slab = s; | 3173 | p->slab = s; |
3209 | #endif | 3174 | #endif |
@@ -3610,10 +3575,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3610 | /* Now we know that a valid freelist exists */ | 3575 | /* Now we know that a valid freelist exists */ |
3611 | bitmap_zero(map, page->objects); | 3576 | bitmap_zero(map, page->objects); |
3612 | 3577 | ||
3613 | for_each_free_object(p, s, page->freelist) { | 3578 | get_map(s, page, map); |
3614 | set_bit(slab_index(p, s, addr), map); | 3579 | for_each_object(p, s, addr, page->objects) { |
3615 | if (!check_object(s, page, p, SLUB_RED_INACTIVE)) | 3580 | if (test_bit(slab_index(p, s, addr), map)) |
3616 | return 0; | 3581 | if (!check_object(s, page, p, SLUB_RED_INACTIVE)) |
3582 | return 0; | ||
3617 | } | 3583 | } |
3618 | 3584 | ||
3619 | for_each_object(p, s, addr, page->objects) | 3585 | for_each_object(p, s, addr, page->objects) |
@@ -3821,8 +3787,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, | |||
3821 | void *p; | 3787 | void *p; |
3822 | 3788 | ||
3823 | bitmap_zero(map, page->objects); | 3789 | bitmap_zero(map, page->objects); |
3824 | for_each_free_object(p, s, page->freelist) | 3790 | get_map(s, page, map); |
3825 | set_bit(slab_index(p, s, addr), map); | ||
3826 | 3791 | ||
3827 | for_each_object(p, s, addr, page->objects) | 3792 | for_each_object(p, s, addr, page->objects) |
3828 | if (!test_bit(slab_index(p, s, addr), map)) | 3793 | if (!test_bit(slab_index(p, s, addr), map)) |
@@ -272,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, | |||
272 | memcg_reclaim_stat->recent_rotated[file]++; | 272 | memcg_reclaim_stat->recent_rotated[file]++; |
273 | } | 273 | } |
274 | 274 | ||
275 | /* | 275 | static void __activate_page(struct page *page, void *arg) |
276 | * FIXME: speed this up? | ||
277 | */ | ||
278 | void activate_page(struct page *page) | ||
279 | { | 276 | { |
280 | struct zone *zone = page_zone(page); | 277 | struct zone *zone = page_zone(page); |
281 | 278 | ||
282 | spin_lock_irq(&zone->lru_lock); | ||
283 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 279 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
284 | int file = page_is_file_cache(page); | 280 | int file = page_is_file_cache(page); |
285 | int lru = page_lru_base_type(page); | 281 | int lru = page_lru_base_type(page); |
@@ -292,8 +288,45 @@ void activate_page(struct page *page) | |||
292 | 288 | ||
293 | update_page_reclaim_stat(zone, page, file, 1); | 289 | update_page_reclaim_stat(zone, page, file, 1); |
294 | } | 290 | } |
291 | } | ||
292 | |||
293 | #ifdef CONFIG_SMP | ||
294 | static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); | ||
295 | |||
296 | static void activate_page_drain(int cpu) | ||
297 | { | ||
298 | struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); | ||
299 | |||
300 | if (pagevec_count(pvec)) | ||
301 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
302 | } | ||
303 | |||
304 | void activate_page(struct page *page) | ||
305 | { | ||
306 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | ||
307 | struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); | ||
308 | |||
309 | page_cache_get(page); | ||
310 | if (!pagevec_add(pvec, page)) | ||
311 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | ||
312 | put_cpu_var(activate_page_pvecs); | ||
313 | } | ||
314 | } | ||
315 | |||
316 | #else | ||
317 | static inline void activate_page_drain(int cpu) | ||
318 | { | ||
319 | } | ||
320 | |||
321 | void activate_page(struct page *page) | ||
322 | { | ||
323 | struct zone *zone = page_zone(page); | ||
324 | |||
325 | spin_lock_irq(&zone->lru_lock); | ||
326 | __activate_page(page, NULL); | ||
295 | spin_unlock_irq(&zone->lru_lock); | 327 | spin_unlock_irq(&zone->lru_lock); |
296 | } | 328 | } |
329 | #endif | ||
297 | 330 | ||
298 | /* | 331 | /* |
299 | * Mark a page as having seen activity. | 332 | * Mark a page as having seen activity. |
@@ -464,6 +497,8 @@ static void drain_cpu_pagevecs(int cpu) | |||
464 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); | 497 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); |
465 | if (pagevec_count(pvec)) | 498 | if (pagevec_count(pvec)) |
466 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 499 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); |
500 | |||
501 | activate_page_drain(cpu); | ||
467 | } | 502 | } |
468 | 503 | ||
469 | /** | 504 | /** |
@@ -476,6 +511,13 @@ static void drain_cpu_pagevecs(int cpu) | |||
476 | */ | 511 | */ |
477 | void deactivate_page(struct page *page) | 512 | void deactivate_page(struct page *page) |
478 | { | 513 | { |
514 | /* | ||
515 | * In a workload with many unevictable pages (such as one using mprotect), | ||
516 | * deactivating unevictable pages to accelerate reclaim is pointless. | ||
517 | */ | ||
518 | if (PageUnevictable(page)) | ||
519 | return; | ||
520 | |||
479 | if (likely(get_page_unless_zero(page))) { | 521 | if (likely(get_page_unless_zero(page))) { |
480 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); | 522 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); |
481 | 523 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 8c6b3ce38f09..d537d29e9b7b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
32 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
33 | #include <linux/poll.h> | 33 | #include <linux/poll.h> |
34 | #include <linux/oom.h> | ||
34 | 35 | ||
35 | #include <asm/pgtable.h> | 36 | #include <asm/pgtable.h> |
36 | #include <asm/tlbflush.h> | 37 | #include <asm/tlbflush.h> |
@@ -1555,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1555 | struct address_space *mapping; | 1556 | struct address_space *mapping; |
1556 | struct inode *inode; | 1557 | struct inode *inode; |
1557 | char *pathname; | 1558 | char *pathname; |
1559 | int oom_score_adj; | ||
1558 | int i, type, prev; | 1560 | int i, type, prev; |
1559 | int err; | 1561 | int err; |
1560 | 1562 | ||
@@ -1613,9 +1615,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1613 | p->flags &= ~SWP_WRITEOK; | 1615 | p->flags &= ~SWP_WRITEOK; |
1614 | spin_unlock(&swap_lock); | 1616 | spin_unlock(&swap_lock); |
1615 | 1617 | ||
1616 | current->flags |= PF_OOM_ORIGIN; | 1618 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1617 | err = try_to_unuse(type); | 1619 | err = try_to_unuse(type); |
1618 | current->flags &= ~PF_OOM_ORIGIN; | 1620 | test_set_oom_score_adj(oom_score_adj); |
1619 | 1621 | ||
1620 | if (err) { | 1622 | if (err) { |
1621 | /* | 1623 | /* |
diff --git a/mm/truncate.c b/mm/truncate.c index a95667529135..3a29a6180212 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/task_io_accounting_ops.h> | 19 | #include <linux/task_io_accounting_ops.h> |
20 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 20 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
21 | do_invalidatepage */ | 21 | do_invalidatepage */ |
22 | #include <linux/cleancache.h> | ||
22 | #include "internal.h" | 23 | #include "internal.h" |
23 | 24 | ||
24 | 25 | ||
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) | |||
51 | static inline void truncate_partial_page(struct page *page, unsigned partial) | 52 | static inline void truncate_partial_page(struct page *page, unsigned partial) |
52 | { | 53 | { |
53 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | 54 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
55 | cleancache_flush_page(page->mapping, page); | ||
54 | if (page_has_private(page)) | 56 | if (page_has_private(page)) |
55 | do_invalidatepage(page, partial); | 57 | do_invalidatepage(page, partial); |
56 | } | 58 | } |
@@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
214 | pgoff_t next; | 216 | pgoff_t next; |
215 | int i; | 217 | int i; |
216 | 218 | ||
219 | cleancache_flush_inode(mapping); | ||
217 | if (mapping->nrpages == 0) | 220 | if (mapping->nrpages == 0) |
218 | return; | 221 | return; |
219 | 222 | ||
@@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
291 | pagevec_release(&pvec); | 294 | pagevec_release(&pvec); |
292 | mem_cgroup_uncharge_end(); | 295 | mem_cgroup_uncharge_end(); |
293 | } | 296 | } |
297 | cleancache_flush_inode(mapping); | ||
294 | } | 298 | } |
295 | EXPORT_SYMBOL(truncate_inode_pages_range); | 299 | EXPORT_SYMBOL(truncate_inode_pages_range); |
296 | 300 | ||
@@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
440 | int did_range_unmap = 0; | 444 | int did_range_unmap = 0; |
441 | int wrapped = 0; | 445 | int wrapped = 0; |
442 | 446 | ||
447 | cleancache_flush_inode(mapping); | ||
443 | pagevec_init(&pvec, 0); | 448 | pagevec_init(&pvec, 0); |
444 | next = start; | 449 | next = start; |
445 | while (next <= end && !wrapped && | 450 | while (next <= end && !wrapped && |
@@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
498 | mem_cgroup_uncharge_end(); | 503 | mem_cgroup_uncharge_end(); |
499 | cond_resched(); | 504 | cond_resched(); |
500 | } | 505 | } |
506 | cleancache_flush_inode(mapping); | ||
501 | return ret; | 507 | return ret; |
502 | } | 508 | } |
503 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | 509 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); |
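The hooks added above flush cleancache for the whole inode both before and after the page walk, so entries put concurrently cannot outlive a truncate or invalidate. Below is a toy sketch of that double-flush ordering against a made-up key/value store; everything here (names, structures, sizes) is an illustrative assumption, not the cleancache API:

#include <stdio.h>

#define SLOTS 8

struct entry { int inode; long index; int valid; };
static struct entry cache[SLOTS];

static void cc_put(int inode, long index)
{
	cache[index % SLOTS] = (struct entry){ inode, index, 1 };
}

static void cc_flush_inode(int inode)
{
	for (int i = 0; i < SLOTS; i++)
		if (cache[i].valid && cache[i].inode == inode)
			cache[i].valid = 0;
}

static void truncate_inode(int inode)
{
	cc_flush_inode(inode);       /* before dropping page cache pages */
	/* ... walk and free the inode's pages here ... */
	cc_flush_inode(inode);       /* catch entries added during the walk */
}

int main(void)
{
	cc_put(1, 0);
	cc_put(1, 3);
	truncate_inode(1);
	for (int i = 0; i < SLOTS; i++)
		if (cache[i].valid)
			printf("stale entry for inode %d\n", cache[i].inode);
	printf("done\n");
	return 0;
}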
@@ -6,6 +6,8 @@ | |||
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
8 | 8 | ||
9 | #include "internal.h" | ||
10 | |||
9 | #define CREATE_TRACE_POINTS | 11 | #define CREATE_TRACE_POINTS |
10 | #include <trace/events/kmem.h> | 12 | #include <trace/events/kmem.h> |
11 | 13 | ||
@@ -215,6 +217,28 @@ char *strndup_user(const char __user *s, long n) | |||
215 | } | 217 | } |
216 | EXPORT_SYMBOL(strndup_user); | 218 | EXPORT_SYMBOL(strndup_user); |
217 | 219 | ||
220 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | ||
221 | struct vm_area_struct *prev, struct rb_node *rb_parent) | ||
222 | { | ||
223 | struct vm_area_struct *next; | ||
224 | |||
225 | vma->vm_prev = prev; | ||
226 | if (prev) { | ||
227 | next = prev->vm_next; | ||
228 | prev->vm_next = vma; | ||
229 | } else { | ||
230 | mm->mmap = vma; | ||
231 | if (rb_parent) | ||
232 | next = rb_entry(rb_parent, | ||
233 | struct vm_area_struct, vm_rb); | ||
234 | else | ||
235 | next = NULL; | ||
236 | } | ||
237 | vma->vm_next = next; | ||
238 | if (next) | ||
239 | next->vm_prev = vma; | ||
240 | } | ||
241 | |||
218 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | 242 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
219 | void arch_pick_mmap_layout(struct mm_struct *mm) | 243 | void arch_pick_mmap_layout(struct mm_struct *mm) |
220 | { | 244 | { |
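__vma_link_list(), now shared via util.c, links a VMA in after prev (or at the head of mm->mmap when prev is NULL) and fixes up both forward and backward pointers. A trimmed stand-alone version of the same linking logic; the struct and names below are stand-ins for vm_area_struct, not the kernel types:

#include <stdio.h>

struct region {
	unsigned long start;
	struct region *next, *prev;
};

static struct region *mmap_head;

static void link_region(struct region *r, struct region *prev)
{
	struct region *next;

	r->prev = prev;
	if (prev) {
		next = prev->next;
		prev->next = r;
	} else {
		next = mmap_head;
		mmap_head = r;
	}
	r->next = next;
	if (next)
		next->prev = r;
}

int main(void)
{
	struct region a = { .start = 0x1000 }, b = { .start = 0x3000 };
	struct region c = { .start = 0x2000 };

	link_region(&a, NULL);
	link_region(&b, &a);
	link_region(&c, &a);        /* lands between a and b */

	for (struct region *r = mmap_head; r; r = r->next)
		printf("region at 0x%lx\n", r->start);
	return 0;
}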
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fdf4b1e88e53..1d34d75366a7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -375,7 +375,7 @@ nocache: | |||
375 | /* find starting point for our search */ | 375 | /* find starting point for our search */ |
376 | if (free_vmap_cache) { | 376 | if (free_vmap_cache) { |
377 | first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); | 377 | first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); |
378 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | 378 | addr = ALIGN(first->va_end, align); |
379 | if (addr < vstart) | 379 | if (addr < vstart) |
380 | goto nocache; | 380 | goto nocache; |
381 | if (addr + size - 1 < addr) | 381 | if (addr + size - 1 < addr) |
@@ -406,10 +406,10 @@ nocache: | |||
406 | } | 406 | } |
407 | 407 | ||
408 | /* from the starting point, walk areas until a suitable hole is found */ | 408 | /* from the starting point, walk areas until a suitable hole is found */ |
409 | while (addr + size >= first->va_start && addr + size <= vend) { | 409 | while (addr + size > first->va_start && addr + size <= vend) { |
410 | if (addr + cached_hole_size < first->va_start) | 410 | if (addr + cached_hole_size < first->va_start) |
411 | cached_hole_size = first->va_start - addr; | 411 | cached_hole_size = first->va_start - addr; |
412 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | 412 | addr = ALIGN(first->va_end, align); |
413 | if (addr + size - 1 < addr) | 413 | if (addr + size - 1 < addr) |
414 | goto overflow; | 414 | goto overflow; |
415 | 415 | ||
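Dropping the +PAGE_SIZE means the search now starts at the aligned end of the previous area instead of one page beyond it. A quick check of the arithmetic, with made-up addresses:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long va_end = 0x12345;

	printf("old start: 0x%lx\n", ALIGN(va_end + 4096, 4096)); /* 0x14000 */
	printf("new start: 0x%lx\n", ALIGN(va_end, 4096));        /* 0x13000 */
	return 0;
}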
@@ -1534,6 +1534,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1534 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1534 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1535 | pgprot_t prot, int node, void *caller) | 1535 | pgprot_t prot, int node, void *caller) |
1536 | { | 1536 | { |
1537 | const int order = 0; | ||
1537 | struct page **pages; | 1538 | struct page **pages; |
1538 | unsigned int nr_pages, array_size, i; | 1539 | unsigned int nr_pages, array_size, i; |
1539 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | 1540 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; |
@@ -1560,11 +1561,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1560 | 1561 | ||
1561 | for (i = 0; i < area->nr_pages; i++) { | 1562 | for (i = 0; i < area->nr_pages; i++) { |
1562 | struct page *page; | 1563 | struct page *page; |
1564 | gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; | ||
1563 | 1565 | ||
1564 | if (node < 0) | 1566 | if (node < 0) |
1565 | page = alloc_page(gfp_mask); | 1567 | page = alloc_page(tmp_mask); |
1566 | else | 1568 | else |
1567 | page = alloc_pages_node(node, gfp_mask, 0); | 1569 | page = alloc_pages_node(node, tmp_mask, order); |
1568 | 1570 | ||
1569 | if (unlikely(!page)) { | 1571 | if (unlikely(!page)) { |
1570 | /* Successfully allocated i pages, free them in __vunmap() */ | 1572 | /* Successfully allocated i pages, free them in __vunmap() */ |
@@ -1579,6 +1581,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1579 | return area->addr; | 1581 | return area->addr; |
1580 | 1582 | ||
1581 | fail: | 1583 | fail: |
1584 | warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " | ||
1585 | "allocated %ld of %ld bytes\n", | ||
1586 | (area->nr_pages*PAGE_SIZE), area->size); | ||
1582 | vfree(area->addr); | 1587 | vfree(area->addr); |
1583 | return NULL; | 1588 | return NULL; |
1584 | } | 1589 | } |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8bfd45050a61..faa0a088f9cc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/delayacct.h> | 42 | #include <linux/delayacct.h> |
43 | #include <linux/sysctl.h> | 43 | #include <linux/sysctl.h> |
44 | #include <linux/oom.h> | 44 | #include <linux/oom.h> |
45 | #include <linux/prefetch.h> | ||
45 | 46 | ||
46 | #include <asm/tlbflush.h> | 47 | #include <asm/tlbflush.h> |
47 | #include <asm/div64.h> | 48 | #include <asm/div64.h> |
@@ -172,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, | |||
172 | struct scan_control *sc, enum lru_list lru) | 173 | struct scan_control *sc, enum lru_list lru) |
173 | { | 174 | { |
174 | if (!scanning_global_lru(sc)) | 175 | if (!scanning_global_lru(sc)) |
175 | return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); | 176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); |
176 | 177 | ||
177 | return zone_page_state(zone, NR_LRU_BASE + lru); | 178 | return zone_page_state(zone, NR_LRU_BASE + lru); |
178 | } | 179 | } |
@@ -201,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker) | |||
201 | } | 202 | } |
202 | EXPORT_SYMBOL(unregister_shrinker); | 203 | EXPORT_SYMBOL(unregister_shrinker); |
203 | 204 | ||
205 | static inline int do_shrinker_shrink(struct shrinker *shrinker, | ||
206 | struct shrink_control *sc, | ||
207 | unsigned long nr_to_scan) | ||
208 | { | ||
209 | sc->nr_to_scan = nr_to_scan; | ||
210 | return (*shrinker->shrink)(shrinker, sc); | ||
211 | } | ||
212 | |||
204 | #define SHRINK_BATCH 128 | 213 | #define SHRINK_BATCH 128 |
205 | /* | 214 | /* |
206 | * Call the shrink functions to age shrinkable caches | 215 | * Call the shrink functions to age shrinkable caches |
@@ -221,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
221 | * | 230 | * |
222 | * Returns the number of slab objects which we shrunk. | 231 | * Returns the number of slab objects which we shrunk. |
223 | */ | 232 | */ |
224 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | 233 | unsigned long shrink_slab(struct shrink_control *shrink, |
225 | unsigned long lru_pages) | 234 | unsigned long nr_pages_scanned, |
235 | unsigned long lru_pages) | ||
226 | { | 236 | { |
227 | struct shrinker *shrinker; | 237 | struct shrinker *shrinker; |
228 | unsigned long ret = 0; | 238 | unsigned long ret = 0; |
229 | 239 | ||
230 | if (scanned == 0) | 240 | if (nr_pages_scanned == 0) |
231 | scanned = SWAP_CLUSTER_MAX; | 241 | nr_pages_scanned = SWAP_CLUSTER_MAX; |
232 | 242 | ||
233 | if (!down_read_trylock(&shrinker_rwsem)) | 243 | if (!down_read_trylock(&shrinker_rwsem)) { |
234 | return 1; /* Assume we'll be able to shrink next time */ | 244 | /* Assume we'll be able to shrink next time */ |
245 | ret = 1; | ||
246 | goto out; | ||
247 | } | ||
235 | 248 | ||
236 | list_for_each_entry(shrinker, &shrinker_list, list) { | 249 | list_for_each_entry(shrinker, &shrinker_list, list) { |
237 | unsigned long long delta; | 250 | unsigned long long delta; |
238 | unsigned long total_scan; | 251 | unsigned long total_scan; |
239 | unsigned long max_pass; | 252 | unsigned long max_pass; |
240 | 253 | ||
241 | max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); | 254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
242 | delta = (4 * scanned) / shrinker->seeks; | 255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
243 | delta *= max_pass; | 256 | delta *= max_pass; |
244 | do_div(delta, lru_pages + 1); | 257 | do_div(delta, lru_pages + 1); |
245 | shrinker->nr += delta; | 258 | shrinker->nr += delta; |
@@ -266,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
266 | int shrink_ret; | 279 | int shrink_ret; |
267 | int nr_before; | 280 | int nr_before; |
268 | 281 | ||
269 | nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); | 282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
270 | shrink_ret = (*shrinker->shrink)(shrinker, this_scan, | 283 | shrink_ret = do_shrinker_shrink(shrinker, shrink, |
271 | gfp_mask); | 284 | this_scan); |
272 | if (shrink_ret == -1) | 285 | if (shrink_ret == -1) |
273 | break; | 286 | break; |
274 | if (shrink_ret < nr_before) | 287 | if (shrink_ret < nr_before) |
@@ -282,6 +295,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
282 | shrinker->nr += total_scan; | 295 | shrinker->nr += total_scan; |
283 | } | 296 | } |
284 | up_read(&shrinker_rwsem); | 297 | up_read(&shrinker_rwsem); |
298 | out: | ||
299 | cond_resched(); | ||
285 | return ret; | 300 | return ret; |
286 | } | 301 | } |
287 | 302 | ||
@@ -1201,13 +1216,16 @@ int isolate_lru_page(struct page *page) | |||
1201 | { | 1216 | { |
1202 | int ret = -EBUSY; | 1217 | int ret = -EBUSY; |
1203 | 1218 | ||
1219 | VM_BUG_ON(!page_count(page)); | ||
1220 | |||
1204 | if (PageLRU(page)) { | 1221 | if (PageLRU(page)) { |
1205 | struct zone *zone = page_zone(page); | 1222 | struct zone *zone = page_zone(page); |
1206 | 1223 | ||
1207 | spin_lock_irq(&zone->lru_lock); | 1224 | spin_lock_irq(&zone->lru_lock); |
1208 | if (PageLRU(page) && get_page_unless_zero(page)) { | 1225 | if (PageLRU(page)) { |
1209 | int lru = page_lru(page); | 1226 | int lru = page_lru(page); |
1210 | ret = 0; | 1227 | ret = 0; |
1228 | get_page(page); | ||
1211 | ClearPageLRU(page); | 1229 | ClearPageLRU(page); |
1212 | 1230 | ||
1213 | del_page_from_lru_list(zone, page, lru); | 1231 | del_page_from_lru_list(zone, page, lru); |
@@ -1700,26 +1718,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1700 | } | 1718 | } |
1701 | 1719 | ||
1702 | /* | 1720 | /* |
1703 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
1704 | * until we collected @swap_cluster_max pages to scan. | ||
1705 | */ | ||
1706 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
1707 | unsigned long *nr_saved_scan) | ||
1708 | { | ||
1709 | unsigned long nr; | ||
1710 | |||
1711 | *nr_saved_scan += nr_to_scan; | ||
1712 | nr = *nr_saved_scan; | ||
1713 | |||
1714 | if (nr >= SWAP_CLUSTER_MAX) | ||
1715 | *nr_saved_scan = 0; | ||
1716 | else | ||
1717 | nr = 0; | ||
1718 | |||
1719 | return nr; | ||
1720 | } | ||
1721 | |||
1722 | /* | ||
1723 | * Determine how aggressively the anon and file LRU lists should be | 1721 | * Determine how aggressively the anon and file LRU lists should be |
1724 | * scanned. The relative value of each set of LRU lists is determined | 1722 | * scanned. The relative value of each set of LRU lists is determined |
1725 | * by looking at the fraction of the pages scanned we did rotate back | 1723 | * by looking at the fraction of the pages scanned we did rotate back |
@@ -1737,6 +1735,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1737 | u64 fraction[2], denominator; | 1735 | u64 fraction[2], denominator; |
1738 | enum lru_list l; | 1736 | enum lru_list l; |
1739 | int noswap = 0; | 1737 | int noswap = 0; |
1738 | int force_scan = 0; | ||
1739 | |||
1740 | |||
1741 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1742 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1743 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1744 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1745 | |||
1746 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | ||
1747 | /* kswapd does zone balancing and need to scan this zone */ | ||
1748 | if (scanning_global_lru(sc) && current_is_kswapd()) | ||
1749 | force_scan = 1; | ||
1750 | /* memcg may have a small limit and needs to avoid a priority drop */ | ||
1751 | if (!scanning_global_lru(sc)) | ||
1752 | force_scan = 1; | ||
1753 | } | ||
1740 | 1754 | ||
1741 | /* If we have no swap space, do not bother scanning anon pages. */ | 1755 | /* If we have no swap space, do not bother scanning anon pages. */ |
1742 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1756 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1747,11 +1761,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1747 | goto out; | 1761 | goto out; |
1748 | } | 1762 | } |
1749 | 1763 | ||
1750 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1751 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1752 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1753 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1754 | |||
1755 | if (scanning_global_lru(sc)) { | 1764 | if (scanning_global_lru(sc)) { |
1756 | free = zone_page_state(zone, NR_FREE_PAGES); | 1765 | free = zone_page_state(zone, NR_FREE_PAGES); |
1757 | /* If we have very few page cache pages, | 1766 | /* If we have very few page cache pages, |
@@ -1818,8 +1827,23 @@ out: | |||
1818 | scan >>= priority; | 1827 | scan >>= priority; |
1819 | scan = div64_u64(scan * fraction[file], denominator); | 1828 | scan = div64_u64(scan * fraction[file], denominator); |
1820 | } | 1829 | } |
1821 | nr[l] = nr_scan_try_batch(scan, | 1830 | |
1822 | &reclaim_stat->nr_saved_scan[l]); | 1831 | /* |
1832 | * If the zone or the memcg is small, nr[l] can be 0. That means | ||
1833 | * nothing is scanned at this priority and the priority drops. | ||
1834 | * For global direct reclaim this tends not to be a problem, since | ||
1835 | * the next zone can be visited. Global kswapd does zone balancing | ||
1836 | * and still needs to scan a small amount. With memcg, a priority | ||
1837 | * drop can cause big latency, so it's better to scan a small | ||
1838 | * amount. See force_scan above. | ||
1839 | */ | ||
1840 | if (!scan && force_scan) { | ||
1841 | if (file) | ||
1842 | scan = SWAP_CLUSTER_MAX; | ||
1843 | else if (!noswap) | ||
1844 | scan = SWAP_CLUSTER_MAX; | ||
1845 | } | ||
1846 | nr[l] = scan; | ||
1823 | } | 1847 | } |
1824 | } | 1848 | } |
1825 | 1849 | ||
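The force_scan path above keeps small zones and small memcgs from ending up with a zero scan target and triggering a priority drop. A worked example of the threshold, with invented numbers:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
	unsigned long anon = 800, file = 1200;
	int priority = 6;
	unsigned long scan = (anon + file) >> priority;   /* 2000 >> 6 = 31 */

	if (scan < SWAP_CLUSTER_MAX)
		scan = SWAP_CLUSTER_MAX;  /* force_scan: avoid a priority drop */
	printf("scan target: %lu\n", scan);
	return 0;
}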
@@ -1959,11 +1983,14 @@ restart: | |||
1959 | * If a zone is deemed to be full of pinned pages then just give it a light | 1983 | * If a zone is deemed to be full of pinned pages then just give it a light |
1960 | * scan then give up on it. | 1984 | * scan then give up on it. |
1961 | */ | 1985 | */ |
1962 | static void shrink_zones(int priority, struct zonelist *zonelist, | 1986 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, |
1963 | struct scan_control *sc) | 1987 | struct scan_control *sc) |
1964 | { | 1988 | { |
1965 | struct zoneref *z; | 1989 | struct zoneref *z; |
1966 | struct zone *zone; | 1990 | struct zone *zone; |
1991 | unsigned long nr_soft_reclaimed; | ||
1992 | unsigned long nr_soft_scanned; | ||
1993 | unsigned long total_scanned = 0; | ||
1967 | 1994 | ||
1968 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1995 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1969 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 1996 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -1980,8 +2007,17 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
1980 | continue; /* Let kswapd poll it */ | 2007 | continue; /* Let kswapd poll it */ |
1981 | } | 2008 | } |
1982 | 2009 | ||
2010 | nr_soft_scanned = 0; | ||
2011 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2012 | sc->order, sc->gfp_mask, | ||
2013 | &nr_soft_scanned); | ||
2014 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2015 | total_scanned += nr_soft_scanned; | ||
2016 | |||
1983 | shrink_zone(priority, zone, sc); | 2017 | shrink_zone(priority, zone, sc); |
1984 | } | 2018 | } |
2019 | |||
2020 | return total_scanned; | ||
1985 | } | 2021 | } |
1986 | 2022 | ||
1987 | static bool zone_reclaimable(struct zone *zone) | 2023 | static bool zone_reclaimable(struct zone *zone) |
@@ -2026,7 +2062,8 @@ static bool all_unreclaimable(struct zonelist *zonelist, | |||
2026 | * else, the number of pages reclaimed | 2062 | * else, the number of pages reclaimed |
2027 | */ | 2063 | */ |
2028 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | 2064 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
2029 | struct scan_control *sc) | 2065 | struct scan_control *sc, |
2066 | struct shrink_control *shrink) | ||
2030 | { | 2067 | { |
2031 | int priority; | 2068 | int priority; |
2032 | unsigned long total_scanned = 0; | 2069 | unsigned long total_scanned = 0; |
@@ -2045,7 +2082,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2045 | sc->nr_scanned = 0; | 2082 | sc->nr_scanned = 0; |
2046 | if (!priority) | 2083 | if (!priority) |
2047 | disable_swap_token(); | 2084 | disable_swap_token(); |
2048 | shrink_zones(priority, zonelist, sc); | 2085 | total_scanned += shrink_zones(priority, zonelist, sc); |
2049 | /* | 2086 | /* |
2050 | * Don't shrink slabs when reclaiming memory from | 2087 | * Don't shrink slabs when reclaiming memory from |
2051 | * over limit cgroups | 2088 | * over limit cgroups |
@@ -2060,7 +2097,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2060 | lru_pages += zone_reclaimable_pages(zone); | 2097 | lru_pages += zone_reclaimable_pages(zone); |
2061 | } | 2098 | } |
2062 | 2099 | ||
2063 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); | 2100 | shrink_slab(shrink, sc->nr_scanned, lru_pages); |
2064 | if (reclaim_state) { | 2101 | if (reclaim_state) { |
2065 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | 2102 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
2066 | reclaim_state->reclaimed_slab = 0; | 2103 | reclaim_state->reclaimed_slab = 0; |
@@ -2132,12 +2169,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2132 | .mem_cgroup = NULL, | 2169 | .mem_cgroup = NULL, |
2133 | .nodemask = nodemask, | 2170 | .nodemask = nodemask, |
2134 | }; | 2171 | }; |
2172 | struct shrink_control shrink = { | ||
2173 | .gfp_mask = sc.gfp_mask, | ||
2174 | }; | ||
2135 | 2175 | ||
2136 | trace_mm_vmscan_direct_reclaim_begin(order, | 2176 | trace_mm_vmscan_direct_reclaim_begin(order, |
2137 | sc.may_writepage, | 2177 | sc.may_writepage, |
2138 | gfp_mask); | 2178 | gfp_mask); |
2139 | 2179 | ||
2140 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 2180 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2141 | 2181 | ||
2142 | trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); | 2182 | trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); |
2143 | 2183 | ||
@@ -2149,9 +2189,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2149 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2189 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2150 | gfp_t gfp_mask, bool noswap, | 2190 | gfp_t gfp_mask, bool noswap, |
2151 | unsigned int swappiness, | 2191 | unsigned int swappiness, |
2152 | struct zone *zone) | 2192 | struct zone *zone, |
2193 | unsigned long *nr_scanned) | ||
2153 | { | 2194 | { |
2154 | struct scan_control sc = { | 2195 | struct scan_control sc = { |
2196 | .nr_scanned = 0, | ||
2155 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2197 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2156 | .may_writepage = !laptop_mode, | 2198 | .may_writepage = !laptop_mode, |
2157 | .may_unmap = 1, | 2199 | .may_unmap = 1, |
@@ -2160,6 +2202,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2160 | .order = 0, | 2202 | .order = 0, |
2161 | .mem_cgroup = mem, | 2203 | .mem_cgroup = mem, |
2162 | }; | 2204 | }; |
2205 | |||
2163 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2206 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2164 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2207 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
2165 | 2208 | ||
@@ -2178,6 +2221,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2178 | 2221 | ||
2179 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2222 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2180 | 2223 | ||
2224 | *nr_scanned = sc.nr_scanned; | ||
2181 | return sc.nr_reclaimed; | 2225 | return sc.nr_reclaimed; |
2182 | } | 2226 | } |
2183 | 2227 | ||
@@ -2188,6 +2232,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2188 | { | 2232 | { |
2189 | struct zonelist *zonelist; | 2233 | struct zonelist *zonelist; |
2190 | unsigned long nr_reclaimed; | 2234 | unsigned long nr_reclaimed; |
2235 | int nid; | ||
2191 | struct scan_control sc = { | 2236 | struct scan_control sc = { |
2192 | .may_writepage = !laptop_mode, | 2237 | .may_writepage = !laptop_mode, |
2193 | .may_unmap = 1, | 2238 | .may_unmap = 1, |
@@ -2197,17 +2242,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2197 | .order = 0, | 2242 | .order = 0, |
2198 | .mem_cgroup = mem_cont, | 2243 | .mem_cgroup = mem_cont, |
2199 | .nodemask = NULL, /* we don't care the placement */ | 2244 | .nodemask = NULL, /* we don't care the placement */ |
2245 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | ||
2246 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | ||
2247 | }; | ||
2248 | struct shrink_control shrink = { | ||
2249 | .gfp_mask = sc.gfp_mask, | ||
2200 | }; | 2250 | }; |
2201 | 2251 | ||
2202 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2252 | /* |
2203 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2253 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't |
2204 | zonelist = NODE_DATA(numa_node_id())->node_zonelists; | 2254 | * care about which node the pages come from. So the node where we start the
2255 | * scan does not need to be the current node. | ||
2256 | */ | ||
2257 | nid = mem_cgroup_select_victim_node(mem_cont); | ||
2258 | |||
2259 | zonelist = NODE_DATA(nid)->node_zonelists; | ||
2205 | 2260 | ||
2206 | trace_mm_vmscan_memcg_reclaim_begin(0, | 2261 | trace_mm_vmscan_memcg_reclaim_begin(0, |
2207 | sc.may_writepage, | 2262 | sc.may_writepage, |
2208 | sc.gfp_mask); | 2263 | sc.gfp_mask); |
2209 | 2264 | ||
2210 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 2265 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2211 | 2266 | ||
2212 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 2267 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
2213 | 2268 | ||
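Memcg reclaim now starts from a victim node chosen by mem_cgroup_select_victim_node() instead of always starting from the current node. Below is a sketch of one plausible selection policy, a simple round-robin over nodes that still hold pages for the group; the policy and the structures are assumptions for illustration only:

#include <stdio.h>

#define MAX_NODES 4

struct memcg { int last_node; unsigned long pages[MAX_NODES]; };

static int select_victim_node(struct memcg *mc)
{
	for (int i = 1; i <= MAX_NODES; i++) {
		int nid = (mc->last_node + i) % MAX_NODES;

		if (mc->pages[nid]) {
			mc->last_node = nid;
			return nid;
		}
	}
	return 0;   /* fall back to node 0 */
}

int main(void)
{
	struct memcg mc = { .last_node = 0, .pages = { 10, 0, 7, 3 } };

	for (int i = 0; i < 4; i++)
		printf("start scan on node %d\n", select_victim_node(&mc));
	return 0;
}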
@@ -2286,7 +2341,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2286 | * must be balanced | 2341 | * must be balanced |
2287 | */ | 2342 | */ |
2288 | if (order) | 2343 | if (order) |
2289 | return pgdat_balanced(pgdat, balanced, classzone_idx); | 2344 | return !pgdat_balanced(pgdat, balanced, classzone_idx); |
2290 | else | 2345 | else |
2291 | return !all_zones_ok; | 2346 | return !all_zones_ok; |
2292 | } | 2347 | } |
@@ -2322,6 +2377,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2322 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2377 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2323 | unsigned long total_scanned; | 2378 | unsigned long total_scanned; |
2324 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2379 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2380 | unsigned long nr_soft_reclaimed; | ||
2381 | unsigned long nr_soft_scanned; | ||
2325 | struct scan_control sc = { | 2382 | struct scan_control sc = { |
2326 | .gfp_mask = GFP_KERNEL, | 2383 | .gfp_mask = GFP_KERNEL, |
2327 | .may_unmap = 1, | 2384 | .may_unmap = 1, |
@@ -2335,6 +2392,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2335 | .order = order, | 2392 | .order = order, |
2336 | .mem_cgroup = NULL, | 2393 | .mem_cgroup = NULL, |
2337 | }; | 2394 | }; |
2395 | struct shrink_control shrink = { | ||
2396 | .gfp_mask = sc.gfp_mask, | ||
2397 | }; | ||
2338 | loop_again: | 2398 | loop_again: |
2339 | total_scanned = 0; | 2399 | total_scanned = 0; |
2340 | sc.nr_reclaimed = 0; | 2400 | sc.nr_reclaimed = 0; |
@@ -2411,11 +2471,15 @@ loop_again: | |||
2411 | 2471 | ||
2412 | sc.nr_scanned = 0; | 2472 | sc.nr_scanned = 0; |
2413 | 2473 | ||
2474 | nr_soft_scanned = 0; | ||
2414 | /* | 2475 | /* |
2415 | * Call soft limit reclaim before calling shrink_zone. | 2476 | * Call soft limit reclaim before calling shrink_zone. |
2416 | * For now we ignore the return value | ||
2417 | */ | 2477 | */ |
2418 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); | 2478 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, |
2479 | order, sc.gfp_mask, | ||
2480 | &nr_soft_scanned); | ||
2481 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
2482 | total_scanned += nr_soft_scanned; | ||
2419 | 2483 | ||
2420 | /* | 2484 | /* |
2421 | * We put equal pressure on every zone, unless | 2485 | * We put equal pressure on every zone, unless |
@@ -2434,8 +2498,7 @@ loop_again: | |||
2434 | end_zone, 0)) | 2498 | end_zone, 0)) |
2435 | shrink_zone(priority, zone, &sc); | 2499 | shrink_zone(priority, zone, &sc); |
2436 | reclaim_state->reclaimed_slab = 0; | 2500 | reclaim_state->reclaimed_slab = 0; |
2437 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 2501 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
2438 | lru_pages); | ||
2439 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2502 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2440 | total_scanned += sc.nr_scanned; | 2503 | total_scanned += sc.nr_scanned; |
2441 | 2504 | ||
@@ -2787,7 +2850,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2787 | .swappiness = vm_swappiness, | 2850 | .swappiness = vm_swappiness, |
2788 | .order = 0, | 2851 | .order = 0, |
2789 | }; | 2852 | }; |
2790 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | 2853 | struct shrink_control shrink = { |
2854 | .gfp_mask = sc.gfp_mask, | ||
2855 | }; | ||
2856 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
2791 | struct task_struct *p = current; | 2857 | struct task_struct *p = current; |
2792 | unsigned long nr_reclaimed; | 2858 | unsigned long nr_reclaimed; |
2793 | 2859 | ||
@@ -2796,7 +2862,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2796 | reclaim_state.reclaimed_slab = 0; | 2862 | reclaim_state.reclaimed_slab = 0; |
2797 | p->reclaim_state = &reclaim_state; | 2863 | p->reclaim_state = &reclaim_state; |
2798 | 2864 | ||
2799 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 2865 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2800 | 2866 | ||
2801 | p->reclaim_state = NULL; | 2867 | p->reclaim_state = NULL; |
2802 | lockdep_clear_current_reclaim_state(); | 2868 | lockdep_clear_current_reclaim_state(); |
@@ -2971,6 +3037,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2971 | .swappiness = vm_swappiness, | 3037 | .swappiness = vm_swappiness, |
2972 | .order = order, | 3038 | .order = order, |
2973 | }; | 3039 | }; |
3040 | struct shrink_control shrink = { | ||
3041 | .gfp_mask = sc.gfp_mask, | ||
3042 | }; | ||
2974 | unsigned long nr_slab_pages0, nr_slab_pages1; | 3043 | unsigned long nr_slab_pages0, nr_slab_pages1; |
2975 | 3044 | ||
2976 | cond_resched(); | 3045 | cond_resched(); |
@@ -3012,7 +3081,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3012 | unsigned long lru_pages = zone_reclaimable_pages(zone); | 3081 | unsigned long lru_pages = zone_reclaimable_pages(zone); |
3013 | 3082 | ||
3014 | /* No reclaimable slab or very low memory pressure */ | 3083 | /* No reclaimable slab or very low memory pressure */ |
3015 | if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) | 3084 | if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) |
3016 | break; | 3085 | break; |
3017 | 3086 | ||
3018 | /* Freed enough memory */ | 3087 | /* Freed enough memory */ |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 897ea9e88238..20c18b7694b2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -157,7 +157,7 @@ int calculate_normal_threshold(struct zone *zone) | |||
157 | /* | 157 | /* |
158 | * Refresh the thresholds for each zone. | 158 | * Refresh the thresholds for each zone. |
159 | */ | 159 | */ |
160 | static void refresh_zone_stat_thresholds(void) | 160 | void refresh_zone_stat_thresholds(void) |
161 | { | 161 | { |
162 | struct zone *zone; | 162 | struct zone *zone; |
163 | int cpu; | 163 | int cpu; |
@@ -659,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
659 | } | 659 | } |
660 | #endif | 660 | #endif |
661 | 661 | ||
662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) | ||
663 | #ifdef CONFIG_ZONE_DMA | ||
664 | #define TEXT_FOR_DMA(xx) xx "_dma", | ||
665 | #else | ||
666 | #define TEXT_FOR_DMA(xx) | ||
667 | #endif | ||
668 | |||
669 | #ifdef CONFIG_ZONE_DMA32 | ||
670 | #define TEXT_FOR_DMA32(xx) xx "_dma32", | ||
671 | #else | ||
672 | #define TEXT_FOR_DMA32(xx) | ||
673 | #endif | ||
674 | |||
675 | #ifdef CONFIG_HIGHMEM | ||
676 | #define TEXT_FOR_HIGHMEM(xx) xx "_high", | ||
677 | #else | ||
678 | #define TEXT_FOR_HIGHMEM(xx) | ||
679 | #endif | ||
680 | |||
681 | #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ | ||
682 | TEXT_FOR_HIGHMEM(xx) xx "_movable", | ||
683 | |||
684 | const char * const vmstat_text[] = { | ||
685 | /* Zoned VM counters */ | ||
686 | "nr_free_pages", | ||
687 | "nr_inactive_anon", | ||
688 | "nr_active_anon", | ||
689 | "nr_inactive_file", | ||
690 | "nr_active_file", | ||
691 | "nr_unevictable", | ||
692 | "nr_mlock", | ||
693 | "nr_anon_pages", | ||
694 | "nr_mapped", | ||
695 | "nr_file_pages", | ||
696 | "nr_dirty", | ||
697 | "nr_writeback", | ||
698 | "nr_slab_reclaimable", | ||
699 | "nr_slab_unreclaimable", | ||
700 | "nr_page_table_pages", | ||
701 | "nr_kernel_stack", | ||
702 | "nr_unstable", | ||
703 | "nr_bounce", | ||
704 | "nr_vmscan_write", | ||
705 | "nr_writeback_temp", | ||
706 | "nr_isolated_anon", | ||
707 | "nr_isolated_file", | ||
708 | "nr_shmem", | ||
709 | "nr_dirtied", | ||
710 | "nr_written", | ||
711 | |||
712 | #ifdef CONFIG_NUMA | ||
713 | "numa_hit", | ||
714 | "numa_miss", | ||
715 | "numa_foreign", | ||
716 | "numa_interleave", | ||
717 | "numa_local", | ||
718 | "numa_other", | ||
719 | #endif | ||
720 | "nr_anon_transparent_hugepages", | ||
721 | "nr_dirty_threshold", | ||
722 | "nr_dirty_background_threshold", | ||
723 | |||
724 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
725 | "pgpgin", | ||
726 | "pgpgout", | ||
727 | "pswpin", | ||
728 | "pswpout", | ||
729 | |||
730 | TEXTS_FOR_ZONES("pgalloc") | ||
731 | |||
732 | "pgfree", | ||
733 | "pgactivate", | ||
734 | "pgdeactivate", | ||
735 | |||
736 | "pgfault", | ||
737 | "pgmajfault", | ||
738 | |||
739 | TEXTS_FOR_ZONES("pgrefill") | ||
740 | TEXTS_FOR_ZONES("pgsteal") | ||
741 | TEXTS_FOR_ZONES("pgscan_kswapd") | ||
742 | TEXTS_FOR_ZONES("pgscan_direct") | ||
743 | |||
744 | #ifdef CONFIG_NUMA | ||
745 | "zone_reclaim_failed", | ||
746 | #endif | ||
747 | "pginodesteal", | ||
748 | "slabs_scanned", | ||
749 | "kswapd_steal", | ||
750 | "kswapd_inodesteal", | ||
751 | "kswapd_low_wmark_hit_quickly", | ||
752 | "kswapd_high_wmark_hit_quickly", | ||
753 | "kswapd_skip_congestion_wait", | ||
754 | "pageoutrun", | ||
755 | "allocstall", | ||
756 | |||
757 | "pgrotated", | ||
758 | |||
759 | #ifdef CONFIG_COMPACTION | ||
760 | "compact_blocks_moved", | ||
761 | "compact_pages_moved", | ||
762 | "compact_pagemigrate_failed", | ||
763 | "compact_stall", | ||
764 | "compact_fail", | ||
765 | "compact_success", | ||
766 | #endif | ||
767 | |||
768 | #ifdef CONFIG_HUGETLB_PAGE | ||
769 | "htlb_buddy_alloc_success", | ||
770 | "htlb_buddy_alloc_fail", | ||
771 | #endif | ||
772 | "unevictable_pgs_culled", | ||
773 | "unevictable_pgs_scanned", | ||
774 | "unevictable_pgs_rescued", | ||
775 | "unevictable_pgs_mlocked", | ||
776 | "unevictable_pgs_munlocked", | ||
777 | "unevictable_pgs_cleared", | ||
778 | "unevictable_pgs_stranded", | ||
779 | "unevictable_pgs_mlockfreed", | ||
780 | |||
781 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
782 | "thp_fault_alloc", | ||
783 | "thp_fault_fallback", | ||
784 | "thp_collapse_alloc", | ||
785 | "thp_collapse_alloc_failed", | ||
786 | "thp_split", | ||
787 | #endif | ||
788 | |||
789 | #endif /* CONFIG_VM_EVENT_COUNTERS */ | ||
790 | }; | ||
791 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ | ||
792 | |||
793 | |||
662 | #ifdef CONFIG_PROC_FS | 794 | #ifdef CONFIG_PROC_FS |
663 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | 795 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, |
664 | struct zone *zone) | 796 | struct zone *zone) |
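
The hunk above moves vmstat_text[] ahead of the CONFIG_PROC_FS-only block, guards it with CONFIG_PROC_FS || CONFIG_SYSFS, and drops the static qualifier, so the counter names can be shared with code outside the /proc handlers. A sketch of such a consumer, assuming the array is declared extern in a shared header (that declaration is not part of this hunk):

	/* Hypothetical consumer of the now-global name table: dump the zoned
	 * VM counters by name. Relies on the layout visible above, where the
	 * first NR_VM_ZONE_STAT_ITEMS entries are the zoned counters. */
	extern const char * const vmstat_text[];	/* assumed shared declaration */

	static void dump_zone_counters(void)
	{
		int i;

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			pr_info("%s: %lu\n", vmstat_text[i],
				global_page_state(i));
	}
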
@@ -831,135 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = { | |||
831 | .release = seq_release, | 963 | .release = seq_release, |
832 | }; | 964 | }; |
833 | 965 | ||
834 | #ifdef CONFIG_ZONE_DMA | ||
835 | #define TEXT_FOR_DMA(xx) xx "_dma", | ||
836 | #else | ||
837 | #define TEXT_FOR_DMA(xx) | ||
838 | #endif | ||
839 | |||
840 | #ifdef CONFIG_ZONE_DMA32 | ||
841 | #define TEXT_FOR_DMA32(xx) xx "_dma32", | ||
842 | #else | ||
843 | #define TEXT_FOR_DMA32(xx) | ||
844 | #endif | ||
845 | |||
846 | #ifdef CONFIG_HIGHMEM | ||
847 | #define TEXT_FOR_HIGHMEM(xx) xx "_high", | ||
848 | #else | ||
849 | #define TEXT_FOR_HIGHMEM(xx) | ||
850 | #endif | ||
851 | |||
852 | #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ | ||
853 | TEXT_FOR_HIGHMEM(xx) xx "_movable", | ||
854 | |||
855 | static const char * const vmstat_text[] = { | ||
856 | /* Zoned VM counters */ | ||
857 | "nr_free_pages", | ||
858 | "nr_inactive_anon", | ||
859 | "nr_active_anon", | ||
860 | "nr_inactive_file", | ||
861 | "nr_active_file", | ||
862 | "nr_unevictable", | ||
863 | "nr_mlock", | ||
864 | "nr_anon_pages", | ||
865 | "nr_mapped", | ||
866 | "nr_file_pages", | ||
867 | "nr_dirty", | ||
868 | "nr_writeback", | ||
869 | "nr_slab_reclaimable", | ||
870 | "nr_slab_unreclaimable", | ||
871 | "nr_page_table_pages", | ||
872 | "nr_kernel_stack", | ||
873 | "nr_unstable", | ||
874 | "nr_bounce", | ||
875 | "nr_vmscan_write", | ||
876 | "nr_writeback_temp", | ||
877 | "nr_isolated_anon", | ||
878 | "nr_isolated_file", | ||
879 | "nr_shmem", | ||
880 | "nr_dirtied", | ||
881 | "nr_written", | ||
882 | |||
883 | #ifdef CONFIG_NUMA | ||
884 | "numa_hit", | ||
885 | "numa_miss", | ||
886 | "numa_foreign", | ||
887 | "numa_interleave", | ||
888 | "numa_local", | ||
889 | "numa_other", | ||
890 | #endif | ||
891 | "nr_anon_transparent_hugepages", | ||
892 | "nr_dirty_threshold", | ||
893 | "nr_dirty_background_threshold", | ||
894 | |||
895 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
896 | "pgpgin", | ||
897 | "pgpgout", | ||
898 | "pswpin", | ||
899 | "pswpout", | ||
900 | |||
901 | TEXTS_FOR_ZONES("pgalloc") | ||
902 | |||
903 | "pgfree", | ||
904 | "pgactivate", | ||
905 | "pgdeactivate", | ||
906 | |||
907 | "pgfault", | ||
908 | "pgmajfault", | ||
909 | |||
910 | TEXTS_FOR_ZONES("pgrefill") | ||
911 | TEXTS_FOR_ZONES("pgsteal") | ||
912 | TEXTS_FOR_ZONES("pgscan_kswapd") | ||
913 | TEXTS_FOR_ZONES("pgscan_direct") | ||
914 | |||
915 | #ifdef CONFIG_NUMA | ||
916 | "zone_reclaim_failed", | ||
917 | #endif | ||
918 | "pginodesteal", | ||
919 | "slabs_scanned", | ||
920 | "kswapd_steal", | ||
921 | "kswapd_inodesteal", | ||
922 | "kswapd_low_wmark_hit_quickly", | ||
923 | "kswapd_high_wmark_hit_quickly", | ||
924 | "kswapd_skip_congestion_wait", | ||
925 | "pageoutrun", | ||
926 | "allocstall", | ||
927 | |||
928 | "pgrotated", | ||
929 | |||
930 | #ifdef CONFIG_COMPACTION | ||
931 | "compact_blocks_moved", | ||
932 | "compact_pages_moved", | ||
933 | "compact_pagemigrate_failed", | ||
934 | "compact_stall", | ||
935 | "compact_fail", | ||
936 | "compact_success", | ||
937 | #endif | ||
938 | |||
939 | #ifdef CONFIG_HUGETLB_PAGE | ||
940 | "htlb_buddy_alloc_success", | ||
941 | "htlb_buddy_alloc_fail", | ||
942 | #endif | ||
943 | "unevictable_pgs_culled", | ||
944 | "unevictable_pgs_scanned", | ||
945 | "unevictable_pgs_rescued", | ||
946 | "unevictable_pgs_mlocked", | ||
947 | "unevictable_pgs_munlocked", | ||
948 | "unevictable_pgs_cleared", | ||
949 | "unevictable_pgs_stranded", | ||
950 | "unevictable_pgs_mlockfreed", | ||
951 | |||
952 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
953 | "thp_fault_alloc", | ||
954 | "thp_fault_fallback", | ||
955 | "thp_collapse_alloc", | ||
956 | "thp_collapse_alloc_failed", | ||
957 | "thp_split", | ||
958 | #endif | ||
959 | |||
960 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | ||
961 | }; | ||
962 | |||
963 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 966 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
964 | struct zone *zone) | 967 | struct zone *zone) |
965 | { | 968 | { |
@@ -1198,7 +1201,6 @@ static int __init setup_vmstat(void) | |||
1198 | #ifdef CONFIG_SMP | 1201 | #ifdef CONFIG_SMP |
1199 | int cpu; | 1202 | int cpu; |
1200 | 1203 | ||
1201 | refresh_zone_stat_thresholds(); | ||
1202 | register_cpu_notifier(&vmstat_notifier); | 1204 | register_cpu_notifier(&vmstat_notifier); |
1203 | 1205 | ||
1204 | for_each_online_cpu(cpu) | 1206 | for_each_online_cpu(cpu) |
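
Together with the earlier hunk that removes the static qualifier from refresh_zone_stat_thresholds(), dropping this call from setup_vmstat() implies the thresholds are now recomputed from outside vmstat.c, presumably when zone sizes change (the actual call site is not part of this section). A sketch of such a caller, with the declaration location and the hook name being assumptions:

	/* Assumed external declaration, e.g. in <linux/vmstat.h>. */
	extern void refresh_zone_stat_thresholds(void);

	/* Hypothetical hotplug-side hook: per-cpu stat thresholds depend on
	 * zone size and the number of online CPUs, so recompute them once
	 * newly added memory has been onlined. */
	static void example_after_memory_online(void)
	{
		refresh_zone_stat_thresholds();
	}
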