diff options
author | H. Peter Anvin <hpa@linux.intel.com> | 2013-01-29 17:59:09 -0500 |
---|---|---|
committer | H. Peter Anvin <hpa@linux.intel.com> | 2013-01-29 18:10:15 -0500 |
commit | de65d816aa44f9ddd79861ae21d75010cc1fd003 (patch) | |
tree | 04a637a43b2e52a733d0dcb7595a47057571e7da /mm | |
parent | 9710f581bb4c35589ac046b0cfc0deb7f369fc85 (diff) | |
parent | 5dcd14ecd41ea2b3ae3295a9b30d98769d52165f (diff) |
Merge remote-tracking branch 'origin/x86/boot' into x86/mm2
Coming patches to x86/mm2 require the changes and advanced baseline in
x86/boot.
Resolved Conflicts:
arch/x86/kernel/setup.c
mm/nobootmem.c
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 34 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/balloon_compaction.c | 302 | ||||
-rw-r--r-- | mm/bootmem.c | 103 | ||||
-rw-r--r-- | mm/compaction.c | 166 | ||||
-rw-r--r-- | mm/dmapool.c | 55 | ||||
-rw-r--r-- | mm/highmem.c | 30 | ||||
-rw-r--r-- | mm/huge_memory.c | 658 | ||||
-rw-r--r-- | mm/hugetlb.c | 63 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 42 | ||||
-rw-r--r-- | mm/internal.h | 13 | ||||
-rw-r--r-- | mm/kmemleak.c | 3 | ||||
-rw-r--r-- | mm/ksm.c | 37 | ||||
-rw-r--r-- | mm/memblock.c | 3 | ||||
-rw-r--r-- | mm/memcontrol.c | 1483 | ||||
-rw-r--r-- | mm/memory-failure.c | 43 | ||||
-rw-r--r-- | mm/memory.c | 251 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 430 | ||||
-rw-r--r-- | mm/mempolicy.c | 470 | ||||
-rw-r--r-- | mm/migrate.c | 450 | ||||
-rw-r--r-- | mm/mmap.c | 569 | ||||
-rw-r--r-- | mm/mprotect.c | 151 | ||||
-rw-r--r-- | mm/mremap.c | 4 | ||||
-rw-r--r-- | mm/nobootmem.c | 21 | ||||
-rw-r--r-- | mm/nommu.c | 15 | ||||
-rw-r--r-- | mm/oom_kill.c | 138 | ||||
-rw-r--r-- | mm/page-writeback.c | 36 | ||||
-rw-r--r-- | mm/page_alloc.c | 421 | ||||
-rw-r--r-- | mm/page_cgroup.c | 5 | ||||
-rw-r--r-- | mm/page_isolation.c | 53 | ||||
-rw-r--r-- | mm/pagewalk.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 5 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 9 | ||||
-rw-r--r-- | mm/rmap.c | 134 | ||||
-rw-r--r-- | mm/shmem.c | 122 | ||||
-rw-r--r-- | mm/slab.c | 383 | ||||
-rw-r--r-- | mm/slab.h | 190 | ||||
-rw-r--r-- | mm/slab_common.c | 292 | ||||
-rw-r--r-- | mm/slob.c | 48 | ||||
-rw-r--r-- | mm/slub.c | 451 | ||||
-rw-r--r-- | mm/sparse.c | 35 | ||||
-rw-r--r-- | mm/swapfile.c | 31 | ||||
-rw-r--r-- | mm/truncate.c | 23 | ||||
-rw-r--r-- | mm/util.c | 2 | ||||
-rw-r--r-- | mm/vmalloc.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 242 | ||||
-rw-r--r-- | mm/vmstat.c | 28 |
47 files changed, 5954 insertions, 2099 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a3f8dddaaab3..278e3ab1f169 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -143,6 +143,25 @@ config NO_BOOTMEM | |||
143 | config MEMORY_ISOLATION | 143 | config MEMORY_ISOLATION |
144 | boolean | 144 | boolean |
145 | 145 | ||
146 | config MOVABLE_NODE | ||
147 | boolean "Enable to assign a node which has only movable memory" | ||
148 | depends on HAVE_MEMBLOCK | ||
149 | depends on NO_BOOTMEM | ||
150 | depends on X86_64 | ||
151 | depends on NUMA | ||
152 | default n | ||
153 | help | ||
154 | Allow a node to have only movable memory. Pages used by the kernel, | ||
155 | such as direct mapping pages cannot be migrated. So the corresponding | ||
156 | memory device cannot be hotplugged. This option allows users to | ||
157 | online all the memory of a node as movable memory so that the whole | ||
158 | node can be hotplugged. Users who don't use the memory hotplug | ||
159 | feature are fine with this option on since they don't online memory | ||
160 | as movable. | ||
161 | |||
162 | Say Y here if you want to hotplug a whole node. | ||
163 | Say N here if you want kernel to use memory on all nodes evenly. | ||
164 | |||
146 | # eventually, we can have this option just 'select SPARSEMEM' | 165 | # eventually, we can have this option just 'select SPARSEMEM' |
147 | config MEMORY_HOTPLUG | 166 | config MEMORY_HOTPLUG |
148 | bool "Allow for memory hot-add" | 167 | bool "Allow for memory hot-add" |
@@ -188,6 +207,21 @@ config SPLIT_PTLOCK_CPUS | |||
188 | default "4" | 207 | default "4" |
189 | 208 | ||
190 | # | 209 | # |
210 | # support for memory balloon compaction | ||
211 | config BALLOON_COMPACTION | ||
212 | bool "Allow for balloon memory compaction/migration" | ||
213 | def_bool y | ||
214 | depends on COMPACTION && VIRTIO_BALLOON | ||
215 | help | ||
216 | Memory fragmentation introduced by ballooning might reduce | ||
217 | significantly the number of 2MB contiguous memory blocks that can be | ||
218 | used within a guest, thus imposing performance penalties associated | ||
219 | with the reduced number of transparent huge pages that could be used | ||
220 | by the guest workload. Allowing the compaction & migration for memory | ||
221 | pages enlisted as being part of memory balloon devices avoids the | ||
222 | scenario aforementioned and helps improve memory defragmentation. | ||
223 | |||
224 | # | ||
191 | # support for memory compaction | 225 | # support for memory compaction |
192 | config COMPACTION | 226 | config COMPACTION |
193 | bool "Allow for memory compaction" | 227 | bool "Allow for memory compaction" |
diff --git a/mm/Makefile b/mm/Makefile index 6b025f80af34..3a4628751f89 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o interval_tree.o $(mmu-y) | 19 | compaction.o balloon_compaction.o \ |
20 | interval_tree.o $(mmu-y) | ||
20 | 21 | ||
21 | obj-y += init-mm.o | 22 | obj-y += init-mm.o |
22 | 23 | ||
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c new file mode 100644 index 000000000000..07dbc8ec46cf --- /dev/null +++ b/mm/balloon_compaction.c | |||
@@ -0,0 +1,302 @@ | |||
1 | /* | ||
2 | * mm/balloon_compaction.c | ||
3 | * | ||
4 | * Common interface for making balloon pages movable by compaction. | ||
5 | * | ||
6 | * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com> | ||
7 | */ | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/export.h> | ||
11 | #include <linux/balloon_compaction.h> | ||
12 | |||
13 | /* | ||
14 | * balloon_devinfo_alloc - allocates a balloon device information descriptor. | ||
15 | * @balloon_dev_descriptor: pointer to reference the balloon device which | ||
16 | * this struct balloon_dev_info will be servicing. | ||
17 | * | ||
18 | * Driver must call it to properly allocate and initialize an instance of | ||
19 | * struct balloon_dev_info which will be used to reference a balloon device | ||
20 | * as well as to keep track of the balloon device page list. | ||
21 | */ | ||
22 | struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) | ||
23 | { | ||
24 | struct balloon_dev_info *b_dev_info; | ||
25 | b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); | ||
26 | if (!b_dev_info) | ||
27 | return ERR_PTR(-ENOMEM); | ||
28 | |||
29 | b_dev_info->balloon_device = balloon_dev_descriptor; | ||
30 | b_dev_info->mapping = NULL; | ||
31 | b_dev_info->isolated_pages = 0; | ||
32 | spin_lock_init(&b_dev_info->pages_lock); | ||
33 | INIT_LIST_HEAD(&b_dev_info->pages); | ||
34 | |||
35 | return b_dev_info; | ||
36 | } | ||
37 | EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); | ||
38 | |||
39 | /* | ||
40 | * balloon_page_enqueue - allocates a new page and inserts it into the balloon | ||
41 | * page list. | ||
42 | * @b_dev_info: balloon device descriptor where we will insert a new page to | ||
43 | * | ||
44 | * Driver must call it to properly allocate a new enlisted balloon page | ||
45 | * before definitively removing it from the guest system. | ||
46 | * This function returns the page address for the recently enqueued page or | ||
47 | * NULL in the case we fail to allocate a new page this turn. | ||
48 | */ | ||
49 | struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) | ||
50 | { | ||
51 | unsigned long flags; | ||
52 | struct page *page = alloc_page(balloon_mapping_gfp_mask() | | ||
53 | __GFP_NOMEMALLOC | __GFP_NORETRY); | ||
54 | if (!page) | ||
55 | return NULL; | ||
56 | |||
57 | /* | ||
58 | * Block others from accessing the 'page' when we get around to | ||
59 | * establishing additional references. We should be the only one | ||
60 | * holding a reference to the 'page' at this point. | ||
61 | */ | ||
62 | BUG_ON(!trylock_page(page)); | ||
63 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
64 | balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); | ||
65 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
66 | unlock_page(page); | ||
67 | return page; | ||
68 | } | ||
69 | EXPORT_SYMBOL_GPL(balloon_page_enqueue); | ||
70 | |||
71 | /* | ||
72 | * balloon_page_dequeue - removes a page from balloon's page list and returns | ||
73 | * its address to allow the driver to release the page. | ||
74 | * @b_dev_info: balloon device descriptor where we will grab a page from. | ||
75 | * | ||
76 | * Driver must call it to properly de-allocate a previous enlisted balloon page | ||
77 | * before definitively releasing it back to the guest system. | ||
78 | * This function returns the page address for the recently dequeued page or | ||
79 | * NULL in the case we find balloon's page list temporarily empty due to | ||
80 | * compaction isolated pages. | ||
81 | */ | ||
82 | struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | ||
83 | { | ||
84 | struct page *page, *tmp; | ||
85 | unsigned long flags; | ||
86 | bool dequeued_page; | ||
87 | |||
88 | dequeued_page = false; | ||
89 | list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { | ||
90 | /* | ||
91 | * Block others from accessing the 'page' while we get around | ||
92 | * establishing additional references and preparing the 'page' | ||
93 | * to be released by the balloon driver. | ||
94 | */ | ||
95 | if (trylock_page(page)) { | ||
96 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
97 | /* | ||
98 | * Raise the page refcount here to prevent any wrong | ||
99 | * attempt to isolate this page, in case of colliding | ||
100 | * with balloon_page_isolate() just after we release | ||
101 | * the page lock. | ||
102 | * | ||
103 | * balloon_page_free() will take care of dropping | ||
104 | * this extra refcount later. | ||
105 | */ | ||
106 | get_page(page); | ||
107 | balloon_page_delete(page); | ||
108 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
109 | unlock_page(page); | ||
110 | dequeued_page = true; | ||
111 | break; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | if (!dequeued_page) { | ||
116 | /* | ||
117 | * If we are unable to dequeue a balloon page because the page | ||
118 | * list is empty and there are no isolated pages, then something | ||
119 | * went out of track and some balloon pages are lost. | ||
120 | * BUG() here, otherwise the balloon driver may get stuck into | ||
121 | * an infinite loop while attempting to release all its pages. | ||
122 | */ | ||
123 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
124 | if (unlikely(list_empty(&b_dev_info->pages) && | ||
125 | !b_dev_info->isolated_pages)) | ||
126 | BUG(); | ||
127 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
128 | page = NULL; | ||
129 | } | ||
130 | return page; | ||
131 | } | ||
132 | EXPORT_SYMBOL_GPL(balloon_page_dequeue); | ||
133 | |||
134 | #ifdef CONFIG_BALLOON_COMPACTION | ||
135 | /* | ||
136 | * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. | ||
137 | * @b_dev_info: holds the balloon device information descriptor. | ||
138 | * @a_ops: balloon_mapping address_space_operations descriptor. | ||
139 | * | ||
140 | * Driver must call it to properly allocate and initialize an instance of | ||
141 | * struct address_space which will be used as the special page->mapping for | ||
142 | * balloon device enlisted page instances. | ||
143 | */ | ||
144 | struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, | ||
145 | const struct address_space_operations *a_ops) | ||
146 | { | ||
147 | struct address_space *mapping; | ||
148 | |||
149 | mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); | ||
150 | if (!mapping) | ||
151 | return ERR_PTR(-ENOMEM); | ||
152 | |||
153 | /* | ||
154 | * Give a clean 'zeroed' status to all elements of this special | ||
155 | * balloon page->mapping struct address_space instance. | ||
156 | */ | ||
157 | address_space_init_once(mapping); | ||
158 | |||
159 | /* | ||
160 | * Set mapping->flags appropriately, to allow balloon pages | ||
161 | * ->mapping identification. | ||
162 | */ | ||
163 | mapping_set_balloon(mapping); | ||
164 | mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); | ||
165 | |||
166 | /* balloon's page->mapping->a_ops callback descriptor */ | ||
167 | mapping->a_ops = a_ops; | ||
168 | |||
169 | /* | ||
170 | * Establish a pointer reference back to the balloon device descriptor | ||
171 | * this particular page->mapping will be servicing. | ||
172 | * This is used by compaction / migration procedures to identify and | ||
173 | * access the balloon device pageset while isolating / migrating pages. | ||
174 | * | ||
175 | * As some balloon drivers can register multiple balloon devices | ||
176 | * for a single guest, this also helps compaction / migration to | ||
177 | * properly deal with multiple balloon pagesets, when required. | ||
178 | */ | ||
179 | mapping->private_data = b_dev_info; | ||
180 | b_dev_info->mapping = mapping; | ||
181 | |||
182 | return mapping; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(balloon_mapping_alloc); | ||
185 | |||
186 | static inline void __isolate_balloon_page(struct page *page) | ||
187 | { | ||
188 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | ||
189 | unsigned long flags; | ||
190 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
191 | list_del(&page->lru); | ||
192 | b_dev_info->isolated_pages++; | ||
193 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
194 | } | ||
195 | |||
196 | static inline void __putback_balloon_page(struct page *page) | ||
197 | { | ||
198 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | ||
199 | unsigned long flags; | ||
200 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
201 | list_add(&page->lru, &b_dev_info->pages); | ||
202 | b_dev_info->isolated_pages--; | ||
203 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
204 | } | ||
205 | |||
206 | static inline int __migrate_balloon_page(struct address_space *mapping, | ||
207 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
208 | { | ||
209 | return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); | ||
210 | } | ||
211 | |||
212 | /* __isolate_lru_page() counterpart for a ballooned page */ | ||
213 | bool balloon_page_isolate(struct page *page) | ||
214 | { | ||
215 | /* | ||
216 | * Avoid burning cycles with pages that are yet under __free_pages(), | ||
217 | * or just got freed under us. | ||
218 | * | ||
219 | * In case we 'win' a race for a balloon page being freed under us and | ||
220 | * raise its refcount preventing __free_pages() from doing its job | ||
221 | * the put_page() at the end of this block will take care of | ||
222 | * release this page, thus avoiding a nasty leakage. | ||
223 | */ | ||
224 | if (likely(get_page_unless_zero(page))) { | ||
225 | /* | ||
226 | * As balloon pages are not isolated from LRU lists, concurrent | ||
227 | * compaction threads can race against page migration functions | ||
228 | * as well as race against the balloon driver releasing a page. | ||
229 | * | ||
230 | * In order to avoid having an already isolated balloon page | ||
231 | * being (wrongly) re-isolated while it is under migration, | ||
232 | * or to avoid attempting to isolate pages being released by | ||
233 | * the balloon driver, lets be sure we have the page lock | ||
234 | * before proceeding with the balloon page isolation steps. | ||
235 | */ | ||
236 | if (likely(trylock_page(page))) { | ||
237 | /* | ||
238 | * A ballooned page, by default, has just one refcount. | ||
239 | * Prevent concurrent compaction threads from isolating | ||
240 | * an already isolated balloon page by refcount check. | ||
241 | */ | ||
242 | if (__is_movable_balloon_page(page) && | ||
243 | page_count(page) == 2) { | ||
244 | __isolate_balloon_page(page); | ||
245 | unlock_page(page); | ||
246 | return true; | ||
247 | } | ||
248 | unlock_page(page); | ||
249 | } | ||
250 | put_page(page); | ||
251 | } | ||
252 | return false; | ||
253 | } | ||
254 | |||
255 | /* putback_lru_page() counterpart for a ballooned page */ | ||
256 | void balloon_page_putback(struct page *page) | ||
257 | { | ||
258 | /* | ||
259 | * 'lock_page()' stabilizes the page and prevents races against | ||
260 | * concurrent isolation threads attempting to re-isolate it. | ||
261 | */ | ||
262 | lock_page(page); | ||
263 | |||
264 | if (__is_movable_balloon_page(page)) { | ||
265 | __putback_balloon_page(page); | ||
266 | /* drop the extra ref count taken for page isolation */ | ||
267 | put_page(page); | ||
268 | } else { | ||
269 | WARN_ON(1); | ||
270 | dump_page(page); | ||
271 | } | ||
272 | unlock_page(page); | ||
273 | } | ||
274 | |||
275 | /* move_to_new_page() counterpart for a ballooned page */ | ||
276 | int balloon_page_migrate(struct page *newpage, | ||
277 | struct page *page, enum migrate_mode mode) | ||
278 | { | ||
279 | struct address_space *mapping; | ||
280 | int rc = -EAGAIN; | ||
281 | |||
282 | /* | ||
283 | * Block others from accessing the 'newpage' when we get around to | ||
284 | * establishing additional references. We should be the only one | ||
285 | * holding a reference to the 'newpage' at this point. | ||
286 | */ | ||
287 | BUG_ON(!trylock_page(newpage)); | ||
288 | |||
289 | if (WARN_ON(!__is_movable_balloon_page(page))) { | ||
290 | dump_page(page); | ||
291 | unlock_page(newpage); | ||
292 | return rc; | ||
293 | } | ||
294 | |||
295 | mapping = page->mapping; | ||
296 | if (mapping) | ||
297 | rc = __migrate_balloon_page(mapping, newpage, page, mode); | ||
298 | |||
299 | unlock_page(newpage); | ||
300 | return rc; | ||
301 | } | ||
302 | #endif /* CONFIG_BALLOON_COMPACTION */ | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185b3b28..b93376c39b61 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
147 | 147 | ||
148 | /* | 148 | /* |
149 | * free_bootmem_late - free bootmem pages directly to page allocator | 149 | * free_bootmem_late - free bootmem pages directly to page allocator |
150 | * @addr: starting address of the range | 150 | * @addr: starting physical address of the range |
151 | * @size: size of the range in bytes | 151 | * @size: size of the range in bytes |
152 | * | 152 | * |
153 | * This is only useful when the bootmem allocator has already been torn | 153 | * This is only useful when the bootmem allocator has already been torn |
154 | * down, but we are still initializing the system. Pages are given directly | 154 | * down, but we are still initializing the system. Pages are given directly |
155 | * to the page allocator, no bootmem metadata is updated because it is gone. | 155 | * to the page allocator, no bootmem metadata is updated because it is gone. |
156 | */ | 156 | */ |
157 | void __init free_bootmem_late(unsigned long addr, unsigned long size) | 157 | void __init free_bootmem_late(unsigned long physaddr, unsigned long size) |
158 | { | 158 | { |
159 | unsigned long cursor, end; | 159 | unsigned long cursor, end; |
160 | 160 | ||
161 | kmemleak_free_part(__va(addr), size); | 161 | kmemleak_free_part(__va(physaddr), size); |
162 | 162 | ||
163 | cursor = PFN_UP(addr); | 163 | cursor = PFN_UP(physaddr); |
164 | end = PFN_DOWN(addr + size); | 164 | end = PFN_DOWN(physaddr + size); |
165 | 165 | ||
166 | for (; cursor < end; cursor++) { | 166 | for (; cursor < end; cursor++) { |
167 | __free_pages_bootmem(pfn_to_page(cursor), 0); | 167 | __free_pages_bootmem(pfn_to_page(cursor), 0); |
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
185 | 185 | ||
186 | while (start < end) { | 186 | while (start < end) { |
187 | unsigned long *map, idx, vec; | 187 | unsigned long *map, idx, vec; |
188 | unsigned shift; | ||
188 | 189 | ||
189 | map = bdata->node_bootmem_map; | 190 | map = bdata->node_bootmem_map; |
190 | idx = start - bdata->node_min_pfn; | 191 | idx = start - bdata->node_min_pfn; |
192 | shift = idx & (BITS_PER_LONG - 1); | ||
193 | /* | ||
194 | * vec holds at most BITS_PER_LONG map bits, | ||
195 | * bit 0 corresponds to start. | ||
196 | */ | ||
191 | vec = ~map[idx / BITS_PER_LONG]; | 197 | vec = ~map[idx / BITS_PER_LONG]; |
198 | |||
199 | if (shift) { | ||
200 | vec >>= shift; | ||
201 | if (end - start >= BITS_PER_LONG) | ||
202 | vec |= ~map[idx / BITS_PER_LONG + 1] << | ||
203 | (BITS_PER_LONG - shift); | ||
204 | } | ||
192 | /* | 205 | /* |
193 | * If we have a properly aligned and fully unreserved | 206 | * If we have a properly aligned and fully unreserved |
194 | * BITS_PER_LONG block of pages in front of us, free | 207 | * BITS_PER_LONG block of pages in front of us, free |
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
201 | count += BITS_PER_LONG; | 214 | count += BITS_PER_LONG; |
202 | start += BITS_PER_LONG; | 215 | start += BITS_PER_LONG; |
203 | } else { | 216 | } else { |
204 | unsigned long off = 0; | 217 | unsigned long cur = start; |
205 | 218 | ||
206 | vec >>= start & (BITS_PER_LONG - 1); | 219 | start = ALIGN(start + 1, BITS_PER_LONG); |
207 | while (vec) { | 220 | while (vec && cur != start) { |
208 | if (vec & 1) { | 221 | if (vec & 1) { |
209 | page = pfn_to_page(start + off); | 222 | page = pfn_to_page(cur); |
210 | __free_pages_bootmem(page, 0); | 223 | __free_pages_bootmem(page, 0); |
211 | count++; | 224 | count++; |
212 | } | 225 | } |
213 | vec >>= 1; | 226 | vec >>= 1; |
214 | off++; | 227 | ++cur; |
215 | } | 228 | } |
216 | start = ALIGN(start + 1, BITS_PER_LONG); | ||
217 | } | 229 | } |
218 | } | 230 | } |
219 | 231 | ||
@@ -229,6 +241,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
229 | return count; | 241 | return count; |
230 | } | 242 | } |
231 | 243 | ||
244 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | ||
245 | { | ||
246 | struct zone *z; | ||
247 | |||
248 | /* | ||
249 | * In free_area_init_core(), highmem zone's managed_pages is set to | ||
250 | * present_pages, and bootmem allocator doesn't allocate from highmem | ||
251 | * zones. So there's no need to recalculate managed_pages because all | ||
252 | * highmem pages will be managed by the buddy system. Here highmem | ||
253 | * zone also includes highmem movable zone. | ||
254 | */ | ||
255 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | ||
256 | if (!is_highmem(z)) | ||
257 | z->managed_pages = 0; | ||
258 | } | ||
259 | |||
232 | /** | 260 | /** |
233 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 261 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
234 | * @pgdat: node to be released | 262 | * @pgdat: node to be released |
@@ -238,6 +266,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
238 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 266 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
239 | { | 267 | { |
240 | register_page_bootmem_info_node(pgdat); | 268 | register_page_bootmem_info_node(pgdat); |
269 | reset_node_lowmem_managed_pages(pgdat); | ||
241 | return free_all_bootmem_core(pgdat->bdata); | 270 | return free_all_bootmem_core(pgdat->bdata); |
242 | } | 271 | } |
243 | 272 | ||
@@ -250,6 +279,10 @@ unsigned long __init free_all_bootmem(void) | |||
250 | { | 279 | { |
251 | unsigned long total_pages = 0; | 280 | unsigned long total_pages = 0; |
252 | bootmem_data_t *bdata; | 281 | bootmem_data_t *bdata; |
282 | struct pglist_data *pgdat; | ||
283 | |||
284 | for_each_online_pgdat(pgdat) | ||
285 | reset_node_lowmem_managed_pages(pgdat); | ||
253 | 286 | ||
254 | list_for_each_entry(bdata, &bdata_list, list) | 287 | list_for_each_entry(bdata, &bdata_list, list) |
255 | total_pages += free_all_bootmem_core(bdata); | 288 | total_pages += free_all_bootmem_core(bdata); |
@@ -377,21 +410,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
377 | 410 | ||
378 | /** | 411 | /** |
379 | * free_bootmem - mark a page range as usable | 412 | * free_bootmem - mark a page range as usable |
380 | * @addr: starting address of the range | 413 | * @addr: starting physical address of the range |
381 | * @size: size of the range in bytes | 414 | * @size: size of the range in bytes |
382 | * | 415 | * |
383 | * Partial pages will be considered reserved and left as they are. | 416 | * Partial pages will be considered reserved and left as they are. |
384 | * | 417 | * |
385 | * The range must be contiguous but may span node boundaries. | 418 | * The range must be contiguous but may span node boundaries. |
386 | */ | 419 | */ |
387 | void __init free_bootmem(unsigned long addr, unsigned long size) | 420 | void __init free_bootmem(unsigned long physaddr, unsigned long size) |
388 | { | 421 | { |
389 | unsigned long start, end; | 422 | unsigned long start, end; |
390 | 423 | ||
391 | kmemleak_free_part(__va(addr), size); | 424 | kmemleak_free_part(__va(physaddr), size); |
392 | 425 | ||
393 | start = PFN_UP(addr); | 426 | start = PFN_UP(physaddr); |
394 | end = PFN_DOWN(addr + size); | 427 | end = PFN_DOWN(physaddr + size); |
395 | 428 | ||
396 | mark_bootmem(start, end, 0, 0); | 429 | mark_bootmem(start, end, 0, 0); |
397 | } | 430 | } |
@@ -439,12 +472,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, | |||
439 | return mark_bootmem(start, end, 1, flags); | 472 | return mark_bootmem(start, end, 1, flags); |
440 | } | 473 | } |
441 | 474 | ||
442 | int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
443 | int flags) | ||
444 | { | ||
445 | return reserve_bootmem(phys, len, flags); | ||
446 | } | ||
447 | |||
448 | static unsigned long __init align_idx(struct bootmem_data *bdata, | 475 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
449 | unsigned long idx, unsigned long step) | 476 | unsigned long idx, unsigned long step) |
450 | { | 477 | { |
@@ -575,27 +602,6 @@ find_block: | |||
575 | return NULL; | 602 | return NULL; |
576 | } | 603 | } |
577 | 604 | ||
578 | static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | ||
579 | unsigned long size, unsigned long align, | ||
580 | unsigned long goal, unsigned long limit) | ||
581 | { | ||
582 | if (WARN_ON_ONCE(slab_is_available())) | ||
583 | return kzalloc(size, GFP_NOWAIT); | ||
584 | |||
585 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM | ||
586 | { | ||
587 | bootmem_data_t *p_bdata; | ||
588 | |||
589 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | ||
590 | goal, limit); | ||
591 | if (p_bdata) | ||
592 | return alloc_bootmem_bdata(p_bdata, size, align, | ||
593 | goal, limit); | ||
594 | } | ||
595 | #endif | ||
596 | return NULL; | ||
597 | } | ||
598 | |||
599 | static void * __init alloc_bootmem_core(unsigned long size, | 605 | static void * __init alloc_bootmem_core(unsigned long size, |
600 | unsigned long align, | 606 | unsigned long align, |
601 | unsigned long goal, | 607 | unsigned long goal, |
@@ -604,9 +610,8 @@ static void * __init alloc_bootmem_core(unsigned long size, | |||
604 | bootmem_data_t *bdata; | 610 | bootmem_data_t *bdata; |
605 | void *region; | 611 | void *region; |
606 | 612 | ||
607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); | 613 | if (WARN_ON_ONCE(slab_is_available())) |
608 | if (region) | 614 | return kzalloc(size, GFP_NOWAIT); |
609 | return region; | ||
610 | 615 | ||
611 | list_for_each_entry(bdata, &bdata_list, list) { | 616 | list_for_each_entry(bdata, &bdata_list, list) { |
612 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) | 617 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) |
@@ -704,11 +709,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | |||
704 | { | 709 | { |
705 | void *ptr; | 710 | void *ptr; |
706 | 711 | ||
712 | if (WARN_ON_ONCE(slab_is_available())) | ||
713 | return kzalloc(size, GFP_NOWAIT); | ||
707 | again: | 714 | again: |
708 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, | ||
709 | align, goal, limit); | ||
710 | if (ptr) | ||
711 | return ptr; | ||
712 | 715 | ||
713 | /* do not panic in alloc_bootmem_bdata() */ | 716 | /* do not panic in alloc_bootmem_bdata() */ |
714 | if (limit && goal + size > limit) | 717 | if (limit && goal + size > limit) |
diff --git a/mm/compaction.c b/mm/compaction.c index 9eef55838fca..c62bd063d766 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -14,8 +14,24 @@ | |||
14 | #include <linux/backing-dev.h> | 14 | #include <linux/backing-dev.h> |
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include <linux/balloon_compaction.h> | ||
17 | #include "internal.h" | 18 | #include "internal.h" |
18 | 19 | ||
20 | #ifdef CONFIG_COMPACTION | ||
21 | static inline void count_compact_event(enum vm_event_item item) | ||
22 | { | ||
23 | count_vm_event(item); | ||
24 | } | ||
25 | |||
26 | static inline void count_compact_events(enum vm_event_item item, long delta) | ||
27 | { | ||
28 | count_vm_events(item, delta); | ||
29 | } | ||
30 | #else | ||
31 | #define count_compact_event(item) do { } while (0) | ||
32 | #define count_compact_events(item, delta) do { } while (0) | ||
33 | #endif | ||
34 | |||
19 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 35 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
20 | 36 | ||
21 | #define CREATE_TRACE_POINTS | 37 | #define CREATE_TRACE_POINTS |
@@ -214,60 +230,6 @@ static bool suitable_migration_target(struct page *page) | |||
214 | return false; | 230 | return false; |
215 | } | 231 | } |
216 | 232 | ||
217 | static void compact_capture_page(struct compact_control *cc) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | int mtype, mtype_low, mtype_high; | ||
221 | |||
222 | if (!cc->page || *cc->page) | ||
223 | return; | ||
224 | |||
225 | /* | ||
226 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
227 | * regardless of the migratetype of the freelist it is captured from. | ||
228 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
229 | * allocation is typically at least a pageblock size and overall | ||
230 | * fragmentation is not impaired. Other allocation types must | ||
231 | * capture pages from their own migratelist because otherwise they | ||
232 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
233 | * difficult to move pages and making fragmentation worse overall. | ||
234 | */ | ||
235 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
236 | mtype_low = 0; | ||
237 | mtype_high = MIGRATE_PCPTYPES; | ||
238 | } else { | ||
239 | mtype_low = cc->migratetype; | ||
240 | mtype_high = cc->migratetype + 1; | ||
241 | } | ||
242 | |||
243 | /* Speculatively examine the free lists without zone lock */ | ||
244 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
245 | int order; | ||
246 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
247 | struct page *page; | ||
248 | struct free_area *area; | ||
249 | area = &(cc->zone->free_area[order]); | ||
250 | if (list_empty(&area->free_list[mtype])) | ||
251 | continue; | ||
252 | |||
253 | /* Take the lock and attempt capture of the page */ | ||
254 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
255 | return; | ||
256 | if (!list_empty(&area->free_list[mtype])) { | ||
257 | page = list_entry(area->free_list[mtype].next, | ||
258 | struct page, lru); | ||
259 | if (capture_free_page(page, cc->order, mtype)) { | ||
260 | spin_unlock_irqrestore(&cc->zone->lock, | ||
261 | flags); | ||
262 | *cc->page = page; | ||
263 | return; | ||
264 | } | ||
265 | } | ||
266 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
267 | } | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* | 233 | /* |
272 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 234 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
273 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 235 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
@@ -356,6 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
356 | if (blockpfn == end_pfn) | 318 | if (blockpfn == end_pfn) |
357 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 319 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
358 | 320 | ||
321 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); | ||
322 | if (total_isolated) | ||
323 | count_compact_events(COMPACTISOLATED, total_isolated); | ||
359 | return total_isolated; | 324 | return total_isolated; |
360 | } | 325 | } |
361 | 326 | ||
@@ -565,9 +530,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
565 | goto next_pageblock; | 530 | goto next_pageblock; |
566 | } | 531 | } |
567 | 532 | ||
568 | /* Check may be lockless but that's ok as we recheck later */ | 533 | /* |
569 | if (!PageLRU(page)) | 534 | * Check may be lockless but that's ok as we recheck later. |
535 | * It's possible to migrate LRU pages and balloon pages | ||
536 | * Skip any other type of page | ||
537 | */ | ||
538 | if (!PageLRU(page)) { | ||
539 | if (unlikely(balloon_page_movable(page))) { | ||
540 | if (locked && balloon_page_isolate(page)) { | ||
541 | /* Successfully isolated */ | ||
542 | cc->finished_update_migrate = true; | ||
543 | list_add(&page->lru, migratelist); | ||
544 | cc->nr_migratepages++; | ||
545 | nr_isolated++; | ||
546 | goto check_compact_cluster; | ||
547 | } | ||
548 | } | ||
570 | continue; | 549 | continue; |
550 | } | ||
571 | 551 | ||
572 | /* | 552 | /* |
573 | * PageLRU is set. lru_lock normally excludes isolation | 553 | * PageLRU is set. lru_lock normally excludes isolation |
@@ -621,6 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
621 | cc->nr_migratepages++; | 601 | cc->nr_migratepages++; |
622 | nr_isolated++; | 602 | nr_isolated++; |
623 | 603 | ||
604 | check_compact_cluster: | ||
624 | /* Avoid isolating too much */ | 605 | /* Avoid isolating too much */ |
625 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | 606 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
626 | ++low_pfn; | 607 | ++low_pfn; |
@@ -646,6 +627,10 @@ next_pageblock: | |||
646 | 627 | ||
647 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 628 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
648 | 629 | ||
630 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); | ||
631 | if (nr_isolated) | ||
632 | count_compact_events(COMPACTISOLATED, nr_isolated); | ||
633 | |||
649 | return low_pfn; | 634 | return low_pfn; |
650 | } | 635 | } |
651 | 636 | ||
@@ -713,7 +698,15 @@ static void isolate_freepages(struct zone *zone, | |||
713 | 698 | ||
714 | /* Found a block suitable for isolating free pages from */ | 699 | /* Found a block suitable for isolating free pages from */ |
715 | isolated = 0; | 700 | isolated = 0; |
716 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | 701 | |
702 | /* | ||
703 | * As pfn may not start aligned, pfn+pageblock_nr_page | ||
704 | * may cross a MAX_ORDER_NR_PAGES boundary and miss | ||
705 | * a pfn_valid check. Ensure isolate_freepages_block() | ||
706 | * only scans within a pageblock | ||
707 | */ | ||
708 | end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
709 | end_pfn = min(end_pfn, zone_end_pfn); | ||
717 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 710 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
718 | freelist, false); | 711 | freelist, false); |
719 | nr_freepages += isolated; | 712 | nr_freepages += isolated; |
@@ -823,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
823 | static int compact_finished(struct zone *zone, | 816 | static int compact_finished(struct zone *zone, |
824 | struct compact_control *cc) | 817 | struct compact_control *cc) |
825 | { | 818 | { |
819 | unsigned int order; | ||
826 | unsigned long watermark; | 820 | unsigned long watermark; |
827 | 821 | ||
828 | if (fatal_signal_pending(current)) | 822 | if (fatal_signal_pending(current)) |
@@ -857,22 +851,16 @@ static int compact_finished(struct zone *zone, | |||
857 | return COMPACT_CONTINUE; | 851 | return COMPACT_CONTINUE; |
858 | 852 | ||
859 | /* Direct compactor: Is a suitable page free? */ | 853 | /* Direct compactor: Is a suitable page free? */ |
860 | if (cc->page) { | 854 | for (order = cc->order; order < MAX_ORDER; order++) { |
861 | /* Was a suitable page captured? */ | 855 | struct free_area *area = &zone->free_area[order]; |
862 | if (*cc->page) | 856 | |
857 | /* Job done if page is free of the right migratetype */ | ||
858 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
859 | return COMPACT_PARTIAL; | ||
860 | |||
861 | /* Job done if allocation would set block type */ | ||
862 | if (cc->order >= pageblock_order && area->nr_free) | ||
863 | return COMPACT_PARTIAL; | 863 | return COMPACT_PARTIAL; |
864 | } else { | ||
865 | unsigned int order; | ||
866 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
867 | struct free_area *area = &zone->free_area[cc->order]; | ||
868 | /* Job done if page is free of the right migratetype */ | ||
869 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
870 | return COMPACT_PARTIAL; | ||
871 | |||
872 | /* Job done if allocation would set block type */ | ||
873 | if (cc->order >= pageblock_order && area->nr_free) | ||
874 | return COMPACT_PARTIAL; | ||
875 | } | ||
876 | } | 864 | } |
877 | 865 | ||
878 | return COMPACT_CONTINUE; | 866 | return COMPACT_CONTINUE; |
@@ -978,7 +966,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
978 | switch (isolate_migratepages(zone, cc)) { | 966 | switch (isolate_migratepages(zone, cc)) { |
979 | case ISOLATE_ABORT: | 967 | case ISOLATE_ABORT: |
980 | ret = COMPACT_PARTIAL; | 968 | ret = COMPACT_PARTIAL; |
981 | putback_lru_pages(&cc->migratepages); | 969 | putback_movable_pages(&cc->migratepages); |
982 | cc->nr_migratepages = 0; | 970 | cc->nr_migratepages = 0; |
983 | goto out; | 971 | goto out; |
984 | case ISOLATE_NONE: | 972 | case ISOLATE_NONE: |
@@ -990,29 +978,23 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
990 | nr_migrate = cc->nr_migratepages; | 978 | nr_migrate = cc->nr_migratepages; |
991 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 979 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
992 | (unsigned long)cc, false, | 980 | (unsigned long)cc, false, |
993 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 981 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
982 | MR_COMPACTION); | ||
994 | update_nr_listpages(cc); | 983 | update_nr_listpages(cc); |
995 | nr_remaining = cc->nr_migratepages; | 984 | nr_remaining = cc->nr_migratepages; |
996 | 985 | ||
997 | count_vm_event(COMPACTBLOCKS); | ||
998 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | ||
999 | if (nr_remaining) | ||
1000 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | ||
1001 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 986 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
1002 | nr_remaining); | 987 | nr_remaining); |
1003 | 988 | ||
1004 | /* Release LRU pages not migrated */ | 989 | /* Release isolated pages not migrated */ |
1005 | if (err) { | 990 | if (err) { |
1006 | putback_lru_pages(&cc->migratepages); | 991 | putback_movable_pages(&cc->migratepages); |
1007 | cc->nr_migratepages = 0; | 992 | cc->nr_migratepages = 0; |
1008 | if (err == -ENOMEM) { | 993 | if (err == -ENOMEM) { |
1009 | ret = COMPACT_PARTIAL; | 994 | ret = COMPACT_PARTIAL; |
1010 | goto out; | 995 | goto out; |
1011 | } | 996 | } |
1012 | } | 997 | } |
1013 | |||
1014 | /* Capture a page now if it is a suitable size */ | ||
1015 | compact_capture_page(cc); | ||
1016 | } | 998 | } |
1017 | 999 | ||
1018 | out: | 1000 | out: |
@@ -1025,8 +1007,7 @@ out: | |||
1025 | 1007 | ||
1026 | static unsigned long compact_zone_order(struct zone *zone, | 1008 | static unsigned long compact_zone_order(struct zone *zone, |
1027 | int order, gfp_t gfp_mask, | 1009 | int order, gfp_t gfp_mask, |
1028 | bool sync, bool *contended, | 1010 | bool sync, bool *contended) |
1029 | struct page **page) | ||
1030 | { | 1011 | { |
1031 | unsigned long ret; | 1012 | unsigned long ret; |
1032 | struct compact_control cc = { | 1013 | struct compact_control cc = { |
@@ -1036,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
1036 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1017 | .migratetype = allocflags_to_migratetype(gfp_mask), |
1037 | .zone = zone, | 1018 | .zone = zone, |
1038 | .sync = sync, | 1019 | .sync = sync, |
1039 | .page = page, | ||
1040 | }; | 1020 | }; |
1041 | INIT_LIST_HEAD(&cc.freepages); | 1021 | INIT_LIST_HEAD(&cc.freepages); |
1042 | INIT_LIST_HEAD(&cc.migratepages); | 1022 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1066,7 +1046,7 @@ int sysctl_extfrag_threshold = 500; | |||
1066 | */ | 1046 | */ |
1067 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1047 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1068 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1048 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1069 | bool sync, bool *contended, struct page **page) | 1049 | bool sync, bool *contended) |
1070 | { | 1050 | { |
1071 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1051 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1072 | int may_enter_fs = gfp_mask & __GFP_FS; | 1052 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1080,7 +1060,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1080 | if (!order || !may_enter_fs || !may_perform_io) | 1060 | if (!order || !may_enter_fs || !may_perform_io) |
1081 | return rc; | 1061 | return rc; |
1082 | 1062 | ||
1083 | count_vm_event(COMPACTSTALL); | 1063 | count_compact_event(COMPACTSTALL); |
1084 | 1064 | ||
1085 | #ifdef CONFIG_CMA | 1065 | #ifdef CONFIG_CMA |
1086 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 1066 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
@@ -1092,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1092 | int status; | 1072 | int status; |
1093 | 1073 | ||
1094 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1074 | status = compact_zone_order(zone, order, gfp_mask, sync, |
1095 | contended, page); | 1075 | contended); |
1096 | rc = max(status, rc); | 1076 | rc = max(status, rc); |
1097 | 1077 | ||
1098 | /* If a normal allocation would succeed, stop compacting */ | 1078 | /* If a normal allocation would succeed, stop compacting */ |
@@ -1148,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order) | |||
1148 | struct compact_control cc = { | 1128 | struct compact_control cc = { |
1149 | .order = order, | 1129 | .order = order, |
1150 | .sync = false, | 1130 | .sync = false, |
1151 | .page = NULL, | ||
1152 | }; | 1131 | }; |
1153 | 1132 | ||
1154 | return __compact_pgdat(pgdat, &cc); | 1133 | return __compact_pgdat(pgdat, &cc); |
@@ -1159,14 +1138,13 @@ static int compact_node(int nid) | |||
1159 | struct compact_control cc = { | 1138 | struct compact_control cc = { |
1160 | .order = -1, | 1139 | .order = -1, |
1161 | .sync = true, | 1140 | .sync = true, |
1162 | .page = NULL, | ||
1163 | }; | 1141 | }; |
1164 | 1142 | ||
1165 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1143 | return __compact_pgdat(NODE_DATA(nid), &cc); |
1166 | } | 1144 | } |
1167 | 1145 | ||
1168 | /* Compact all nodes in the system */ | 1146 | /* Compact all nodes in the system */ |
1169 | static int compact_nodes(void) | 1147 | static void compact_nodes(void) |
1170 | { | 1148 | { |
1171 | int nid; | 1149 | int nid; |
1172 | 1150 | ||
@@ -1175,8 +1153,6 @@ static int compact_nodes(void) | |||
1175 | 1153 | ||
1176 | for_each_online_node(nid) | 1154 | for_each_online_node(nid) |
1177 | compact_node(nid); | 1155 | compact_node(nid); |
1178 | |||
1179 | return COMPACT_COMPLETE; | ||
1180 | } | 1156 | } |
1181 | 1157 | ||
1182 | /* The written value is actually unused, all memory is compacted */ | 1158 | /* The written value is actually unused, all memory is compacted */ |
@@ -1187,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, | |||
1187 | void __user *buffer, size_t *length, loff_t *ppos) | 1163 | void __user *buffer, size_t *length, loff_t *ppos) |
1188 | { | 1164 | { |
1189 | if (write) | 1165 | if (write) |
1190 | return compact_nodes(); | 1166 | compact_nodes(); |
1191 | 1167 | ||
1192 | return 0; | 1168 | return 0; |
1193 | } | 1169 | } |
diff --git a/mm/dmapool.c b/mm/dmapool.c index c5ab33bca0a8..c69781e97cf9 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */ | |||
50 | size_t allocation; | 50 | size_t allocation; |
51 | size_t boundary; | 51 | size_t boundary; |
52 | char name[32]; | 52 | char name[32]; |
53 | wait_queue_head_t waitq; | ||
54 | struct list_head pools; | 53 | struct list_head pools; |
55 | }; | 54 | }; |
56 | 55 | ||
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ | |||
62 | unsigned int offset; | 61 | unsigned int offset; |
63 | }; | 62 | }; |
64 | 63 | ||
65 | #define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) | ||
66 | |||
67 | static DEFINE_MUTEX(pools_lock); | 64 | static DEFINE_MUTEX(pools_lock); |
68 | 65 | ||
69 | static ssize_t | 66 | static ssize_t |
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
172 | retval->size = size; | 169 | retval->size = size; |
173 | retval->boundary = boundary; | 170 | retval->boundary = boundary; |
174 | retval->allocation = allocation; | 171 | retval->allocation = allocation; |
175 | init_waitqueue_head(&retval->waitq); | ||
176 | 172 | ||
177 | if (dev) { | 173 | if (dev) { |
178 | int ret; | 174 | int ret; |
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) | |||
227 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); | 223 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); |
228 | #endif | 224 | #endif |
229 | pool_initialise_page(pool, page); | 225 | pool_initialise_page(pool, page); |
230 | list_add(&page->page_list, &pool->page_list); | ||
231 | page->in_use = 0; | 226 | page->in_use = 0; |
232 | page->offset = 0; | 227 | page->offset = 0; |
233 | } else { | 228 | } else { |
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
315 | might_sleep_if(mem_flags & __GFP_WAIT); | 310 | might_sleep_if(mem_flags & __GFP_WAIT); |
316 | 311 | ||
317 | spin_lock_irqsave(&pool->lock, flags); | 312 | spin_lock_irqsave(&pool->lock, flags); |
318 | restart: | ||
319 | list_for_each_entry(page, &pool->page_list, page_list) { | 313 | list_for_each_entry(page, &pool->page_list, page_list) { |
320 | if (page->offset < pool->allocation) | 314 | if (page->offset < pool->allocation) |
321 | goto ready; | 315 | goto ready; |
322 | } | 316 | } |
323 | page = pool_alloc_page(pool, GFP_ATOMIC); | ||
324 | if (!page) { | ||
325 | if (mem_flags & __GFP_WAIT) { | ||
326 | DECLARE_WAITQUEUE(wait, current); | ||
327 | 317 | ||
328 | __set_current_state(TASK_UNINTERRUPTIBLE); | 318 | /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ |
329 | __add_wait_queue(&pool->waitq, &wait); | 319 | spin_unlock_irqrestore(&pool->lock, flags); |
330 | spin_unlock_irqrestore(&pool->lock, flags); | ||
331 | 320 | ||
332 | schedule_timeout(POOL_TIMEOUT_JIFFIES); | 321 | page = pool_alloc_page(pool, mem_flags); |
322 | if (!page) | ||
323 | return NULL; | ||
333 | 324 | ||
334 | spin_lock_irqsave(&pool->lock, flags); | 325 | spin_lock_irqsave(&pool->lock, flags); |
335 | __remove_wait_queue(&pool->waitq, &wait); | ||
336 | goto restart; | ||
337 | } | ||
338 | retval = NULL; | ||
339 | goto done; | ||
340 | } | ||
341 | 326 | ||
327 | list_add(&page->page_list, &pool->page_list); | ||
342 | ready: | 328 | ready: |
343 | page->in_use++; | 329 | page->in_use++; |
344 | offset = page->offset; | 330 | offset = page->offset; |
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
346 | retval = offset + page->vaddr; | 332 | retval = offset + page->vaddr; |
347 | *handle = offset + page->dma; | 333 | *handle = offset + page->dma; |
348 | #ifdef DMAPOOL_DEBUG | 334 | #ifdef DMAPOOL_DEBUG |
335 | { | ||
336 | int i; | ||
337 | u8 *data = retval; | ||
338 | /* page->offset is stored in first 4 bytes */ | ||
339 | for (i = sizeof(page->offset); i < pool->size; i++) { | ||
340 | if (data[i] == POOL_POISON_FREED) | ||
341 | continue; | ||
342 | if (pool->dev) | ||
343 | dev_err(pool->dev, | ||
344 | "dma_pool_alloc %s, %p (corruped)\n", | ||
345 | pool->name, retval); | ||
346 | else | ||
347 | pr_err("dma_pool_alloc %s, %p (corruped)\n", | ||
348 | pool->name, retval); | ||
349 | |||
350 | /* | ||
351 | * Dump the first 4 bytes even if they are not | ||
352 | * POOL_POISON_FREED | ||
353 | */ | ||
354 | print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, | ||
355 | data, pool->size, 1); | ||
356 | break; | ||
357 | } | ||
358 | } | ||
349 | memset(retval, POOL_POISON_ALLOCATED, pool->size); | 359 | memset(retval, POOL_POISON_ALLOCATED, pool->size); |
350 | #endif | 360 | #endif |
351 | done: | ||
352 | spin_unlock_irqrestore(&pool->lock, flags); | 361 | spin_unlock_irqrestore(&pool->lock, flags); |
353 | return retval; | 362 | return retval; |
354 | } | 363 | } |
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
435 | page->in_use--; | 444 | page->in_use--; |
436 | *(int *)vaddr = page->offset; | 445 | *(int *)vaddr = page->offset; |
437 | page->offset = offset; | 446 | page->offset = offset; |
438 | if (waitqueue_active(&pool->waitq)) | ||
439 | wake_up_locked(&pool->waitq); | ||
440 | /* | 447 | /* |
441 | * Resist a temptation to do | 448 | * Resist a temptation to do |
442 | * if (!is_page_busy(page)) pool_free_page(pool, page); | 449 | * if (!is_page_busy(page)) pool_free_page(pool, page); |
diff --git a/mm/highmem.c b/mm/highmem.c index 2da13a5c50e2..b32b70cdaed6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -99,12 +99,13 @@ struct page *kmap_to_page(void *vaddr) | |||
99 | unsigned long addr = (unsigned long)vaddr; | 99 | unsigned long addr = (unsigned long)vaddr; |
100 | 100 | ||
101 | if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { | 101 | if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { |
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | 102 | int i = PKMAP_NR(addr); |
103 | return pte_page(pkmap_page_table[i]); | 103 | return pte_page(pkmap_page_table[i]); |
104 | } | 104 | } |
105 | 105 | ||
106 | return virt_to_page(addr); | 106 | return virt_to_page(addr); |
107 | } | 107 | } |
108 | EXPORT_SYMBOL(kmap_to_page); | ||
108 | 109 | ||
109 | static void flush_all_zero_pkmaps(void) | 110 | static void flush_all_zero_pkmaps(void) |
110 | { | 111 | { |
@@ -137,8 +138,7 @@ static void flush_all_zero_pkmaps(void) | |||
137 | * So no dangers, even with speculative execution. | 138 | * So no dangers, even with speculative execution. |
138 | */ | 139 | */ |
139 | page = pte_page(pkmap_page_table[i]); | 140 | page = pte_page(pkmap_page_table[i]); |
140 | pte_clear(&init_mm, (unsigned long)page_address(page), | 141 | pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); |
141 | &pkmap_page_table[i]); | ||
142 | 142 | ||
143 | set_page_address(page, NULL); | 143 | set_page_address(page, NULL); |
144 | need_flush = 1; | 144 | need_flush = 1; |
@@ -324,11 +324,7 @@ struct page_address_map { | |||
324 | struct list_head list; | 324 | struct list_head list; |
325 | }; | 325 | }; |
326 | 326 | ||
327 | /* | 327 | static struct page_address_map page_address_maps[LAST_PKMAP]; |
328 | * page_address_map freelist, allocated from page_address_maps. | ||
329 | */ | ||
330 | static struct list_head page_address_pool; /* freelist */ | ||
331 | static spinlock_t pool_lock; /* protects page_address_pool */ | ||
332 | 328 | ||
333 | /* | 329 | /* |
334 | * Hash table bucket | 330 | * Hash table bucket |
@@ -393,14 +389,7 @@ void set_page_address(struct page *page, void *virtual) | |||
393 | 389 | ||
394 | pas = page_slot(page); | 390 | pas = page_slot(page); |
395 | if (virtual) { /* Add */ | 391 | if (virtual) { /* Add */ |
396 | BUG_ON(list_empty(&page_address_pool)); | 392 | pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; |
397 | |||
398 | spin_lock_irqsave(&pool_lock, flags); | ||
399 | pam = list_entry(page_address_pool.next, | ||
400 | struct page_address_map, list); | ||
401 | list_del(&pam->list); | ||
402 | spin_unlock_irqrestore(&pool_lock, flags); | ||
403 | |||
404 | pam->page = page; | 393 | pam->page = page; |
405 | pam->virtual = virtual; | 394 | pam->virtual = virtual; |
406 | 395 | ||
@@ -413,9 +402,6 @@ void set_page_address(struct page *page, void *virtual) | |||
413 | if (pam->page == page) { | 402 | if (pam->page == page) { |
414 | list_del(&pam->list); | 403 | list_del(&pam->list); |
415 | spin_unlock_irqrestore(&pas->lock, flags); | 404 | spin_unlock_irqrestore(&pas->lock, flags); |
416 | spin_lock_irqsave(&pool_lock, flags); | ||
417 | list_add_tail(&pam->list, &page_address_pool); | ||
418 | spin_unlock_irqrestore(&pool_lock, flags); | ||
419 | goto done; | 405 | goto done; |
420 | } | 406 | } |
421 | } | 407 | } |
@@ -425,20 +411,14 @@ done: | |||
425 | return; | 411 | return; |
426 | } | 412 | } |
427 | 413 | ||
428 | static struct page_address_map page_address_maps[LAST_PKMAP]; | ||
429 | |||
430 | void __init page_address_init(void) | 414 | void __init page_address_init(void) |
431 | { | 415 | { |
432 | int i; | 416 | int i; |
433 | 417 | ||
434 | INIT_LIST_HEAD(&page_address_pool); | ||
435 | for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) | ||
436 | list_add(&page_address_maps[i].list, &page_address_pool); | ||
437 | for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { | 418 | for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { |
438 | INIT_LIST_HEAD(&page_address_htable[i].lh); | 419 | INIT_LIST_HEAD(&page_address_htable[i].lh); |
439 | spin_lock_init(&page_address_htable[i].lock); | 420 | spin_lock_init(&page_address_htable[i].lock); |
440 | } | 421 | } |
441 | spin_lock_init(&pool_lock); | ||
442 | } | 422 | } |
443 | 423 | ||
444 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ | 424 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40f17c34b415..6001ee6347a9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -12,12 +12,15 @@ | |||
12 | #include <linux/mmu_notifier.h> | 12 | #include <linux/mmu_notifier.h> |
13 | #include <linux/rmap.h> | 13 | #include <linux/rmap.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/shrinker.h> | ||
15 | #include <linux/mm_inline.h> | 16 | #include <linux/mm_inline.h> |
16 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
17 | #include <linux/khugepaged.h> | 18 | #include <linux/khugepaged.h> |
18 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
19 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
20 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | ||
23 | |||
21 | #include <asm/tlb.h> | 24 | #include <asm/tlb.h> |
22 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
23 | #include "internal.h" | 26 | #include "internal.h" |
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly = | |||
37 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | 40 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| |
38 | #endif | 41 | #endif |
39 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | 42 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| |
40 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 43 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| |
44 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
41 | 45 | ||
42 | /* default scan 8*512 pte (or vmas) every 30 second */ | 46 | /* default scan 8*512 pte (or vmas) every 30 second */ |
43 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | 47 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; |
@@ -159,6 +163,77 @@ static int start_khugepaged(void) | |||
159 | return err; | 163 | return err; |
160 | } | 164 | } |
161 | 165 | ||
166 | static atomic_t huge_zero_refcount; | ||
167 | static unsigned long huge_zero_pfn __read_mostly; | ||
168 | |||
169 | static inline bool is_huge_zero_pfn(unsigned long pfn) | ||
170 | { | ||
171 | unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); | ||
172 | return zero_pfn && pfn == zero_pfn; | ||
173 | } | ||
174 | |||
175 | static inline bool is_huge_zero_pmd(pmd_t pmd) | ||
176 | { | ||
177 | return is_huge_zero_pfn(pmd_pfn(pmd)); | ||
178 | } | ||
179 | |||
180 | static unsigned long get_huge_zero_page(void) | ||
181 | { | ||
182 | struct page *zero_page; | ||
183 | retry: | ||
184 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | ||
185 | return ACCESS_ONCE(huge_zero_pfn); | ||
186 | |||
187 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | ||
188 | HPAGE_PMD_ORDER); | ||
189 | if (!zero_page) { | ||
190 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); | ||
191 | return 0; | ||
192 | } | ||
193 | count_vm_event(THP_ZERO_PAGE_ALLOC); | ||
194 | preempt_disable(); | ||
195 | if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { | ||
196 | preempt_enable(); | ||
197 | __free_page(zero_page); | ||
198 | goto retry; | ||
199 | } | ||
200 | |||
201 | /* We take additional reference here. It will be put back by shrinker */ | ||
202 | atomic_set(&huge_zero_refcount, 2); | ||
203 | preempt_enable(); | ||
204 | return ACCESS_ONCE(huge_zero_pfn); | ||
205 | } | ||
206 | |||
207 | static void put_huge_zero_page(void) | ||
208 | { | ||
209 | /* | ||
210 | * Counter should never go to zero here. Only shrinker can put | ||
211 | * last reference. | ||
212 | */ | ||
213 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); | ||
214 | } | ||
215 | |||
216 | static int shrink_huge_zero_page(struct shrinker *shrink, | ||
217 | struct shrink_control *sc) | ||
218 | { | ||
219 | if (!sc->nr_to_scan) | ||
220 | /* we can free zero page only if last reference remains */ | ||
221 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; | ||
222 | |||
223 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { | ||
224 | unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); | ||
225 | BUG_ON(zero_pfn == 0); | ||
226 | __free_page(__pfn_to_page(zero_pfn)); | ||
227 | } | ||
228 | |||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | static struct shrinker huge_zero_page_shrinker = { | ||
233 | .shrink = shrink_huge_zero_page, | ||
234 | .seeks = DEFAULT_SEEKS, | ||
235 | }; | ||
236 | |||
162 | #ifdef CONFIG_SYSFS | 237 | #ifdef CONFIG_SYSFS |
163 | 238 | ||
164 | static ssize_t double_flag_show(struct kobject *kobj, | 239 | static ssize_t double_flag_show(struct kobject *kobj, |
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj, | |||
284 | static struct kobj_attribute defrag_attr = | 359 | static struct kobj_attribute defrag_attr = |
285 | __ATTR(defrag, 0644, defrag_show, defrag_store); | 360 | __ATTR(defrag, 0644, defrag_show, defrag_store); |
286 | 361 | ||
362 | static ssize_t use_zero_page_show(struct kobject *kobj, | ||
363 | struct kobj_attribute *attr, char *buf) | ||
364 | { | ||
365 | return single_flag_show(kobj, attr, buf, | ||
366 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
367 | } | ||
368 | static ssize_t use_zero_page_store(struct kobject *kobj, | ||
369 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
370 | { | ||
371 | return single_flag_store(kobj, attr, buf, count, | ||
372 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
373 | } | ||
374 | static struct kobj_attribute use_zero_page_attr = | ||
375 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); | ||
287 | #ifdef CONFIG_DEBUG_VM | 376 | #ifdef CONFIG_DEBUG_VM |
288 | static ssize_t debug_cow_show(struct kobject *kobj, | 377 | static ssize_t debug_cow_show(struct kobject *kobj, |
289 | struct kobj_attribute *attr, char *buf) | 378 | struct kobj_attribute *attr, char *buf) |
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr = | |||
305 | static struct attribute *hugepage_attr[] = { | 394 | static struct attribute *hugepage_attr[] = { |
306 | &enabled_attr.attr, | 395 | &enabled_attr.attr, |
307 | &defrag_attr.attr, | 396 | &defrag_attr.attr, |
397 | &use_zero_page_attr.attr, | ||
308 | #ifdef CONFIG_DEBUG_VM | 398 | #ifdef CONFIG_DEBUG_VM |
309 | &debug_cow_attr.attr, | 399 | &debug_cow_attr.attr, |
310 | #endif | 400 | #endif |
@@ -484,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) | |||
484 | 574 | ||
485 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | 575 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
486 | if (unlikely(!*hugepage_kobj)) { | 576 | if (unlikely(!*hugepage_kobj)) { |
487 | printk(KERN_ERR "hugepage: failed kobject create\n"); | 577 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); |
488 | return -ENOMEM; | 578 | return -ENOMEM; |
489 | } | 579 | } |
490 | 580 | ||
491 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); | 581 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); |
492 | if (err) { | 582 | if (err) { |
493 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | 583 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); |
494 | goto delete_obj; | 584 | goto delete_obj; |
495 | } | 585 | } |
496 | 586 | ||
497 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); | 587 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); |
498 | if (err) { | 588 | if (err) { |
499 | printk(KERN_ERR "hugepage: failed register hugeage group\n"); | 589 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); |
500 | goto remove_hp_group; | 590 | goto remove_hp_group; |
501 | } | 591 | } |
502 | 592 | ||
@@ -550,6 +640,8 @@ static int __init hugepage_init(void) | |||
550 | goto out; | 640 | goto out; |
551 | } | 641 | } |
552 | 642 | ||
643 | register_shrinker(&huge_zero_page_shrinker); | ||
644 | |||
553 | /* | 645 | /* |
554 | * By default disable transparent hugepages on smaller systems, | 646 | * By default disable transparent hugepages on smaller systems, |
555 | * where the extra memory used could hurt more than TLB overhead | 647 | * where the extra memory used could hurt more than TLB overhead |
@@ -599,13 +691,22 @@ out: | |||
599 | } | 691 | } |
600 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 692 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
601 | 693 | ||
602 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
603 | { | 695 | { |
604 | if (likely(vma->vm_flags & VM_WRITE)) | 696 | if (likely(vma->vm_flags & VM_WRITE)) |
605 | pmd = pmd_mkwrite(pmd); | 697 | pmd = pmd_mkwrite(pmd); |
606 | return pmd; | 698 | return pmd; |
607 | } | 699 | } |
608 | 700 | ||
701 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) | ||
702 | { | ||
703 | pmd_t entry; | ||
704 | entry = mk_pmd(page, vma->vm_page_prot); | ||
705 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
706 | entry = pmd_mkhuge(entry); | ||
707 | return entry; | ||
708 | } | ||
709 | |||
609 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 710 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
610 | struct vm_area_struct *vma, | 711 | struct vm_area_struct *vma, |
611 | unsigned long haddr, pmd_t *pmd, | 712 | unsigned long haddr, pmd_t *pmd, |
@@ -629,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
629 | pte_free(mm, pgtable); | 730 | pte_free(mm, pgtable); |
630 | } else { | 731 | } else { |
631 | pmd_t entry; | 732 | pmd_t entry; |
632 | entry = mk_pmd(page, vma->vm_page_prot); | 733 | entry = mk_huge_pmd(page, vma); |
633 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
634 | entry = pmd_mkhuge(entry); | ||
635 | /* | 734 | /* |
636 | * The spinlocking to take the lru_lock inside | 735 | * The spinlocking to take the lru_lock inside |
637 | * page_add_new_anon_rmap() acts as a full memory | 736 | * page_add_new_anon_rmap() acts as a full memory |
@@ -671,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag) | |||
671 | } | 770 | } |
672 | #endif | 771 | #endif |
673 | 772 | ||
773 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | ||
774 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | ||
775 | unsigned long zero_pfn) | ||
776 | { | ||
777 | pmd_t entry; | ||
778 | if (!pmd_none(*pmd)) | ||
779 | return false; | ||
780 | entry = pfn_pmd(zero_pfn, vma->vm_page_prot); | ||
781 | entry = pmd_wrprotect(entry); | ||
782 | entry = pmd_mkhuge(entry); | ||
783 | set_pmd_at(mm, haddr, pmd, entry); | ||
784 | pgtable_trans_huge_deposit(mm, pgtable); | ||
785 | mm->nr_ptes++; | ||
786 | return true; | ||
787 | } | ||
788 | |||
674 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 789 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
675 | unsigned long address, pmd_t *pmd, | 790 | unsigned long address, pmd_t *pmd, |
676 | unsigned int flags) | 791 | unsigned int flags) |
@@ -684,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
684 | return VM_FAULT_OOM; | 799 | return VM_FAULT_OOM; |
685 | if (unlikely(khugepaged_enter(vma))) | 800 | if (unlikely(khugepaged_enter(vma))) |
686 | return VM_FAULT_OOM; | 801 | return VM_FAULT_OOM; |
802 | if (!(flags & FAULT_FLAG_WRITE) && | ||
803 | transparent_hugepage_use_zero_page()) { | ||
804 | pgtable_t pgtable; | ||
805 | unsigned long zero_pfn; | ||
806 | bool set; | ||
807 | pgtable = pte_alloc_one(mm, haddr); | ||
808 | if (unlikely(!pgtable)) | ||
809 | return VM_FAULT_OOM; | ||
810 | zero_pfn = get_huge_zero_page(); | ||
811 | if (unlikely(!zero_pfn)) { | ||
812 | pte_free(mm, pgtable); | ||
813 | count_vm_event(THP_FAULT_FALLBACK); | ||
814 | goto out; | ||
815 | } | ||
816 | spin_lock(&mm->page_table_lock); | ||
817 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | ||
818 | zero_pfn); | ||
819 | spin_unlock(&mm->page_table_lock); | ||
820 | if (!set) { | ||
821 | pte_free(mm, pgtable); | ||
822 | put_huge_zero_page(); | ||
823 | } | ||
824 | return 0; | ||
825 | } | ||
687 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 826 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
688 | vma, haddr, numa_node_id(), 0); | 827 | vma, haddr, numa_node_id(), 0); |
689 | if (unlikely(!page)) { | 828 | if (unlikely(!page)) { |
@@ -710,7 +849,8 @@ out: | |||
710 | * run pte_offset_map on the pmd, if an huge pmd could | 849 | * run pte_offset_map on the pmd, if an huge pmd could |
711 | * materialize from under us from a different thread. | 850 | * materialize from under us from a different thread. |
712 | */ | 851 | */ |
713 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | 852 | if (unlikely(pmd_none(*pmd)) && |
853 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
714 | return VM_FAULT_OOM; | 854 | return VM_FAULT_OOM; |
715 | /* if an huge pmd materialized from under us just retry later */ | 855 | /* if an huge pmd materialized from under us just retry later */ |
716 | if (unlikely(pmd_trans_huge(*pmd))) | 856 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -748,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
748 | pte_free(dst_mm, pgtable); | 888 | pte_free(dst_mm, pgtable); |
749 | goto out_unlock; | 889 | goto out_unlock; |
750 | } | 890 | } |
891 | /* | ||
892 | * mm->page_table_lock is enough to be sure that huge zero pmd is not | ||
893 | * under splitting since we don't split the page itself, only pmd to | ||
894 | * a page table. | ||
895 | */ | ||
896 | if (is_huge_zero_pmd(pmd)) { | ||
897 | unsigned long zero_pfn; | ||
898 | bool set; | ||
899 | /* | ||
900 | * get_huge_zero_page() will never allocate a new page here, | ||
901 | * since we already have a zero page to copy. It just takes a | ||
902 | * reference. | ||
903 | */ | ||
904 | zero_pfn = get_huge_zero_page(); | ||
905 | set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, | ||
906 | zero_pfn); | ||
907 | BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ | ||
908 | ret = 0; | ||
909 | goto out_unlock; | ||
910 | } | ||
751 | if (unlikely(pmd_trans_splitting(pmd))) { | 911 | if (unlikely(pmd_trans_splitting(pmd))) { |
752 | /* split huge page running from under us */ | 912 | /* split huge page running from under us */ |
753 | spin_unlock(&src_mm->page_table_lock); | 913 | spin_unlock(&src_mm->page_table_lock); |
@@ -777,6 +937,102 @@ out: | |||
777 | return ret; | 937 | return ret; |
778 | } | 938 | } |
779 | 939 | ||
940 | void huge_pmd_set_accessed(struct mm_struct *mm, | ||
941 | struct vm_area_struct *vma, | ||
942 | unsigned long address, | ||
943 | pmd_t *pmd, pmd_t orig_pmd, | ||
944 | int dirty) | ||
945 | { | ||
946 | pmd_t entry; | ||
947 | unsigned long haddr; | ||
948 | |||
949 | spin_lock(&mm->page_table_lock); | ||
950 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
951 | goto unlock; | ||
952 | |||
953 | entry = pmd_mkyoung(orig_pmd); | ||
954 | haddr = address & HPAGE_PMD_MASK; | ||
955 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) | ||
956 | update_mmu_cache_pmd(vma, address, pmd); | ||
957 | |||
958 | unlock: | ||
959 | spin_unlock(&mm->page_table_lock); | ||
960 | } | ||
961 | |||
962 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | ||
963 | struct vm_area_struct *vma, unsigned long address, | ||
964 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) | ||
965 | { | ||
966 | pgtable_t pgtable; | ||
967 | pmd_t _pmd; | ||
968 | struct page *page; | ||
969 | int i, ret = 0; | ||
970 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
971 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
972 | |||
973 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
974 | if (!page) { | ||
975 | ret |= VM_FAULT_OOM; | ||
976 | goto out; | ||
977 | } | ||
978 | |||
979 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | ||
980 | put_page(page); | ||
981 | ret |= VM_FAULT_OOM; | ||
982 | goto out; | ||
983 | } | ||
984 | |||
985 | clear_user_highpage(page, address); | ||
986 | __SetPageUptodate(page); | ||
987 | |||
988 | mmun_start = haddr; | ||
989 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
990 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
991 | |||
992 | spin_lock(&mm->page_table_lock); | ||
993 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
994 | goto out_free_page; | ||
995 | |||
996 | pmdp_clear_flush(vma, haddr, pmd); | ||
997 | /* leave pmd empty until pte is filled */ | ||
998 | |||
999 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
1000 | pmd_populate(mm, &_pmd, pgtable); | ||
1001 | |||
1002 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
1003 | pte_t *pte, entry; | ||
1004 | if (haddr == (address & PAGE_MASK)) { | ||
1005 | entry = mk_pte(page, vma->vm_page_prot); | ||
1006 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1007 | page_add_new_anon_rmap(page, vma, haddr); | ||
1008 | } else { | ||
1009 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
1010 | entry = pte_mkspecial(entry); | ||
1011 | } | ||
1012 | pte = pte_offset_map(&_pmd, haddr); | ||
1013 | VM_BUG_ON(!pte_none(*pte)); | ||
1014 | set_pte_at(mm, haddr, pte, entry); | ||
1015 | pte_unmap(pte); | ||
1016 | } | ||
1017 | smp_wmb(); /* make pte visible before pmd */ | ||
1018 | pmd_populate(mm, pmd, pgtable); | ||
1019 | spin_unlock(&mm->page_table_lock); | ||
1020 | put_huge_zero_page(); | ||
1021 | inc_mm_counter(mm, MM_ANONPAGES); | ||
1022 | |||
1023 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1024 | |||
1025 | ret |= VM_FAULT_WRITE; | ||
1026 | out: | ||
1027 | return ret; | ||
1028 | out_free_page: | ||
1029 | spin_unlock(&mm->page_table_lock); | ||
1030 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1031 | mem_cgroup_uncharge_page(page); | ||
1032 | put_page(page); | ||
1033 | goto out; | ||
1034 | } | ||
1035 | |||
780 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 1036 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
781 | struct vm_area_struct *vma, | 1037 | struct vm_area_struct *vma, |
782 | unsigned long address, | 1038 | unsigned long address, |
@@ -883,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
883 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | 1139 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) |
884 | { | 1140 | { |
885 | int ret = 0; | 1141 | int ret = 0; |
886 | struct page *page, *new_page; | 1142 | struct page *page = NULL, *new_page; |
887 | unsigned long haddr; | 1143 | unsigned long haddr; |
888 | unsigned long mmun_start; /* For mmu_notifiers */ | 1144 | unsigned long mmun_start; /* For mmu_notifiers */ |
889 | unsigned long mmun_end; /* For mmu_notifiers */ | 1145 | unsigned long mmun_end; /* For mmu_notifiers */ |
890 | 1146 | ||
891 | VM_BUG_ON(!vma->anon_vma); | 1147 | VM_BUG_ON(!vma->anon_vma); |
1148 | haddr = address & HPAGE_PMD_MASK; | ||
1149 | if (is_huge_zero_pmd(orig_pmd)) | ||
1150 | goto alloc; | ||
892 | spin_lock(&mm->page_table_lock); | 1151 | spin_lock(&mm->page_table_lock); |
893 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1152 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
894 | goto out_unlock; | 1153 | goto out_unlock; |
895 | 1154 | ||
896 | page = pmd_page(orig_pmd); | 1155 | page = pmd_page(orig_pmd); |
897 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | 1156 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); |
898 | haddr = address & HPAGE_PMD_MASK; | ||
899 | if (page_mapcount(page) == 1) { | 1157 | if (page_mapcount(page) == 1) { |
900 | pmd_t entry; | 1158 | pmd_t entry; |
901 | entry = pmd_mkyoung(orig_pmd); | 1159 | entry = pmd_mkyoung(orig_pmd); |
@@ -907,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
907 | } | 1165 | } |
908 | get_page(page); | 1166 | get_page(page); |
909 | spin_unlock(&mm->page_table_lock); | 1167 | spin_unlock(&mm->page_table_lock); |
910 | 1168 | alloc: | |
911 | if (transparent_hugepage_enabled(vma) && | 1169 | if (transparent_hugepage_enabled(vma) && |
912 | !transparent_hugepage_debug_cow()) | 1170 | !transparent_hugepage_debug_cow()) |
913 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1171 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
@@ -917,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
917 | 1175 | ||
918 | if (unlikely(!new_page)) { | 1176 | if (unlikely(!new_page)) { |
919 | count_vm_event(THP_FAULT_FALLBACK); | 1177 | count_vm_event(THP_FAULT_FALLBACK); |
920 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 1178 | if (is_huge_zero_pmd(orig_pmd)) { |
921 | pmd, orig_pmd, page, haddr); | 1179 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
922 | if (ret & VM_FAULT_OOM) | 1180 | address, pmd, orig_pmd, haddr); |
923 | split_huge_page(page); | 1181 | } else { |
924 | put_page(page); | 1182 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
1183 | pmd, orig_pmd, page, haddr); | ||
1184 | if (ret & VM_FAULT_OOM) | ||
1185 | split_huge_page(page); | ||
1186 | put_page(page); | ||
1187 | } | ||
925 | goto out; | 1188 | goto out; |
926 | } | 1189 | } |
927 | count_vm_event(THP_FAULT_ALLOC); | 1190 | count_vm_event(THP_FAULT_ALLOC); |
928 | 1191 | ||
929 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1192 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
930 | put_page(new_page); | 1193 | put_page(new_page); |
931 | split_huge_page(page); | 1194 | if (page) { |
932 | put_page(page); | 1195 | split_huge_page(page); |
1196 | put_page(page); | ||
1197 | } | ||
933 | ret |= VM_FAULT_OOM; | 1198 | ret |= VM_FAULT_OOM; |
934 | goto out; | 1199 | goto out; |
935 | } | 1200 | } |
936 | 1201 | ||
937 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 1202 | if (is_huge_zero_pmd(orig_pmd)) |
1203 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | ||
1204 | else | ||
1205 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | ||
938 | __SetPageUptodate(new_page); | 1206 | __SetPageUptodate(new_page); |
939 | 1207 | ||
940 | mmun_start = haddr; | 1208 | mmun_start = haddr; |
@@ -942,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
942 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1210 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
943 | 1211 | ||
944 | spin_lock(&mm->page_table_lock); | 1212 | spin_lock(&mm->page_table_lock); |
945 | put_page(page); | 1213 | if (page) |
1214 | put_page(page); | ||
946 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1215 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
947 | spin_unlock(&mm->page_table_lock); | 1216 | spin_unlock(&mm->page_table_lock); |
948 | mem_cgroup_uncharge_page(new_page); | 1217 | mem_cgroup_uncharge_page(new_page); |
@@ -950,16 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
950 | goto out_mn; | 1219 | goto out_mn; |
951 | } else { | 1220 | } else { |
952 | pmd_t entry; | 1221 | pmd_t entry; |
953 | VM_BUG_ON(!PageHead(page)); | 1222 | entry = mk_huge_pmd(new_page, vma); |
954 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
955 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
956 | entry = pmd_mkhuge(entry); | ||
957 | pmdp_clear_flush(vma, haddr, pmd); | 1223 | pmdp_clear_flush(vma, haddr, pmd); |
958 | page_add_new_anon_rmap(new_page, vma, haddr); | 1224 | page_add_new_anon_rmap(new_page, vma, haddr); |
959 | set_pmd_at(mm, haddr, pmd, entry); | 1225 | set_pmd_at(mm, haddr, pmd, entry); |
960 | update_mmu_cache_pmd(vma, address, pmd); | 1226 | update_mmu_cache_pmd(vma, address, pmd); |
961 | page_remove_rmap(page); | 1227 | if (is_huge_zero_pmd(orig_pmd)) { |
962 | put_page(page); | 1228 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1229 | put_huge_zero_page(); | ||
1230 | } else { | ||
1231 | VM_BUG_ON(!PageHead(page)); | ||
1232 | page_remove_rmap(page); | ||
1233 | put_page(page); | ||
1234 | } | ||
963 | ret |= VM_FAULT_WRITE; | 1235 | ret |= VM_FAULT_WRITE; |
964 | } | 1236 | } |
965 | spin_unlock(&mm->page_table_lock); | 1237 | spin_unlock(&mm->page_table_lock); |
@@ -1017,6 +1289,81 @@ out: | |||
1017 | return page; | 1289 | return page; |
1018 | } | 1290 | } |
1019 | 1291 | ||
1292 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1295 | { | ||
1296 | struct page *page; | ||
1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1298 | int target_nid; | ||
1299 | int current_nid = -1; | ||
1300 | bool migrated; | ||
1301 | bool page_locked = false; | ||
1302 | |||
1303 | spin_lock(&mm->page_table_lock); | ||
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1305 | goto out_unlock; | ||
1306 | |||
1307 | page = pmd_page(pmd); | ||
1308 | get_page(page); | ||
1309 | current_nid = page_to_nid(page); | ||
1310 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
1311 | if (current_nid == numa_node_id()) | ||
1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
1313 | |||
1314 | target_nid = mpol_misplaced(page, vma, haddr); | ||
1315 | if (target_nid == -1) { | ||
1316 | put_page(page); | ||
1317 | goto clear_pmdnuma; | ||
1318 | } | ||
1319 | |||
1320 | /* Acquire the page lock to serialise THP migrations */ | ||
1321 | spin_unlock(&mm->page_table_lock); | ||
1322 | lock_page(page); | ||
1323 | page_locked = true; | ||
1324 | |||
1325 | /* Confirm the PTE did not while locked */ | ||
1326 | spin_lock(&mm->page_table_lock); | ||
1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1328 | unlock_page(page); | ||
1329 | put_page(page); | ||
1330 | goto out_unlock; | ||
1331 | } | ||
1332 | spin_unlock(&mm->page_table_lock); | ||
1333 | |||
1334 | /* Migrate the THP to the requested node */ | ||
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | ||
1336 | pmdp, pmd, addr, | ||
1337 | page, target_nid); | ||
1338 | if (migrated) | ||
1339 | current_nid = target_nid; | ||
1340 | else { | ||
1341 | spin_lock(&mm->page_table_lock); | ||
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1343 | unlock_page(page); | ||
1344 | goto out_unlock; | ||
1345 | } | ||
1346 | goto clear_pmdnuma; | ||
1347 | } | ||
1348 | |||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1350 | return 0; | ||
1351 | |||
1352 | clear_pmdnuma: | ||
1353 | pmd = pmd_mknonnuma(pmd); | ||
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1357 | if (page_locked) | ||
1358 | unlock_page(page); | ||
1359 | |||
1360 | out_unlock: | ||
1361 | spin_unlock(&mm->page_table_lock); | ||
1362 | if (current_nid != -1) | ||
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1364 | return 0; | ||
1365 | } | ||
1366 | |||
1020 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1021 | pmd_t *pmd, unsigned long addr) | 1368 | pmd_t *pmd, unsigned long addr) |
1022 | { | 1369 | { |
@@ -1028,15 +1375,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1028 | pmd_t orig_pmd; | 1375 | pmd_t orig_pmd; |
1029 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); | 1376 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1030 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); | 1377 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1031 | page = pmd_page(orig_pmd); | ||
1032 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1378 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1033 | page_remove_rmap(page); | 1379 | if (is_huge_zero_pmd(orig_pmd)) { |
1034 | VM_BUG_ON(page_mapcount(page) < 0); | 1380 | tlb->mm->nr_ptes--; |
1035 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1381 | spin_unlock(&tlb->mm->page_table_lock); |
1036 | VM_BUG_ON(!PageHead(page)); | 1382 | put_huge_zero_page(); |
1037 | tlb->mm->nr_ptes--; | 1383 | } else { |
1038 | spin_unlock(&tlb->mm->page_table_lock); | 1384 | page = pmd_page(orig_pmd); |
1039 | tlb_remove_page(tlb, page); | 1385 | page_remove_rmap(page); |
1386 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1387 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1388 | VM_BUG_ON(!PageHead(page)); | ||
1389 | tlb->mm->nr_ptes--; | ||
1390 | spin_unlock(&tlb->mm->page_table_lock); | ||
1391 | tlb_remove_page(tlb, page); | ||
1392 | } | ||
1040 | pte_free(tlb->mm, pgtable); | 1393 | pte_free(tlb->mm, pgtable); |
1041 | ret = 1; | 1394 | ret = 1; |
1042 | } | 1395 | } |
@@ -1099,7 +1452,7 @@ out: | |||
1099 | } | 1452 | } |
1100 | 1453 | ||
1101 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1102 | unsigned long addr, pgprot_t newprot) | 1455 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1103 | { | 1456 | { |
1104 | struct mm_struct *mm = vma->vm_mm; | 1457 | struct mm_struct *mm = vma->vm_mm; |
1105 | int ret = 0; | 1458 | int ret = 0; |
@@ -1107,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1107 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1108 | pmd_t entry; | 1461 | pmd_t entry; |
1109 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1462 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1110 | entry = pmd_modify(entry, newprot); | 1463 | if (!prot_numa) { |
1464 | entry = pmd_modify(entry, newprot); | ||
1465 | BUG_ON(pmd_write(entry)); | ||
1466 | } else { | ||
1467 | struct page *page = pmd_page(*pmd); | ||
1468 | |||
1469 | /* only check non-shared pages */ | ||
1470 | if (page_mapcount(page) == 1 && | ||
1471 | !pmd_numa(*pmd)) { | ||
1472 | entry = pmd_mknuma(entry); | ||
1473 | } | ||
1474 | } | ||
1111 | set_pmd_at(mm, addr, pmd, entry); | 1475 | set_pmd_at(mm, addr, pmd, entry); |
1112 | spin_unlock(&vma->vm_mm->page_table_lock); | 1476 | spin_unlock(&vma->vm_mm->page_table_lock); |
1113 | ret = 1; | 1477 | ret = 1; |
@@ -1146,22 +1510,14 @@ pmd_t *page_check_address_pmd(struct page *page, | |||
1146 | unsigned long address, | 1510 | unsigned long address, |
1147 | enum page_check_address_pmd_flag flag) | 1511 | enum page_check_address_pmd_flag flag) |
1148 | { | 1512 | { |
1149 | pgd_t *pgd; | ||
1150 | pud_t *pud; | ||
1151 | pmd_t *pmd, *ret = NULL; | 1513 | pmd_t *pmd, *ret = NULL; |
1152 | 1514 | ||
1153 | if (address & ~HPAGE_PMD_MASK) | 1515 | if (address & ~HPAGE_PMD_MASK) |
1154 | goto out; | 1516 | goto out; |
1155 | 1517 | ||
1156 | pgd = pgd_offset(mm, address); | 1518 | pmd = mm_find_pmd(mm, address); |
1157 | if (!pgd_present(*pgd)) | 1519 | if (!pmd) |
1158 | goto out; | 1520 | goto out; |
1159 | |||
1160 | pud = pud_offset(pgd, address); | ||
1161 | if (!pud_present(*pud)) | ||
1162 | goto out; | ||
1163 | |||
1164 | pmd = pmd_offset(pud, address); | ||
1165 | if (pmd_none(*pmd)) | 1521 | if (pmd_none(*pmd)) |
1166 | goto out; | 1522 | goto out; |
1167 | if (pmd_page(*pmd) != page) | 1523 | if (pmd_page(*pmd) != page) |
@@ -1205,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1205 | * We can't temporarily set the pmd to null in order | 1561 | * We can't temporarily set the pmd to null in order |
1206 | * to split it, the pmd must remain marked huge at all | 1562 | * to split it, the pmd must remain marked huge at all |
1207 | * times or the VM won't take the pmd_trans_huge paths | 1563 | * times or the VM won't take the pmd_trans_huge paths |
1208 | * and it won't wait on the anon_vma->root->mutex to | 1564 | * and it won't wait on the anon_vma->root->rwsem to |
1209 | * serialize against split_huge_page*. | 1565 | * serialize against split_huge_page*. |
1210 | */ | 1566 | */ |
1211 | pmdp_splitting_flush(vma, address, pmd); | 1567 | pmdp_splitting_flush(vma, address, pmd); |
@@ -1296,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1296 | page_tail->mapping = page->mapping; | 1652 | page_tail->mapping = page->mapping; |
1297 | 1653 | ||
1298 | page_tail->index = page->index + i; | 1654 | page_tail->index = page->index + i; |
1655 | page_xchg_last_nid(page_tail, page_last_nid(page)); | ||
1299 | 1656 | ||
1300 | BUG_ON(!PageAnon(page_tail)); | 1657 | BUG_ON(!PageAnon(page_tail)); |
1301 | BUG_ON(!PageUptodate(page_tail)); | 1658 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1363,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, | |||
1363 | BUG_ON(page_mapcount(page) != 1); | 1720 | BUG_ON(page_mapcount(page) != 1); |
1364 | if (!pmd_young(*pmd)) | 1721 | if (!pmd_young(*pmd)) |
1365 | entry = pte_mkold(entry); | 1722 | entry = pte_mkold(entry); |
1723 | if (pmd_numa(*pmd)) | ||
1724 | entry = pte_mknuma(entry); | ||
1366 | pte = pte_offset_map(&_pmd, haddr); | 1725 | pte = pte_offset_map(&_pmd, haddr); |
1367 | BUG_ON(!pte_none(*pte)); | 1726 | BUG_ON(!pte_none(*pte)); |
1368 | set_pte_at(mm, haddr, pte, entry); | 1727 | set_pte_at(mm, haddr, pte, entry); |
@@ -1405,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, | |||
1405 | return ret; | 1764 | return ret; |
1406 | } | 1765 | } |
1407 | 1766 | ||
1408 | /* must be called with anon_vma->root->mutex hold */ | 1767 | /* must be called with anon_vma->root->rwsem held */ |
1409 | static void __split_huge_page(struct page *page, | 1768 | static void __split_huge_page(struct page *page, |
1410 | struct anon_vma *anon_vma) | 1769 | struct anon_vma *anon_vma) |
1411 | { | 1770 | { |
@@ -1458,10 +1817,21 @@ int split_huge_page(struct page *page) | |||
1458 | struct anon_vma *anon_vma; | 1817 | struct anon_vma *anon_vma; |
1459 | int ret = 1; | 1818 | int ret = 1; |
1460 | 1819 | ||
1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | ||
1461 | BUG_ON(!PageAnon(page)); | 1821 | BUG_ON(!PageAnon(page)); |
1462 | anon_vma = page_lock_anon_vma(page); | 1822 | |
1823 | /* | ||
1824 | * The caller does not necessarily hold an mmap_sem that would prevent | ||
1825 | * the anon_vma disappearing so we first we take a reference to it | ||
1826 | * and then lock the anon_vma for write. This is similar to | ||
1827 | * page_lock_anon_vma_read except the write lock is taken to serialise | ||
1828 | * against parallel split or collapse operations. | ||
1829 | */ | ||
1830 | anon_vma = page_get_anon_vma(page); | ||
1463 | if (!anon_vma) | 1831 | if (!anon_vma) |
1464 | goto out; | 1832 | goto out; |
1833 | anon_vma_lock_write(anon_vma); | ||
1834 | |||
1465 | ret = 0; | 1835 | ret = 0; |
1466 | if (!PageCompound(page)) | 1836 | if (!PageCompound(page)) |
1467 | goto out_unlock; | 1837 | goto out_unlock; |
@@ -1472,7 +1842,8 @@ int split_huge_page(struct page *page) | |||
1472 | 1842 | ||
1473 | BUG_ON(PageCompound(page)); | 1843 | BUG_ON(PageCompound(page)); |
1474 | out_unlock: | 1844 | out_unlock: |
1475 | page_unlock_anon_vma(anon_vma); | 1845 | anon_vma_unlock(anon_vma); |
1846 | put_anon_vma(anon_vma); | ||
1476 | out: | 1847 | out: |
1477 | return ret; | 1848 | return ret; |
1478 | } | 1849 | } |
@@ -1701,64 +2072,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) | |||
1701 | } | 2072 | } |
1702 | } | 2073 | } |
1703 | 2074 | ||
1704 | static void release_all_pte_pages(pte_t *pte) | ||
1705 | { | ||
1706 | release_pte_pages(pte, pte + HPAGE_PMD_NR); | ||
1707 | } | ||
1708 | |||
1709 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | 2075 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, |
1710 | unsigned long address, | 2076 | unsigned long address, |
1711 | pte_t *pte) | 2077 | pte_t *pte) |
1712 | { | 2078 | { |
1713 | struct page *page; | 2079 | struct page *page; |
1714 | pte_t *_pte; | 2080 | pte_t *_pte; |
1715 | int referenced = 0, isolated = 0, none = 0; | 2081 | int referenced = 0, none = 0; |
1716 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2082 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
1717 | _pte++, address += PAGE_SIZE) { | 2083 | _pte++, address += PAGE_SIZE) { |
1718 | pte_t pteval = *_pte; | 2084 | pte_t pteval = *_pte; |
1719 | if (pte_none(pteval)) { | 2085 | if (pte_none(pteval)) { |
1720 | if (++none <= khugepaged_max_ptes_none) | 2086 | if (++none <= khugepaged_max_ptes_none) |
1721 | continue; | 2087 | continue; |
1722 | else { | 2088 | else |
1723 | release_pte_pages(pte, _pte); | ||
1724 | goto out; | 2089 | goto out; |
1725 | } | ||
1726 | } | 2090 | } |
1727 | if (!pte_present(pteval) || !pte_write(pteval)) { | 2091 | if (!pte_present(pteval) || !pte_write(pteval)) |
1728 | release_pte_pages(pte, _pte); | ||
1729 | goto out; | 2092 | goto out; |
1730 | } | ||
1731 | page = vm_normal_page(vma, address, pteval); | 2093 | page = vm_normal_page(vma, address, pteval); |
1732 | if (unlikely(!page)) { | 2094 | if (unlikely(!page)) |
1733 | release_pte_pages(pte, _pte); | ||
1734 | goto out; | 2095 | goto out; |
1735 | } | 2096 | |
1736 | VM_BUG_ON(PageCompound(page)); | 2097 | VM_BUG_ON(PageCompound(page)); |
1737 | BUG_ON(!PageAnon(page)); | 2098 | BUG_ON(!PageAnon(page)); |
1738 | VM_BUG_ON(!PageSwapBacked(page)); | 2099 | VM_BUG_ON(!PageSwapBacked(page)); |
1739 | 2100 | ||
1740 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2101 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
1741 | if (page_count(page) != 1) { | 2102 | if (page_count(page) != 1) |
1742 | release_pte_pages(pte, _pte); | ||
1743 | goto out; | 2103 | goto out; |
1744 | } | ||
1745 | /* | 2104 | /* |
1746 | * We can do it before isolate_lru_page because the | 2105 | * We can do it before isolate_lru_page because the |
1747 | * page can't be freed from under us. NOTE: PG_lock | 2106 | * page can't be freed from under us. NOTE: PG_lock |
1748 | * is needed to serialize against split_huge_page | 2107 | * is needed to serialize against split_huge_page |
1749 | * when invoked from the VM. | 2108 | * when invoked from the VM. |
1750 | */ | 2109 | */ |
1751 | if (!trylock_page(page)) { | 2110 | if (!trylock_page(page)) |
1752 | release_pte_pages(pte, _pte); | ||
1753 | goto out; | 2111 | goto out; |
1754 | } | ||
1755 | /* | 2112 | /* |
1756 | * Isolate the page to avoid collapsing an hugepage | 2113 | * Isolate the page to avoid collapsing an hugepage |
1757 | * currently in use by the VM. | 2114 | * currently in use by the VM. |
1758 | */ | 2115 | */ |
1759 | if (isolate_lru_page(page)) { | 2116 | if (isolate_lru_page(page)) { |
1760 | unlock_page(page); | 2117 | unlock_page(page); |
1761 | release_pte_pages(pte, _pte); | ||
1762 | goto out; | 2118 | goto out; |
1763 | } | 2119 | } |
1764 | /* 0 stands for page_is_file_cache(page) == false */ | 2120 | /* 0 stands for page_is_file_cache(page) == false */ |
@@ -1771,12 +2127,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
1771 | mmu_notifier_test_young(vma->vm_mm, address)) | 2127 | mmu_notifier_test_young(vma->vm_mm, address)) |
1772 | referenced = 1; | 2128 | referenced = 1; |
1773 | } | 2129 | } |
1774 | if (unlikely(!referenced)) | 2130 | if (likely(referenced)) |
1775 | release_all_pte_pages(pte); | 2131 | return 1; |
1776 | else | ||
1777 | isolated = 1; | ||
1778 | out: | 2132 | out: |
1779 | return isolated; | 2133 | release_pte_pages(pte, _pte); |
2134 | return 0; | ||
1780 | } | 2135 | } |
1781 | 2136 | ||
1782 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | 2137 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, |
@@ -1918,14 +2273,26 @@ static struct page | |||
1918 | } | 2273 | } |
1919 | #endif | 2274 | #endif |
1920 | 2275 | ||
2276 | static bool hugepage_vma_check(struct vm_area_struct *vma) | ||
2277 | { | ||
2278 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
2279 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
2280 | return false; | ||
2281 | |||
2282 | if (!vma->anon_vma || vma->vm_ops) | ||
2283 | return false; | ||
2284 | if (is_vma_temporary_stack(vma)) | ||
2285 | return false; | ||
2286 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | ||
2287 | return true; | ||
2288 | } | ||
2289 | |||
1921 | static void collapse_huge_page(struct mm_struct *mm, | 2290 | static void collapse_huge_page(struct mm_struct *mm, |
1922 | unsigned long address, | 2291 | unsigned long address, |
1923 | struct page **hpage, | 2292 | struct page **hpage, |
1924 | struct vm_area_struct *vma, | 2293 | struct vm_area_struct *vma, |
1925 | int node) | 2294 | int node) |
1926 | { | 2295 | { |
1927 | pgd_t *pgd; | ||
1928 | pud_t *pud; | ||
1929 | pmd_t *pmd, _pmd; | 2296 | pmd_t *pmd, _pmd; |
1930 | pte_t *pte; | 2297 | pte_t *pte; |
1931 | pgtable_t pgtable; | 2298 | pgtable_t pgtable; |
@@ -1960,31 +2327,15 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1960 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2327 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1961 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | 2328 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) |
1962 | goto out; | 2329 | goto out; |
1963 | 2330 | if (!hugepage_vma_check(vma)) | |
1964 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
1965 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
1966 | goto out; | ||
1967 | |||
1968 | if (!vma->anon_vma || vma->vm_ops) | ||
1969 | goto out; | ||
1970 | if (is_vma_temporary_stack(vma)) | ||
1971 | goto out; | 2331 | goto out; |
1972 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2332 | pmd = mm_find_pmd(mm, address); |
1973 | 2333 | if (!pmd) | |
1974 | pgd = pgd_offset(mm, address); | ||
1975 | if (!pgd_present(*pgd)) | ||
1976 | goto out; | ||
1977 | |||
1978 | pud = pud_offset(pgd, address); | ||
1979 | if (!pud_present(*pud)) | ||
1980 | goto out; | 2334 | goto out; |
1981 | 2335 | if (pmd_trans_huge(*pmd)) | |
1982 | pmd = pmd_offset(pud, address); | ||
1983 | /* pmd can't go away or become huge under us */ | ||
1984 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1985 | goto out; | 2336 | goto out; |
1986 | 2337 | ||
1987 | anon_vma_lock(vma->anon_vma); | 2338 | anon_vma_lock_write(vma->anon_vma); |
1988 | 2339 | ||
1989 | pte = pte_offset_map(pmd, address); | 2340 | pte = pte_offset_map(pmd, address); |
1990 | ptl = pte_lockptr(mm, pmd); | 2341 | ptl = pte_lockptr(mm, pmd); |
@@ -2028,9 +2379,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2028 | __SetPageUptodate(new_page); | 2379 | __SetPageUptodate(new_page); |
2029 | pgtable = pmd_pgtable(_pmd); | 2380 | pgtable = pmd_pgtable(_pmd); |
2030 | 2381 | ||
2031 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | 2382 | _pmd = mk_huge_pmd(new_page, vma); |
2032 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
2033 | _pmd = pmd_mkhuge(_pmd); | ||
2034 | 2383 | ||
2035 | /* | 2384 | /* |
2036 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2385 | * spin_lock() below is not the equivalent of smp_wmb(), so |
@@ -2064,8 +2413,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2064 | unsigned long address, | 2413 | unsigned long address, |
2065 | struct page **hpage) | 2414 | struct page **hpage) |
2066 | { | 2415 | { |
2067 | pgd_t *pgd; | ||
2068 | pud_t *pud; | ||
2069 | pmd_t *pmd; | 2416 | pmd_t *pmd; |
2070 | pte_t *pte, *_pte; | 2417 | pte_t *pte, *_pte; |
2071 | int ret = 0, referenced = 0, none = 0; | 2418 | int ret = 0, referenced = 0, none = 0; |
@@ -2076,16 +2423,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2076 | 2423 | ||
2077 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2424 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2078 | 2425 | ||
2079 | pgd = pgd_offset(mm, address); | 2426 | pmd = mm_find_pmd(mm, address); |
2080 | if (!pgd_present(*pgd)) | 2427 | if (!pmd) |
2081 | goto out; | 2428 | goto out; |
2082 | 2429 | if (pmd_trans_huge(*pmd)) | |
2083 | pud = pud_offset(pgd, address); | ||
2084 | if (!pud_present(*pud)) | ||
2085 | goto out; | ||
2086 | |||
2087 | pmd = pmd_offset(pud, address); | ||
2088 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
2089 | goto out; | 2430 | goto out; |
2090 | 2431 | ||
2091 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2432 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
@@ -2193,20 +2534,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2193 | progress++; | 2534 | progress++; |
2194 | break; | 2535 | break; |
2195 | } | 2536 | } |
2196 | 2537 | if (!hugepage_vma_check(vma)) { | |
2197 | if ((!(vma->vm_flags & VM_HUGEPAGE) && | 2538 | skip: |
2198 | !khugepaged_always()) || | ||
2199 | (vma->vm_flags & VM_NOHUGEPAGE)) { | ||
2200 | skip: | ||
2201 | progress++; | 2539 | progress++; |
2202 | continue; | 2540 | continue; |
2203 | } | 2541 | } |
2204 | if (!vma->anon_vma || vma->vm_ops) | ||
2205 | goto skip; | ||
2206 | if (is_vma_temporary_stack(vma)) | ||
2207 | goto skip; | ||
2208 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | ||
2209 | |||
2210 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2542 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2211 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2543 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2212 | if (hstart >= hend) | 2544 | if (hstart >= hend) |
@@ -2356,19 +2688,65 @@ static int khugepaged(void *none) | |||
2356 | return 0; | 2688 | return 0; |
2357 | } | 2689 | } |
2358 | 2690 | ||
2359 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | 2691 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, |
2692 | unsigned long haddr, pmd_t *pmd) | ||
2693 | { | ||
2694 | struct mm_struct *mm = vma->vm_mm; | ||
2695 | pgtable_t pgtable; | ||
2696 | pmd_t _pmd; | ||
2697 | int i; | ||
2698 | |||
2699 | pmdp_clear_flush(vma, haddr, pmd); | ||
2700 | /* leave pmd empty until pte is filled */ | ||
2701 | |||
2702 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
2703 | pmd_populate(mm, &_pmd, pgtable); | ||
2704 | |||
2705 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
2706 | pte_t *pte, entry; | ||
2707 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
2708 | entry = pte_mkspecial(entry); | ||
2709 | pte = pte_offset_map(&_pmd, haddr); | ||
2710 | VM_BUG_ON(!pte_none(*pte)); | ||
2711 | set_pte_at(mm, haddr, pte, entry); | ||
2712 | pte_unmap(pte); | ||
2713 | } | ||
2714 | smp_wmb(); /* make pte visible before pmd */ | ||
2715 | pmd_populate(mm, pmd, pgtable); | ||
2716 | put_huge_zero_page(); | ||
2717 | } | ||
2718 | |||
2719 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | ||
2720 | pmd_t *pmd) | ||
2360 | { | 2721 | { |
2361 | struct page *page; | 2722 | struct page *page; |
2723 | struct mm_struct *mm = vma->vm_mm; | ||
2724 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
2725 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2726 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2727 | |||
2728 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); | ||
2362 | 2729 | ||
2730 | mmun_start = haddr; | ||
2731 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
2732 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2363 | spin_lock(&mm->page_table_lock); | 2733 | spin_lock(&mm->page_table_lock); |
2364 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2734 | if (unlikely(!pmd_trans_huge(*pmd))) { |
2365 | spin_unlock(&mm->page_table_lock); | 2735 | spin_unlock(&mm->page_table_lock); |
2736 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2737 | return; | ||
2738 | } | ||
2739 | if (is_huge_zero_pmd(*pmd)) { | ||
2740 | __split_huge_zero_page_pmd(vma, haddr, pmd); | ||
2741 | spin_unlock(&mm->page_table_lock); | ||
2742 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2366 | return; | 2743 | return; |
2367 | } | 2744 | } |
2368 | page = pmd_page(*pmd); | 2745 | page = pmd_page(*pmd); |
2369 | VM_BUG_ON(!page_count(page)); | 2746 | VM_BUG_ON(!page_count(page)); |
2370 | get_page(page); | 2747 | get_page(page); |
2371 | spin_unlock(&mm->page_table_lock); | 2748 | spin_unlock(&mm->page_table_lock); |
2749 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2372 | 2750 | ||
2373 | split_huge_page(page); | 2751 | split_huge_page(page); |
2374 | 2752 | ||
@@ -2376,31 +2754,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | |||
2376 | BUG_ON(pmd_trans_huge(*pmd)); | 2754 | BUG_ON(pmd_trans_huge(*pmd)); |
2377 | } | 2755 | } |
2378 | 2756 | ||
2757 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | ||
2758 | pmd_t *pmd) | ||
2759 | { | ||
2760 | struct vm_area_struct *vma; | ||
2761 | |||
2762 | vma = find_vma(mm, address); | ||
2763 | BUG_ON(vma == NULL); | ||
2764 | split_huge_page_pmd(vma, address, pmd); | ||
2765 | } | ||
2766 | |||
2379 | static void split_huge_page_address(struct mm_struct *mm, | 2767 | static void split_huge_page_address(struct mm_struct *mm, |
2380 | unsigned long address) | 2768 | unsigned long address) |
2381 | { | 2769 | { |
2382 | pgd_t *pgd; | ||
2383 | pud_t *pud; | ||
2384 | pmd_t *pmd; | 2770 | pmd_t *pmd; |
2385 | 2771 | ||
2386 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | 2772 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); |
2387 | 2773 | ||
2388 | pgd = pgd_offset(mm, address); | 2774 | pmd = mm_find_pmd(mm, address); |
2389 | if (!pgd_present(*pgd)) | 2775 | if (!pmd) |
2390 | return; | ||
2391 | |||
2392 | pud = pud_offset(pgd, address); | ||
2393 | if (!pud_present(*pud)) | ||
2394 | return; | ||
2395 | |||
2396 | pmd = pmd_offset(pud, address); | ||
2397 | if (!pmd_present(*pmd)) | ||
2398 | return; | 2776 | return; |
2399 | /* | 2777 | /* |
2400 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2778 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
2401 | * materialize from under us. | 2779 | * materialize from under us. |
2402 | */ | 2780 | */ |
2403 | split_huge_page_pmd(mm, pmd); | 2781 | split_huge_page_pmd_mm(mm, address, pmd); |
2404 | } | 2782 | } |
2405 | 2783 | ||
2406 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | 2784 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59a0059b39e2..4f3ea0b1e57c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Generic hugetlb support. | 2 | * Generic hugetlb support. |
3 | * (C) William Irwin, April 2004 | 3 | * (C) Nadia Yvette Chambers, April 2004 |
4 | */ | 4 | */ |
5 | #include <linux/list.h> | 5 | #include <linux/list.h> |
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
1057 | * on-line nodes with memory and will handle the hstate accounting. | 1057 | * on-line nodes with memory and will handle the hstate accounting. |
1058 | */ | 1058 | */ |
1059 | while (nr_pages--) { | 1059 | while (nr_pages--) { |
1060 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) | 1060 | if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) |
1061 | break; | 1061 | break; |
1062 | } | 1062 | } |
1063 | } | 1063 | } |
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1181 | { | 1181 | { |
1182 | struct huge_bootmem_page *m; | 1182 | struct huge_bootmem_page *m; |
1183 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 1183 | int nr_nodes = nodes_weight(node_states[N_MEMORY]); |
1184 | 1184 | ||
1185 | while (nr_nodes) { | 1185 | while (nr_nodes) { |
1186 | void *addr; | 1186 | void *addr; |
1187 | 1187 | ||
1188 | addr = __alloc_bootmem_node_nopanic( | 1188 | addr = __alloc_bootmem_node_nopanic( |
1189 | NODE_DATA(hstate_next_node_to_alloc(h, | 1189 | NODE_DATA(hstate_next_node_to_alloc(h, |
1190 | &node_states[N_HIGH_MEMORY])), | 1190 | &node_states[N_MEMORY])), |
1191 | huge_page_size(h), huge_page_size(h), 0); | 1191 | huge_page_size(h), huge_page_size(h), 0); |
1192 | 1192 | ||
1193 | if (addr) { | 1193 | if (addr) { |
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1259 | if (!alloc_bootmem_huge_page(h)) | 1259 | if (!alloc_bootmem_huge_page(h)) |
1260 | break; | 1260 | break; |
1261 | } else if (!alloc_fresh_huge_page(h, | 1261 | } else if (!alloc_fresh_huge_page(h, |
1262 | &node_states[N_HIGH_MEMORY])) | 1262 | &node_states[N_MEMORY])) |
1263 | break; | 1263 | break; |
1264 | } | 1264 | } |
1265 | h->max_huge_pages = i; | 1265 | h->max_huge_pages = i; |
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1527 | if (!(obey_mempolicy && | 1527 | if (!(obey_mempolicy && |
1528 | init_nodemask_of_mempolicy(nodes_allowed))) { | 1528 | init_nodemask_of_mempolicy(nodes_allowed))) { |
1529 | NODEMASK_FREE(nodes_allowed); | 1529 | NODEMASK_FREE(nodes_allowed); |
1530 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1530 | nodes_allowed = &node_states[N_MEMORY]; |
1531 | } | 1531 | } |
1532 | } else if (nodes_allowed) { | 1532 | } else if (nodes_allowed) { |
1533 | /* | 1533 | /* |
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | 1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; |
1538 | init_nodemask_of_node(nodes_allowed, nid); | 1538 | init_nodemask_of_node(nodes_allowed, nid); |
1539 | } else | 1539 | } else |
1540 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1540 | nodes_allowed = &node_states[N_MEMORY]; |
1541 | 1541 | ||
1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | 1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); |
1543 | 1543 | ||
1544 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 1544 | if (nodes_allowed != &node_states[N_MEMORY]) |
1545 | NODEMASK_FREE(nodes_allowed); | 1545 | NODEMASK_FREE(nodes_allowed); |
1546 | 1546 | ||
1547 | return len; | 1547 | return len; |
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void) | |||
1800 | * remove hstate attributes from any nodes that have them. | 1800 | * remove hstate attributes from any nodes that have them. |
1801 | */ | 1801 | */ |
1802 | for (nid = 0; nid < nr_node_ids; nid++) | 1802 | for (nid = 0; nid < nr_node_ids; nid++) |
1803 | hugetlb_unregister_node(&node_devices[nid]); | 1803 | hugetlb_unregister_node(node_devices[nid]); |
1804 | } | 1804 | } |
1805 | 1805 | ||
1806 | /* | 1806 | /* |
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void) | |||
1844 | { | 1844 | { |
1845 | int nid; | 1845 | int nid; |
1846 | 1846 | ||
1847 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1847 | for_each_node_state(nid, N_MEMORY) { |
1848 | struct node *node = &node_devices[nid]; | 1848 | struct node *node = node_devices[nid]; |
1849 | if (node->dev.id == nid) | 1849 | if (node->dev.id == nid) |
1850 | hugetlb_register_node(node); | 1850 | hugetlb_register_node(node); |
1851 | } | 1851 | } |
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) | |||
1906 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1906 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1907 | 1907 | ||
1908 | hugetlb_init_hstates(); | 1908 | hugetlb_init_hstates(); |
1909 | |||
1910 | gather_bootmem_prealloc(); | 1909 | gather_bootmem_prealloc(); |
1911 | |||
1912 | report_hugepages(); | 1910 | report_hugepages(); |
1913 | 1911 | ||
1914 | hugetlb_sysfs_init(); | 1912 | hugetlb_sysfs_init(); |
1915 | |||
1916 | hugetlb_register_all_nodes(); | 1913 | hugetlb_register_all_nodes(); |
1914 | hugetlb_cgroup_file_init(); | ||
1917 | 1915 | ||
1918 | return 0; | 1916 | return 0; |
1919 | } | 1917 | } |
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1939 | for (i = 0; i < MAX_NUMNODES; ++i) | 1937 | for (i = 0; i < MAX_NUMNODES; ++i) |
1940 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1938 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1941 | INIT_LIST_HEAD(&h->hugepage_activelist); | 1939 | INIT_LIST_HEAD(&h->hugepage_activelist); |
1942 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1940 | h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); |
1943 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1941 | h->next_nid_to_free = first_node(node_states[N_MEMORY]); |
1944 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1942 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1945 | huge_page_size(h)/1024); | 1943 | huge_page_size(h)/1024); |
1946 | /* | ||
1947 | * Add cgroup control files only if the huge page consists | ||
1948 | * of more than two normal pages. This is because we use | ||
1949 | * page[2].lru.next for storing cgoup details. | ||
1950 | */ | ||
1951 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1952 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1953 | 1944 | ||
1954 | parsed_hstate = h; | 1945 | parsed_hstate = h; |
1955 | } | 1946 | } |
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2035 | if (!(obey_mempolicy && | 2026 | if (!(obey_mempolicy && |
2036 | init_nodemask_of_mempolicy(nodes_allowed))) { | 2027 | init_nodemask_of_mempolicy(nodes_allowed))) { |
2037 | NODEMASK_FREE(nodes_allowed); | 2028 | NODEMASK_FREE(nodes_allowed); |
2038 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 2029 | nodes_allowed = &node_states[N_MEMORY]; |
2039 | } | 2030 | } |
2040 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | 2031 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); |
2041 | 2032 | ||
2042 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 2033 | if (nodes_allowed != &node_states[N_MEMORY]) |
2043 | NODEMASK_FREE(nodes_allowed); | 2034 | NODEMASK_FREE(nodes_allowed); |
2044 | } | 2035 | } |
2045 | out: | 2036 | out: |
@@ -2386,8 +2377,10 @@ again: | |||
2386 | /* | 2377 | /* |
2387 | * HWPoisoned hugepage is already unmapped and dropped reference | 2378 | * HWPoisoned hugepage is already unmapped and dropped reference |
2388 | */ | 2379 | */ |
2389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) | 2380 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { |
2381 | pte_clear(mm, address, ptep); | ||
2390 | continue; | 2382 | continue; |
2383 | } | ||
2391 | 2384 | ||
2392 | page = pte_page(pte); | 2385 | page = pte_page(pte); |
2393 | /* | 2386 | /* |
@@ -3014,7 +3007,7 @@ same_page: | |||
3014 | return i ? i : -EFAULT; | 3007 | return i ? i : -EFAULT; |
3015 | } | 3008 | } |
3016 | 3009 | ||
3017 | void hugetlb_change_protection(struct vm_area_struct *vma, | 3010 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
3018 | unsigned long address, unsigned long end, pgprot_t newprot) | 3011 | unsigned long address, unsigned long end, pgprot_t newprot) |
3019 | { | 3012 | { |
3020 | struct mm_struct *mm = vma->vm_mm; | 3013 | struct mm_struct *mm = vma->vm_mm; |
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3022 | pte_t *ptep; | 3015 | pte_t *ptep; |
3023 | pte_t pte; | 3016 | pte_t pte; |
3024 | struct hstate *h = hstate_vma(vma); | 3017 | struct hstate *h = hstate_vma(vma); |
3018 | unsigned long pages = 0; | ||
3025 | 3019 | ||
3026 | BUG_ON(address >= end); | 3020 | BUG_ON(address >= end); |
3027 | flush_cache_range(vma, address, end); | 3021 | flush_cache_range(vma, address, end); |
@@ -3032,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3032 | ptep = huge_pte_offset(mm, address); | 3026 | ptep = huge_pte_offset(mm, address); |
3033 | if (!ptep) | 3027 | if (!ptep) |
3034 | continue; | 3028 | continue; |
3035 | if (huge_pmd_unshare(mm, &address, ptep)) | 3029 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3030 | pages++; | ||
3036 | continue; | 3031 | continue; |
3032 | } | ||
3037 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3033 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
3038 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3034 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3039 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 3035 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
3040 | set_huge_pte_at(mm, address, ptep, pte); | 3036 | set_huge_pte_at(mm, address, ptep, pte); |
3037 | pages++; | ||
3041 | } | 3038 | } |
3042 | } | 3039 | } |
3043 | spin_unlock(&mm->page_table_lock); | 3040 | spin_unlock(&mm->page_table_lock); |
@@ -3049,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3049 | */ | 3046 | */ |
3050 | flush_tlb_range(vma, start, end); | 3047 | flush_tlb_range(vma, start, end); |
3051 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3048 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3049 | |||
3050 | return pages << h->order; | ||
3052 | } | 3051 | } |
3053 | 3052 | ||
3054 | int hugetlb_reserve_pages(struct inode *inode, | 3053 | int hugetlb_reserve_pages(struct inode *inode, |
@@ -3170,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
3170 | 3169 | ||
3171 | spin_lock(&hugetlb_lock); | 3170 | spin_lock(&hugetlb_lock); |
3172 | if (is_hugepage_on_freelist(hpage)) { | 3171 | if (is_hugepage_on_freelist(hpage)) { |
3173 | list_del(&hpage->lru); | 3172 | /* |
3173 | * Hwpoisoned hugepage isn't linked to activelist or freelist, | ||
3174 | * but dangling hpage->lru can trigger list-debug warnings | ||
3175 | * (this happens when we call unpoison_memory() on it), | ||
3176 | * so let it point to itself with list_del_init(). | ||
3177 | */ | ||
3178 | list_del_init(&hpage->lru); | ||
3174 | set_page_refcounted(hpage); | 3179 | set_page_refcounted(hpage); |
3175 | h->free_huge_pages--; | 3180 | h->free_huge_pages--; |
3176 | h->free_huge_pages_node[nid]--; | 3181 | h->free_huge_pages_node[nid]--; |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a3f358fb8a0c..9cea7de22ffb 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) | |||
77 | return false; | 77 | return false; |
78 | } | 78 | } |
79 | 79 | ||
80 | static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | 80 | static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) |
81 | { | 81 | { |
82 | int idx; | 82 | int idx; |
83 | struct cgroup *parent_cgroup; | 83 | struct cgroup *parent_cgroup; |
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | |||
101 | return &h_cgroup->css; | 101 | return &h_cgroup->css; |
102 | } | 102 | } |
103 | 103 | ||
104 | static void hugetlb_cgroup_destroy(struct cgroup *cgroup) | 104 | static void hugetlb_cgroup_css_free(struct cgroup *cgroup) |
105 | { | 105 | { |
106 | struct hugetlb_cgroup *h_cgroup; | 106 | struct hugetlb_cgroup *h_cgroup; |
107 | 107 | ||
@@ -155,18 +155,13 @@ out: | |||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | 155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to |
156 | * the parent cgroup. | 156 | * the parent cgroup. |
157 | */ | 157 | */ |
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | 158 | static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) |
159 | { | 159 | { |
160 | struct hstate *h; | 160 | struct hstate *h; |
161 | struct page *page; | 161 | struct page *page; |
162 | int ret = 0, idx = 0; | 162 | int idx = 0; |
163 | 163 | ||
164 | do { | 164 | do { |
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | 165 | for_each_hstate(h) { |
171 | spin_lock(&hugetlb_lock); | 166 | spin_lock(&hugetlb_lock); |
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | 167 | list_for_each_entry(page, &h->hugepage_activelist, lru) |
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | |||
177 | } | 172 | } |
178 | cond_resched(); | 173 | cond_resched(); |
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | 174 | } while (hugetlb_cgroup_have_usage(cgroup)); |
180 | out: | ||
181 | return ret; | ||
182 | } | 175 | } |
183 | 176 | ||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | 177 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, |
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) | |||
340 | return buf; | 333 | return buf; |
341 | } | 334 | } |
342 | 335 | ||
343 | int __init hugetlb_cgroup_file_init(int idx) | 336 | static void __init __hugetlb_cgroup_file_init(int idx) |
344 | { | 337 | { |
345 | char buf[32]; | 338 | char buf[32]; |
346 | struct cftype *cft; | 339 | struct cftype *cft; |
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) | |||
382 | 375 | ||
383 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | 376 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); |
384 | 377 | ||
385 | return 0; | 378 | return; |
379 | } | ||
380 | |||
381 | void __init hugetlb_cgroup_file_init(void) | ||
382 | { | ||
383 | struct hstate *h; | ||
384 | |||
385 | for_each_hstate(h) { | ||
386 | /* | ||
387 | * Add cgroup control files only if the huge page consists | ||
388 | * of more than two normal pages. This is because we use | ||
389 | * page[2].lru.next for storing cgroup details. | ||
390 | */ | ||
391 | if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) | ||
392 | __hugetlb_cgroup_file_init(hstate_index(h)); | ||
393 | } | ||
386 | } | 394 | } |
387 | 395 | ||
388 | /* | 396 | /* |
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | |||
411 | 419 | ||
412 | struct cgroup_subsys hugetlb_subsys = { | 420 | struct cgroup_subsys hugetlb_subsys = { |
413 | .name = "hugetlb", | 421 | .name = "hugetlb", |
414 | .create = hugetlb_cgroup_create, | 422 | .css_alloc = hugetlb_cgroup_css_alloc, |
415 | .pre_destroy = hugetlb_cgroup_pre_destroy, | 423 | .css_offline = hugetlb_cgroup_css_offline, |
416 | .destroy = hugetlb_cgroup_destroy, | 424 | .css_free = hugetlb_cgroup_css_free, |
417 | .subsys_id = hugetlb_subsys_id, | 425 | .subsys_id = hugetlb_subsys_id, |
418 | }; | 426 | }; |
diff --git a/mm/internal.h b/mm/internal.h index a4fa284f6bc2..9ba21100ebf3 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page); | |||
92 | extern void putback_lru_page(struct page *page); | 92 | extern void putback_lru_page(struct page *page); |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * in mm/rmap.c: | ||
96 | */ | ||
97 | extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); | ||
98 | |||
99 | /* | ||
95 | * in mm/page_alloc.c | 100 | * in mm/page_alloc.c |
96 | */ | 101 | */ |
97 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 102 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
@@ -130,7 +135,6 @@ struct compact_control { | |||
130 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 135 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
131 | struct zone *zone; | 136 | struct zone *zone; |
132 | bool contended; /* True if a lock was contended */ | 137 | bool contended; /* True if a lock was contended */ |
133 | struct page **page; /* Page captured of requested size */ | ||
134 | }; | 138 | }; |
135 | 139 | ||
136 | unsigned long | 140 | unsigned long |
@@ -212,15 +216,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
212 | { | 216 | { |
213 | if (TestClearPageMlocked(page)) { | 217 | if (TestClearPageMlocked(page)) { |
214 | unsigned long flags; | 218 | unsigned long flags; |
219 | int nr_pages = hpage_nr_pages(page); | ||
215 | 220 | ||
216 | local_irq_save(flags); | 221 | local_irq_save(flags); |
217 | __dec_zone_page_state(page, NR_MLOCK); | 222 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
218 | SetPageMlocked(newpage); | 223 | SetPageMlocked(newpage); |
219 | __inc_zone_page_state(newpage, NR_MLOCK); | 224 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); |
220 | local_irq_restore(flags); | 225 | local_irq_restore(flags); |
221 | } | 226 | } |
222 | } | 227 | } |
223 | 228 | ||
229 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); | ||
230 | |||
224 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 231 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
225 | extern unsigned long vma_address(struct page *page, | 232 | extern unsigned long vma_address(struct page *page, |
226 | struct vm_area_struct *vma); | 233 | struct vm_area_struct *vma); |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a217cc544060..752a705c77c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) | |||
1556 | struct kmemleak_object *object; | 1556 | struct kmemleak_object *object; |
1557 | unsigned long addr; | 1557 | unsigned long addr; |
1558 | 1558 | ||
1559 | addr= simple_strtoul(str, NULL, 0); | 1559 | if (kstrtoul(str, 0, &addr)) |
1560 | return -EINVAL; | ||
1560 | object = find_and_get_object(addr, 0); | 1561 | object = find_and_get_object(addr, 0); |
1561 | if (!object) { | 1562 | if (!object) { |
1562 | pr_info("Unknown object at 0x%08lx\n", addr); | 1563 | pr_info("Unknown object at 0x%08lx\n", addr); |
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
778 | struct page *kpage, pte_t orig_pte) | 778 | struct page *kpage, pte_t orig_pte) |
779 | { | 779 | { |
780 | struct mm_struct *mm = vma->vm_mm; | 780 | struct mm_struct *mm = vma->vm_mm; |
781 | pgd_t *pgd; | ||
782 | pud_t *pud; | ||
783 | pmd_t *pmd; | 781 | pmd_t *pmd; |
784 | pte_t *ptep; | 782 | pte_t *ptep; |
785 | spinlock_t *ptl; | 783 | spinlock_t *ptl; |
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
792 | if (addr == -EFAULT) | 790 | if (addr == -EFAULT) |
793 | goto out; | 791 | goto out; |
794 | 792 | ||
795 | pgd = pgd_offset(mm, addr); | 793 | pmd = mm_find_pmd(mm, addr); |
796 | if (!pgd_present(*pgd)) | 794 | if (!pmd) |
797 | goto out; | 795 | goto out; |
798 | |||
799 | pud = pud_offset(pgd, addr); | ||
800 | if (!pud_present(*pud)) | ||
801 | goto out; | ||
802 | |||
803 | pmd = pmd_offset(pud, addr); | ||
804 | BUG_ON(pmd_trans_huge(*pmd)); | 796 | BUG_ON(pmd_trans_huge(*pmd)); |
805 | if (!pmd_present(*pmd)) | ||
806 | goto out; | ||
807 | 797 | ||
808 | mmun_start = addr; | 798 | mmun_start = addr; |
809 | mmun_end = addr + PAGE_SIZE; | 799 | mmun_end = addr + PAGE_SIZE; |
@@ -1634,7 +1624,7 @@ again: | |||
1634 | struct anon_vma_chain *vmac; | 1624 | struct anon_vma_chain *vmac; |
1635 | struct vm_area_struct *vma; | 1625 | struct vm_area_struct *vma; |
1636 | 1626 | ||
1637 | anon_vma_lock(anon_vma); | 1627 | anon_vma_lock_read(anon_vma); |
1638 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1639 | 0, ULONG_MAX) { | 1629 | 0, ULONG_MAX) { |
1640 | vma = vmac->vma; | 1630 | vma = vmac->vma; |
@@ -1658,7 +1648,7 @@ again: | |||
1658 | if (!search_new_forks || !mapcount) | 1648 | if (!search_new_forks || !mapcount) |
1659 | break; | 1649 | break; |
1660 | } | 1650 | } |
1661 | anon_vma_unlock(anon_vma); | 1651 | anon_vma_unlock_read(anon_vma); |
1662 | if (!mapcount) | 1652 | if (!mapcount) |
1663 | goto out; | 1653 | goto out; |
1664 | } | 1654 | } |
@@ -1688,7 +1678,7 @@ again: | |||
1688 | struct anon_vma_chain *vmac; | 1678 | struct anon_vma_chain *vmac; |
1689 | struct vm_area_struct *vma; | 1679 | struct vm_area_struct *vma; |
1690 | 1680 | ||
1691 | anon_vma_lock(anon_vma); | 1681 | anon_vma_lock_read(anon_vma); |
1692 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1693 | 0, ULONG_MAX) { | 1683 | 0, ULONG_MAX) { |
1694 | vma = vmac->vma; | 1684 | vma = vmac->vma; |
@@ -1707,11 +1697,11 @@ again: | |||
1707 | ret = try_to_unmap_one(page, vma, | 1697 | ret = try_to_unmap_one(page, vma, |
1708 | rmap_item->address, flags); | 1698 | rmap_item->address, flags); |
1709 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | 1699 | if (ret != SWAP_AGAIN || !page_mapped(page)) { |
1710 | anon_vma_unlock(anon_vma); | 1700 | anon_vma_unlock_read(anon_vma); |
1711 | goto out; | 1701 | goto out; |
1712 | } | 1702 | } |
1713 | } | 1703 | } |
1714 | anon_vma_unlock(anon_vma); | 1704 | anon_vma_unlock_read(anon_vma); |
1715 | } | 1705 | } |
1716 | if (!search_new_forks++) | 1706 | if (!search_new_forks++) |
1717 | goto again; | 1707 | goto again; |
@@ -1741,7 +1731,7 @@ again: | |||
1741 | struct anon_vma_chain *vmac; | 1731 | struct anon_vma_chain *vmac; |
1742 | struct vm_area_struct *vma; | 1732 | struct vm_area_struct *vma; |
1743 | 1733 | ||
1744 | anon_vma_lock(anon_vma); | 1734 | anon_vma_lock_read(anon_vma); |
1745 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1746 | 0, ULONG_MAX) { | 1736 | 0, ULONG_MAX) { |
1747 | vma = vmac->vma; | 1737 | vma = vmac->vma; |
@@ -1759,11 +1749,11 @@ again: | |||
1759 | 1749 | ||
1760 | ret = rmap_one(page, vma, rmap_item->address, arg); | 1750 | ret = rmap_one(page, vma, rmap_item->address, arg); |
1761 | if (ret != SWAP_AGAIN) { | 1751 | if (ret != SWAP_AGAIN) { |
1762 | anon_vma_unlock(anon_vma); | 1752 | anon_vma_unlock_read(anon_vma); |
1763 | goto out; | 1753 | goto out; |
1764 | } | 1754 | } |
1765 | } | 1755 | } |
1766 | anon_vma_unlock(anon_vma); | 1756 | anon_vma_unlock_read(anon_vma); |
1767 | } | 1757 | } |
1768 | if (!search_new_forks++) | 1758 | if (!search_new_forks++) |
1769 | goto again; | 1759 | goto again; |
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1929 | if (ksm_run != flags) { | 1919 | if (ksm_run != flags) { |
1930 | ksm_run = flags; | 1920 | ksm_run = flags; |
1931 | if (flags & KSM_RUN_UNMERGE) { | 1921 | if (flags & KSM_RUN_UNMERGE) { |
1932 | int oom_score_adj; | 1922 | set_current_oom_origin(); |
1933 | |||
1934 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | ||
1935 | err = unmerge_and_remove_all_rmap_items(); | 1923 | err = unmerge_and_remove_all_rmap_items(); |
1936 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, | 1924 | clear_current_oom_origin(); |
1937 | oom_score_adj); | ||
1938 | if (err) { | 1925 | if (err) { |
1939 | ksm_run = KSM_RUN_STOP; | 1926 | ksm_run = KSM_RUN_STOP; |
1940 | count = err; | 1927 | count = err; |
diff --git a/mm/memblock.c b/mm/memblock.c index 625905523c2a..88adc8afb610 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) | |||
314 | } | 314 | } |
315 | 315 | ||
316 | this->size += next->size; | 316 | this->size += next->size; |
317 | memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); | 317 | /* move forward from next + 1, index of which is i + 2 */ |
318 | memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); | ||
318 | type->cnt--; | 319 | type->cnt--; |
319 | } | 320 | } |
320 | } | 321 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b31..09255ec8159c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -10,6 +10,10 @@ | |||
10 | * Copyright (C) 2009 Nokia Corporation | 10 | * Copyright (C) 2009 Nokia Corporation |
11 | * Author: Kirill A. Shutemov | 11 | * Author: Kirill A. Shutemov |
12 | * | 12 | * |
13 | * Kernel Memory Controller | ||
14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. | ||
15 | * Authors: Glauber Costa and Suleiman Souhlal | ||
16 | * | ||
13 | * This program is free software; you can redistribute it and/or modify | 17 | * This program is free software; you can redistribute it and/or modify |
14 | * it under the terms of the GNU General Public License as published by | 18 | * it under the terms of the GNU General Public License as published by |
15 | * the Free Software Foundation; either version 2 of the License, or | 19 | * the Free Software Foundation; either version 2 of the License, or |
@@ -59,6 +63,8 @@ | |||
59 | #include <trace/events/vmscan.h> | 63 | #include <trace/events/vmscan.h> |
60 | 64 | ||
61 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 65 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
66 | EXPORT_SYMBOL(mem_cgroup_subsys); | ||
67 | |||
62 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 68 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
63 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 69 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
64 | 70 | ||
@@ -266,6 +272,10 @@ struct mem_cgroup { | |||
266 | }; | 272 | }; |
267 | 273 | ||
268 | /* | 274 | /* |
275 | * the counter to account for kernel memory usage. | ||
276 | */ | ||
277 | struct res_counter kmem; | ||
278 | /* | ||
269 | * Per cgroup active and inactive list, similar to the | 279 | * Per cgroup active and inactive list, similar to the |
270 | * per zone LRU lists. | 280 | * per zone LRU lists. |
271 | */ | 281 | */ |
@@ -280,6 +290,7 @@ struct mem_cgroup { | |||
280 | * Should the accounting and control be hierarchical, per subtree? | 290 | * Should the accounting and control be hierarchical, per subtree? |
281 | */ | 291 | */ |
282 | bool use_hierarchy; | 292 | bool use_hierarchy; |
293 | unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ | ||
283 | 294 | ||
284 | bool oom_lock; | 295 | bool oom_lock; |
285 | atomic_t under_oom; | 296 | atomic_t under_oom; |
@@ -330,8 +341,61 @@ struct mem_cgroup { | |||
330 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 341 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
331 | struct tcp_memcontrol tcp_mem; | 342 | struct tcp_memcontrol tcp_mem; |
332 | #endif | 343 | #endif |
344 | #if defined(CONFIG_MEMCG_KMEM) | ||
345 | /* analogous to slab_common's slab_caches list. per-memcg */ | ||
346 | struct list_head memcg_slab_caches; | ||
347 | /* Not a spinlock, we can take a lot of time walking the list */ | ||
348 | struct mutex slab_caches_mutex; | ||
349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | ||
350 | int kmemcg_id; | ||
351 | #endif | ||
333 | }; | 352 | }; |
334 | 353 | ||
354 | /* internal only representation about the status of kmem accounting. */ | ||
355 | enum { | ||
356 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | ||
357 | KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ | ||
358 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | ||
359 | }; | ||
360 | |||
361 | /* We account when limit is on, but only after call sites are patched */ | ||
362 | #define KMEM_ACCOUNTED_MASK \ | ||
363 | ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) | ||
364 | |||
365 | #ifdef CONFIG_MEMCG_KMEM | ||
366 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | ||
367 | { | ||
368 | set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
369 | } | ||
370 | |||
371 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | ||
372 | { | ||
373 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
374 | } | ||
375 | |||
376 | static void memcg_kmem_set_activated(struct mem_cgroup *memcg) | ||
377 | { | ||
378 | set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
379 | } | ||
380 | |||
381 | static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) | ||
382 | { | ||
383 | clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
384 | } | ||
385 | |||
386 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | ||
387 | { | ||
388 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) | ||
389 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); | ||
390 | } | ||
391 | |||
392 | static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | ||
393 | { | ||
394 | return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, | ||
395 | &memcg->kmem_account_flags); | ||
396 | } | ||
397 | #endif | ||
398 | |||
335 | /* Stuffs for move charges at task migration. */ | 399 | /* Stuffs for move charges at task migration. */ |
336 | /* | 400 | /* |
337 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a | 401 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a |
@@ -386,9 +450,13 @@ enum charge_type { | |||
386 | }; | 450 | }; |
387 | 451 | ||
388 | /* for encoding cft->private value on file */ | 452 | /* for encoding cft->private value on file */ |
389 | #define _MEM (0) | 453 | enum res_type { |
390 | #define _MEMSWAP (1) | 454 | _MEM, |
391 | #define _OOM_TYPE (2) | 455 | _MEMSWAP, |
456 | _OOM_TYPE, | ||
457 | _KMEM, | ||
458 | }; | ||
459 | |||
392 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) | 460 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
393 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) | 461 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) |
394 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 462 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
@@ -485,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) | |||
485 | } | 553 | } |
486 | #endif | 554 | #endif |
487 | 555 | ||
556 | #ifdef CONFIG_MEMCG_KMEM | ||
557 | /* | ||
558 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | ||
559 | * There are two main reasons for not using the css_id for this: | ||
560 | * 1) this works better in sparse environments, where we have a lot of memcgs, | ||
561 | * but only a few kmem-limited. Or also, if we have, for instance, 200 | ||
562 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a | ||
563 | * 200 entry array for that. | ||
564 | * | ||
565 | * 2) In order not to violate the cgroup API, we would like to do all memory | ||
566 | * allocation in ->create(). At that point, we haven't yet allocated the | ||
567 | * css_id. Having a separate index prevents us from messing with the cgroup | ||
568 | * core for this | ||
569 | * | ||
570 | * The current size of the caches array is stored in | ||
571 | * memcg_limited_groups_array_size. It will double each time we have to | ||
572 | * increase it. | ||
573 | */ | ||
574 | static DEFINE_IDA(kmem_limited_groups); | ||
575 | int memcg_limited_groups_array_size; | ||
576 | |||
577 | /* | ||
578 | * MIN_SIZE is different than 1, because we would like to avoid going through | ||
579 | * the alloc/free process all the time. In a small machine, 4 kmem-limited | ||
580 | * cgroups is a reasonable guess. In the future, it could be a parameter or | ||
581 | * tunable, but that is strictly not necessary. | ||
582 | * | ||
583 | * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get | ||
584 | * this constant directly from cgroup, but it is understandable that this is | ||
585 | * better kept as an internal representation in cgroup.c. In any case, the | ||
586 | * css_id space is not getting any smaller, and we don't have to necessarily | ||
587 | * increase ours as well if it increases. | ||
588 | */ | ||
589 | #define MEMCG_CACHES_MIN_SIZE 4 | ||
590 | #define MEMCG_CACHES_MAX_SIZE 65535 | ||
591 | |||
592 | /* | ||
593 | * A lot of the calls to the cache allocation functions are expected to be | ||
594 | * inlined by the compiler. Since the calls to memcg_kmem_get_cache are | ||
595 | * conditional to this static branch, we'll have to allow modules that does | ||
596 | * kmem_cache_alloc and the such to see this symbol as well | ||
597 | */ | ||
598 | struct static_key memcg_kmem_enabled_key; | ||
599 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | ||
600 | |||
601 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
602 | { | ||
603 | if (memcg_kmem_is_active(memcg)) { | ||
604 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
605 | ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); | ||
606 | } | ||
607 | /* | ||
608 | * This check can't live in kmem destruction function, | ||
609 | * since the charges will outlive the cgroup | ||
610 | */ | ||
611 | WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); | ||
612 | } | ||
613 | #else | ||
614 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
615 | { | ||
616 | } | ||
617 | #endif /* CONFIG_MEMCG_KMEM */ | ||
618 | |||
619 | static void disarm_static_keys(struct mem_cgroup *memcg) | ||
620 | { | ||
621 | disarm_sock_keys(memcg); | ||
622 | disarm_kmem_keys(memcg); | ||
623 | } | ||
624 | |||
488 | static void drain_all_stock_async(struct mem_cgroup *memcg); | 625 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
489 | 626 | ||
490 | static struct mem_cgroup_per_zone * | 627 | static struct mem_cgroup_per_zone * |
@@ -800,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | |||
800 | int nid; | 937 | int nid; |
801 | u64 total = 0; | 938 | u64 total = 0; |
802 | 939 | ||
803 | for_each_node_state(nid, N_HIGH_MEMORY) | 940 | for_each_node_state(nid, N_MEMORY) |
804 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); | 941 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); |
805 | return total; | 942 | return total; |
806 | } | 943 | } |
@@ -1015,13 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
1015 | iter != NULL; \ | 1152 | iter != NULL; \ |
1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1153 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1017 | 1154 | ||
1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1155 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1019 | { | 1156 | { |
1020 | struct mem_cgroup *memcg; | 1157 | struct mem_cgroup *memcg; |
1021 | 1158 | ||
1022 | if (!mm) | ||
1023 | return; | ||
1024 | |||
1025 | rcu_read_lock(); | 1159 | rcu_read_lock(); |
1026 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1160 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
1027 | if (unlikely(!memcg)) | 1161 | if (unlikely(!memcg)) |
@@ -1040,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | |||
1040 | out: | 1174 | out: |
1041 | rcu_read_unlock(); | 1175 | rcu_read_unlock(); |
1042 | } | 1176 | } |
1043 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); | 1177 | EXPORT_SYMBOL(__mem_cgroup_count_vm_event); |
1044 | 1178 | ||
1045 | /** | 1179 | /** |
1046 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 1180 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
@@ -1454,6 +1588,10 @@ done: | |||
1454 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1588 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, |
1455 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1589 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, |
1456 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1590 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); |
1591 | printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", | ||
1592 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | ||
1593 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | ||
1594 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | ||
1457 | } | 1595 | } |
1458 | 1596 | ||
1459 | /* | 1597 | /* |
@@ -1498,8 +1636,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1498 | return limit; | 1636 | return limit; |
1499 | } | 1637 | } |
1500 | 1638 | ||
1501 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | 1639 | static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
1502 | int order) | 1640 | int order) |
1503 | { | 1641 | { |
1504 | struct mem_cgroup *iter; | 1642 | struct mem_cgroup *iter; |
1505 | unsigned long chosen_points = 0; | 1643 | unsigned long chosen_points = 0; |
@@ -1644,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) | |||
1644 | return; | 1782 | return; |
1645 | 1783 | ||
1646 | /* make a nodemask where this memcg uses memory from */ | 1784 | /* make a nodemask where this memcg uses memory from */ |
1647 | memcg->scan_nodes = node_states[N_HIGH_MEMORY]; | 1785 | memcg->scan_nodes = node_states[N_MEMORY]; |
1648 | 1786 | ||
1649 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1787 | for_each_node_mask(nid, node_states[N_MEMORY]) { |
1650 | 1788 | ||
1651 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) | 1789 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) |
1652 | node_clear(nid, memcg->scan_nodes); | 1790 | node_clear(nid, memcg->scan_nodes); |
@@ -1717,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | |||
1717 | /* | 1855 | /* |
1718 | * Check rest of nodes. | 1856 | * Check rest of nodes. |
1719 | */ | 1857 | */ |
1720 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1858 | for_each_node_state(nid, N_MEMORY) { |
1721 | if (node_isset(nid, memcg->scan_nodes)) | 1859 | if (node_isset(nid, memcg->scan_nodes)) |
1722 | continue; | 1860 | continue; |
1723 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | 1861 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
@@ -2061,20 +2199,28 @@ struct memcg_stock_pcp { | |||
2061 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 2199 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
2062 | static DEFINE_MUTEX(percpu_charge_mutex); | 2200 | static DEFINE_MUTEX(percpu_charge_mutex); |
2063 | 2201 | ||
2064 | /* | 2202 | /** |
2065 | * Try to consume stocked charge on this cpu. If success, one page is consumed | 2203 | * consume_stock: Try to consume stocked charge on this cpu. |
2066 | * from local stock and true is returned. If the stock is 0 or charges from a | 2204 | * @memcg: memcg to consume from. |
2067 | * cgroup which is not current target, returns false. This stock will be | 2205 | * @nr_pages: how many pages to charge. |
2068 | * refilled. | 2206 | * |
2207 | * The charges will only happen if @memcg matches the current cpu's memcg | ||
2208 | * stock, and at least @nr_pages are available in that stock. Failure to | ||
2209 | * service an allocation will refill the stock. | ||
2210 | * | ||
2211 | * returns true if successful, false otherwise. | ||
2069 | */ | 2212 | */ |
2070 | static bool consume_stock(struct mem_cgroup *memcg) | 2213 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2071 | { | 2214 | { |
2072 | struct memcg_stock_pcp *stock; | 2215 | struct memcg_stock_pcp *stock; |
2073 | bool ret = true; | 2216 | bool ret = true; |
2074 | 2217 | ||
2218 | if (nr_pages > CHARGE_BATCH) | ||
2219 | return false; | ||
2220 | |||
2075 | stock = &get_cpu_var(memcg_stock); | 2221 | stock = &get_cpu_var(memcg_stock); |
2076 | if (memcg == stock->cached && stock->nr_pages) | 2222 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) |
2077 | stock->nr_pages--; | 2223 | stock->nr_pages -= nr_pages; |
2078 | else /* need to call res_counter_charge */ | 2224 | else /* need to call res_counter_charge */ |
2079 | ret = false; | 2225 | ret = false; |
2080 | put_cpu_var(memcg_stock); | 2226 | put_cpu_var(memcg_stock); |
@@ -2251,7 +2397,8 @@ enum { | |||
2251 | }; | 2397 | }; |
2252 | 2398 | ||
2253 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2399 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2254 | unsigned int nr_pages, bool oom_check) | 2400 | unsigned int nr_pages, unsigned int min_pages, |
2401 | bool oom_check) | ||
2255 | { | 2402 | { |
2256 | unsigned long csize = nr_pages * PAGE_SIZE; | 2403 | unsigned long csize = nr_pages * PAGE_SIZE; |
2257 | struct mem_cgroup *mem_over_limit; | 2404 | struct mem_cgroup *mem_over_limit; |
@@ -2274,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2274 | } else | 2421 | } else |
2275 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2422 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
2276 | /* | 2423 | /* |
2277 | * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch | ||
2278 | * of regular pages (CHARGE_BATCH), or a single regular page (1). | ||
2279 | * | ||
2280 | * Never reclaim on behalf of optional batching, retry with a | 2424 | * Never reclaim on behalf of optional batching, retry with a |
2281 | * single page instead. | 2425 | * single page instead. |
2282 | */ | 2426 | */ |
2283 | if (nr_pages == CHARGE_BATCH) | 2427 | if (nr_pages > min_pages) |
2284 | return CHARGE_RETRY; | 2428 | return CHARGE_RETRY; |
2285 | 2429 | ||
2286 | if (!(gfp_mask & __GFP_WAIT)) | 2430 | if (!(gfp_mask & __GFP_WAIT)) |
2287 | return CHARGE_WOULDBLOCK; | 2431 | return CHARGE_WOULDBLOCK; |
2288 | 2432 | ||
2433 | if (gfp_mask & __GFP_NORETRY) | ||
2434 | return CHARGE_NOMEM; | ||
2435 | |||
2289 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); | 2436 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); |
2290 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2437 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2291 | return CHARGE_RETRY; | 2438 | return CHARGE_RETRY; |
@@ -2298,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2298 | * unlikely to succeed so close to the limit, and we fall back | 2445 | * unlikely to succeed so close to the limit, and we fall back |
2299 | * to regular pages anyway in case of failure. | 2446 | * to regular pages anyway in case of failure. |
2300 | */ | 2447 | */ |
2301 | if (nr_pages == 1 && ret) | 2448 | if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) |
2302 | return CHARGE_RETRY; | 2449 | return CHARGE_RETRY; |
2303 | 2450 | ||
2304 | /* | 2451 | /* |
@@ -2370,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2370 | again: | 2517 | again: |
2371 | if (*ptr) { /* css should be a valid one */ | 2518 | if (*ptr) { /* css should be a valid one */ |
2372 | memcg = *ptr; | 2519 | memcg = *ptr; |
2373 | VM_BUG_ON(css_is_removed(&memcg->css)); | ||
2374 | if (mem_cgroup_is_root(memcg)) | 2520 | if (mem_cgroup_is_root(memcg)) |
2375 | goto done; | 2521 | goto done; |
2376 | if (nr_pages == 1 && consume_stock(memcg)) | 2522 | if (consume_stock(memcg, nr_pages)) |
2377 | goto done; | 2523 | goto done; |
2378 | css_get(&memcg->css); | 2524 | css_get(&memcg->css); |
2379 | } else { | 2525 | } else { |
@@ -2398,7 +2544,7 @@ again: | |||
2398 | rcu_read_unlock(); | 2544 | rcu_read_unlock(); |
2399 | goto done; | 2545 | goto done; |
2400 | } | 2546 | } |
2401 | if (nr_pages == 1 && consume_stock(memcg)) { | 2547 | if (consume_stock(memcg, nr_pages)) { |
2402 | /* | 2548 | /* |
2403 | * It seems dagerous to access memcg without css_get(). | 2549 | * It seems dagerous to access memcg without css_get(). |
2404 | * But considering how consume_stok works, it's not | 2550 | * But considering how consume_stok works, it's not |
@@ -2433,7 +2579,8 @@ again: | |||
2433 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2579 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2434 | } | 2580 | } |
2435 | 2581 | ||
2436 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); | 2582 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, |
2583 | oom_check); | ||
2437 | switch (ret) { | 2584 | switch (ret) { |
2438 | case CHARGE_OK: | 2585 | case CHARGE_OK: |
2439 | break; | 2586 | break; |
@@ -2510,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | |||
2510 | 2657 | ||
2511 | /* | 2658 | /* |
2512 | * A helper function to get mem_cgroup from ID. must be called under | 2659 | * A helper function to get mem_cgroup from ID. must be called under |
2513 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 2660 | * rcu_read_lock(). The caller is responsible for calling css_tryget if |
2514 | * it's concern. (dropping refcnt from swap can be called against removed | 2661 | * the mem_cgroup is used for charging. (dropping refcnt from swap can be |
2515 | * memcg.) | 2662 | * called against removed memcg.) |
2516 | */ | 2663 | */ |
2517 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | 2664 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) |
2518 | { | 2665 | { |
@@ -2626,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2626 | memcg_check_events(memcg, page); | 2773 | memcg_check_events(memcg, page); |
2627 | } | 2774 | } |
2628 | 2775 | ||
2776 | static DEFINE_MUTEX(set_limit_mutex); | ||
2777 | |||
2778 | #ifdef CONFIG_MEMCG_KMEM | ||
2779 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | ||
2780 | { | ||
2781 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | ||
2782 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | ||
2783 | } | ||
2784 | |||
2785 | /* | ||
2786 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | ||
2787 | * in the memcg_cache_params struct. | ||
2788 | */ | ||
2789 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | ||
2790 | { | ||
2791 | struct kmem_cache *cachep; | ||
2792 | |||
2793 | VM_BUG_ON(p->is_root_cache); | ||
2794 | cachep = p->root_cache; | ||
2795 | return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; | ||
2796 | } | ||
2797 | |||
2798 | #ifdef CONFIG_SLABINFO | ||
2799 | static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, | ||
2800 | struct seq_file *m) | ||
2801 | { | ||
2802 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
2803 | struct memcg_cache_params *params; | ||
2804 | |||
2805 | if (!memcg_can_account_kmem(memcg)) | ||
2806 | return -EIO; | ||
2807 | |||
2808 | print_slabinfo_header(m); | ||
2809 | |||
2810 | mutex_lock(&memcg->slab_caches_mutex); | ||
2811 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | ||
2812 | cache_show(memcg_params_to_cache(params), m); | ||
2813 | mutex_unlock(&memcg->slab_caches_mutex); | ||
2814 | |||
2815 | return 0; | ||
2816 | } | ||
2817 | #endif | ||
2818 | |||
2819 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | ||
2820 | { | ||
2821 | struct res_counter *fail_res; | ||
2822 | struct mem_cgroup *_memcg; | ||
2823 | int ret = 0; | ||
2824 | bool may_oom; | ||
2825 | |||
2826 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | ||
2827 | if (ret) | ||
2828 | return ret; | ||
2829 | |||
2830 | /* | ||
2831 | * Conditions under which we can wait for the oom_killer. Those are | ||
2832 | * the same conditions tested by the core page allocator | ||
2833 | */ | ||
2834 | may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); | ||
2835 | |||
2836 | _memcg = memcg; | ||
2837 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, | ||
2838 | &_memcg, may_oom); | ||
2839 | |||
2840 | if (ret == -EINTR) { | ||
2841 | /* | ||
2842 | * __mem_cgroup_try_charge() chosed to bypass to root due to | ||
2843 | * OOM kill or fatal signal. Since our only options are to | ||
2844 | * either fail the allocation or charge it to this cgroup, do | ||
2845 | * it as a temporary condition. But we can't fail. From a | ||
2846 | * kmem/slab perspective, the cache has already been selected, | ||
2847 | * by mem_cgroup_kmem_get_cache(), so it is too late to change | ||
2848 | * our minds. | ||
2849 | * | ||
2850 | * This condition will only trigger if the task entered | ||
2851 | * memcg_charge_kmem in a sane state, but was OOM-killed during | ||
2852 | * __mem_cgroup_try_charge() above. Tasks that were already | ||
2853 | * dying when the allocation triggers should have been already | ||
2854 | * directed to the root cgroup in memcontrol.h | ||
2855 | */ | ||
2856 | res_counter_charge_nofail(&memcg->res, size, &fail_res); | ||
2857 | if (do_swap_account) | ||
2858 | res_counter_charge_nofail(&memcg->memsw, size, | ||
2859 | &fail_res); | ||
2860 | ret = 0; | ||
2861 | } else if (ret) | ||
2862 | res_counter_uncharge(&memcg->kmem, size); | ||
2863 | |||
2864 | return ret; | ||
2865 | } | ||
2866 | |||
2867 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | ||
2868 | { | ||
2869 | res_counter_uncharge(&memcg->res, size); | ||
2870 | if (do_swap_account) | ||
2871 | res_counter_uncharge(&memcg->memsw, size); | ||
2872 | |||
2873 | /* Not down to 0 */ | ||
2874 | if (res_counter_uncharge(&memcg->kmem, size)) | ||
2875 | return; | ||
2876 | |||
2877 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
2878 | mem_cgroup_put(memcg); | ||
2879 | } | ||
2880 | |||
2881 | void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) | ||
2882 | { | ||
2883 | if (!memcg) | ||
2884 | return; | ||
2885 | |||
2886 | mutex_lock(&memcg->slab_caches_mutex); | ||
2887 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
2888 | mutex_unlock(&memcg->slab_caches_mutex); | ||
2889 | } | ||
2890 | |||
2891 | /* | ||
2892 | * helper for acessing a memcg's index. It will be used as an index in the | ||
2893 | * child cache array in kmem_cache, and also to derive its name. This function | ||
2894 | * will return -1 when this is not a kmem-limited memcg. | ||
2895 | */ | ||
2896 | int memcg_cache_id(struct mem_cgroup *memcg) | ||
2897 | { | ||
2898 | return memcg ? memcg->kmemcg_id : -1; | ||
2899 | } | ||
2900 | |||
2901 | /* | ||
2902 | * This ends up being protected by the set_limit mutex, during normal | ||
2903 | * operation, because that is its main call site. | ||
2904 | * | ||
2905 | * But when we create a new cache, we can call this as well if its parent | ||
2906 | * is kmem-limited. That will have to hold set_limit_mutex as well. | ||
2907 | */ | ||
2908 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | ||
2909 | { | ||
2910 | int num, ret; | ||
2911 | |||
2912 | num = ida_simple_get(&kmem_limited_groups, | ||
2913 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
2914 | if (num < 0) | ||
2915 | return num; | ||
2916 | /* | ||
2917 | * After this point, kmem_accounted (that we test atomically in | ||
2918 | * the beginning of this conditional), is no longer 0. This | ||
2919 | * guarantees only one process will set the following boolean | ||
2920 | * to true. We don't need test_and_set because we're protected | ||
2921 | * by the set_limit_mutex anyway. | ||
2922 | */ | ||
2923 | memcg_kmem_set_activated(memcg); | ||
2924 | |||
2925 | ret = memcg_update_all_caches(num+1); | ||
2926 | if (ret) { | ||
2927 | ida_simple_remove(&kmem_limited_groups, num); | ||
2928 | memcg_kmem_clear_activated(memcg); | ||
2929 | return ret; | ||
2930 | } | ||
2931 | |||
2932 | memcg->kmemcg_id = num; | ||
2933 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
2934 | mutex_init(&memcg->slab_caches_mutex); | ||
2935 | return 0; | ||
2936 | } | ||
2937 | |||
2938 | static size_t memcg_caches_array_size(int num_groups) | ||
2939 | { | ||
2940 | ssize_t size; | ||
2941 | if (num_groups <= 0) | ||
2942 | return 0; | ||
2943 | |||
2944 | size = 2 * num_groups; | ||
2945 | if (size < MEMCG_CACHES_MIN_SIZE) | ||
2946 | size = MEMCG_CACHES_MIN_SIZE; | ||
2947 | else if (size > MEMCG_CACHES_MAX_SIZE) | ||
2948 | size = MEMCG_CACHES_MAX_SIZE; | ||
2949 | |||
2950 | return size; | ||
2951 | } | ||
2952 | |||
2953 | /* | ||
2954 | * We should update the current array size iff all caches updates succeed. This | ||
2955 | * can only be done from the slab side. The slab mutex needs to be held when | ||
2956 | * calling this. | ||
2957 | */ | ||
2958 | void memcg_update_array_size(int num) | ||
2959 | { | ||
2960 | if (num > memcg_limited_groups_array_size) | ||
2961 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | ||
2962 | } | ||
2963 | |||
2964 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | ||
2965 | { | ||
2966 | struct memcg_cache_params *cur_params = s->memcg_params; | ||
2967 | |||
2968 | VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); | ||
2969 | |||
2970 | if (num_groups > memcg_limited_groups_array_size) { | ||
2971 | int i; | ||
2972 | ssize_t size = memcg_caches_array_size(num_groups); | ||
2973 | |||
2974 | size *= sizeof(void *); | ||
2975 | size += sizeof(struct memcg_cache_params); | ||
2976 | |||
2977 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
2978 | if (!s->memcg_params) { | ||
2979 | s->memcg_params = cur_params; | ||
2980 | return -ENOMEM; | ||
2981 | } | ||
2982 | |||
2983 | s->memcg_params->is_root_cache = true; | ||
2984 | |||
2985 | /* | ||
2986 | * There is the chance it will be bigger than | ||
2987 | * memcg_limited_groups_array_size, if we failed an allocation | ||
2988 | * in a cache, in which case all caches updated before it, will | ||
2989 | * have a bigger array. | ||
2990 | * | ||
2991 | * But if that is the case, the data after | ||
2992 | * memcg_limited_groups_array_size is certainly unused | ||
2993 | */ | ||
2994 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
2995 | if (!cur_params->memcg_caches[i]) | ||
2996 | continue; | ||
2997 | s->memcg_params->memcg_caches[i] = | ||
2998 | cur_params->memcg_caches[i]; | ||
2999 | } | ||
3000 | |||
3001 | /* | ||
3002 | * Ideally, we would wait until all caches succeed, and only | ||
3003 | * then free the old one. But this is not worth the extra | ||
3004 | * pointer per-cache we'd have to have for this. | ||
3005 | * | ||
3006 | * It is not a big deal if some caches are left with a size | ||
3007 | * bigger than the others. And all updates will reset this | ||
3008 | * anyway. | ||
3009 | */ | ||
3010 | kfree(cur_params); | ||
3011 | } | ||
3012 | return 0; | ||
3013 | } | ||
3014 | |||
3015 | int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | ||
3016 | struct kmem_cache *root_cache) | ||
3017 | { | ||
3018 | size_t size = sizeof(struct memcg_cache_params); | ||
3019 | |||
3020 | if (!memcg_kmem_enabled()) | ||
3021 | return 0; | ||
3022 | |||
3023 | if (!memcg) | ||
3024 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
3025 | |||
3026 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
3027 | if (!s->memcg_params) | ||
3028 | return -ENOMEM; | ||
3029 | |||
3030 | if (memcg) { | ||
3031 | s->memcg_params->memcg = memcg; | ||
3032 | s->memcg_params->root_cache = root_cache; | ||
3033 | } | ||
3034 | return 0; | ||
3035 | } | ||
3036 | |||
3037 | void memcg_release_cache(struct kmem_cache *s) | ||
3038 | { | ||
3039 | struct kmem_cache *root; | ||
3040 | struct mem_cgroup *memcg; | ||
3041 | int id; | ||
3042 | |||
3043 | /* | ||
3044 | * This happens, for instance, when a root cache goes away before we | ||
3045 | * add any memcg. | ||
3046 | */ | ||
3047 | if (!s->memcg_params) | ||
3048 | return; | ||
3049 | |||
3050 | if (s->memcg_params->is_root_cache) | ||
3051 | goto out; | ||
3052 | |||
3053 | memcg = s->memcg_params->memcg; | ||
3054 | id = memcg_cache_id(memcg); | ||
3055 | |||
3056 | root = s->memcg_params->root_cache; | ||
3057 | root->memcg_params->memcg_caches[id] = NULL; | ||
3058 | mem_cgroup_put(memcg); | ||
3059 | |||
3060 | mutex_lock(&memcg->slab_caches_mutex); | ||
3061 | list_del(&s->memcg_params->list); | ||
3062 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3063 | |||
3064 | out: | ||
3065 | kfree(s->memcg_params); | ||
3066 | } | ||
3067 | |||
3068 | /* | ||
3069 | * During the creation a new cache, we need to disable our accounting mechanism | ||
3070 | * altogether. This is true even if we are not creating, but rather just | ||
3071 | * enqueing new caches to be created. | ||
3072 | * | ||
3073 | * This is because that process will trigger allocations; some visible, like | ||
3074 | * explicit kmallocs to auxiliary data structures, name strings and internal | ||
3075 | * cache structures; some well concealed, like INIT_WORK() that can allocate | ||
3076 | * objects during debug. | ||
3077 | * | ||
3078 | * If any allocation happens during memcg_kmem_get_cache, we will recurse back | ||
3079 | * to it. This may not be a bounded recursion: since the first cache creation | ||
3080 | * failed to complete (waiting on the allocation), we'll just try to create the | ||
3081 | * cache again, failing at the same point. | ||
3082 | * | ||
3083 | * memcg_kmem_get_cache is prepared to abort after seeing a positive count of | ||
3084 | * memcg_kmem_skip_account. So we enclose anything that might allocate memory | ||
3085 | * inside the following two functions. | ||
3086 | */ | ||
3087 | static inline void memcg_stop_kmem_account(void) | ||
3088 | { | ||
3089 | VM_BUG_ON(!current->mm); | ||
3090 | current->memcg_kmem_skip_account++; | ||
3091 | } | ||
3092 | |||
3093 | static inline void memcg_resume_kmem_account(void) | ||
3094 | { | ||
3095 | VM_BUG_ON(!current->mm); | ||
3096 | current->memcg_kmem_skip_account--; | ||
3097 | } | ||
3098 | |||
3099 | static void kmem_cache_destroy_work_func(struct work_struct *w) | ||
3100 | { | ||
3101 | struct kmem_cache *cachep; | ||
3102 | struct memcg_cache_params *p; | ||
3103 | |||
3104 | p = container_of(w, struct memcg_cache_params, destroy); | ||
3105 | |||
3106 | cachep = memcg_params_to_cache(p); | ||
3107 | |||
3108 | /* | ||
3109 | * If we get down to 0 after shrink, we could delete right away. | ||
3110 | * However, memcg_release_pages() already puts us back in the workqueue | ||
3111 | * in that case. If we proceed deleting, we'll get a dangling | ||
3112 | * reference, and removing the object from the workqueue in that case | ||
3113 | * is unnecessary complication. We are not a fast path. | ||
3114 | * | ||
3115 | * Note that this case is fundamentally different from racing with | ||
3116 | * shrink_slab(): if memcg_cgroup_destroy_cache() is called in | ||
3117 | * kmem_cache_shrink, not only we would be reinserting a dead cache | ||
3118 | * into the queue, but doing so from inside the worker racing to | ||
3119 | * destroy it. | ||
3120 | * | ||
3121 | * So if we aren't down to zero, we'll just schedule a worker and try | ||
3122 | * again | ||
3123 | */ | ||
3124 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { | ||
3125 | kmem_cache_shrink(cachep); | ||
3126 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | ||
3127 | return; | ||
3128 | } else | ||
3129 | kmem_cache_destroy(cachep); | ||
3130 | } | ||
3131 | |||
3132 | void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | ||
3133 | { | ||
3134 | if (!cachep->memcg_params->dead) | ||
3135 | return; | ||
3136 | |||
3137 | /* | ||
3138 | * There are many ways in which we can get here. | ||
3139 | * | ||
3140 | * We can get to a memory-pressure situation while the delayed work is | ||
3141 | * still pending to run. The vmscan shrinkers can then release all | ||
3142 | * cache memory and get us to destruction. If this is the case, we'll | ||
3143 | * be executed twice, which is a bug (the second time will execute over | ||
3144 | * bogus data). In this case, cancelling the work should be fine. | ||
3145 | * | ||
3146 | * But we can also get here from the worker itself, if | ||
3147 | * kmem_cache_shrink is enough to shake all the remaining objects and | ||
3148 | * get the page count to 0. In this case, we'll deadlock if we try to | ||
3149 | * cancel the work (the worker runs with an internal lock held, which | ||
3150 | * is the same lock we would hold for cancel_work_sync().) | ||
3151 | * | ||
3152 | * Since we can't possibly know who got us here, just refrain from | ||
3153 | * running if there is already work pending | ||
3154 | */ | ||
3155 | if (work_pending(&cachep->memcg_params->destroy)) | ||
3156 | return; | ||
3157 | /* | ||
3158 | * We have to defer the actual destroying to a workqueue, because | ||
3159 | * we might currently be in a context that cannot sleep. | ||
3160 | */ | ||
3161 | schedule_work(&cachep->memcg_params->destroy); | ||
3162 | } | ||
3163 | |||
3164 | static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) | ||
3165 | { | ||
3166 | char *name; | ||
3167 | struct dentry *dentry; | ||
3168 | |||
3169 | rcu_read_lock(); | ||
3170 | dentry = rcu_dereference(memcg->css.cgroup->dentry); | ||
3171 | rcu_read_unlock(); | ||
3172 | |||
3173 | BUG_ON(dentry == NULL); | ||
3174 | |||
3175 | name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, | ||
3176 | memcg_cache_id(memcg), dentry->d_name.name); | ||
3177 | |||
3178 | return name; | ||
3179 | } | ||
3180 | |||
3181 | static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | ||
3182 | struct kmem_cache *s) | ||
3183 | { | ||
3184 | char *name; | ||
3185 | struct kmem_cache *new; | ||
3186 | |||
3187 | name = memcg_cache_name(memcg, s); | ||
3188 | if (!name) | ||
3189 | return NULL; | ||
3190 | |||
3191 | new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, | ||
3192 | (s->flags & ~SLAB_PANIC), s->ctor, s); | ||
3193 | |||
3194 | if (new) | ||
3195 | new->allocflags |= __GFP_KMEMCG; | ||
3196 | |||
3197 | kfree(name); | ||
3198 | return new; | ||
3199 | } | ||
3200 | |||
3201 | /* | ||
3202 | * This lock protects updaters, not readers. We want readers to be as fast as | ||
3203 | * they can, and they will either see NULL or a valid cache value. Our model | ||
3204 | * allow them to see NULL, in which case the root memcg will be selected. | ||
3205 | * | ||
3206 | * We need this lock because multiple allocations to the same cache from a non | ||
3207 | * will span more than one worker. Only one of them can create the cache. | ||
3208 | */ | ||
3209 | static DEFINE_MUTEX(memcg_cache_mutex); | ||
3210 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | ||
3211 | struct kmem_cache *cachep) | ||
3212 | { | ||
3213 | struct kmem_cache *new_cachep; | ||
3214 | int idx; | ||
3215 | |||
3216 | BUG_ON(!memcg_can_account_kmem(memcg)); | ||
3217 | |||
3218 | idx = memcg_cache_id(memcg); | ||
3219 | |||
3220 | mutex_lock(&memcg_cache_mutex); | ||
3221 | new_cachep = cachep->memcg_params->memcg_caches[idx]; | ||
3222 | if (new_cachep) | ||
3223 | goto out; | ||
3224 | |||
3225 | new_cachep = kmem_cache_dup(memcg, cachep); | ||
3226 | if (new_cachep == NULL) { | ||
3227 | new_cachep = cachep; | ||
3228 | goto out; | ||
3229 | } | ||
3230 | |||
3231 | mem_cgroup_get(memcg); | ||
3232 | atomic_set(&new_cachep->memcg_params->nr_pages , 0); | ||
3233 | |||
3234 | cachep->memcg_params->memcg_caches[idx] = new_cachep; | ||
3235 | /* | ||
3236 | * the readers won't lock, make sure everybody sees the updated value, | ||
3237 | * so they won't put stuff in the queue again for no reason | ||
3238 | */ | ||
3239 | wmb(); | ||
3240 | out: | ||
3241 | mutex_unlock(&memcg_cache_mutex); | ||
3242 | return new_cachep; | ||
3243 | } | ||
3244 | |||
3245 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3246 | { | ||
3247 | struct kmem_cache *c; | ||
3248 | int i; | ||
3249 | |||
3250 | if (!s->memcg_params) | ||
3251 | return; | ||
3252 | if (!s->memcg_params->is_root_cache) | ||
3253 | return; | ||
3254 | |||
3255 | /* | ||
3256 | * If the cache is being destroyed, we trust that there is no one else | ||
3257 | * requesting objects from it. Even if there are, the sanity checks in | ||
3258 | * kmem_cache_destroy should caught this ill-case. | ||
3259 | * | ||
3260 | * Still, we don't want anyone else freeing memcg_caches under our | ||
3261 | * noses, which can happen if a new memcg comes to life. As usual, | ||
3262 | * we'll take the set_limit_mutex to protect ourselves against this. | ||
3263 | */ | ||
3264 | mutex_lock(&set_limit_mutex); | ||
3265 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
3266 | c = s->memcg_params->memcg_caches[i]; | ||
3267 | if (!c) | ||
3268 | continue; | ||
3269 | |||
3270 | /* | ||
3271 | * We will now manually delete the caches, so to avoid races | ||
3272 | * we need to cancel all pending destruction workers and | ||
3273 | * proceed with destruction ourselves. | ||
3274 | * | ||
3275 | * kmem_cache_destroy() will call kmem_cache_shrink internally, | ||
3276 | * and that could spawn the workers again: it is likely that | ||
3277 | * the cache still have active pages until this very moment. | ||
3278 | * This would lead us back to mem_cgroup_destroy_cache. | ||
3279 | * | ||
3280 | * But that will not execute at all if the "dead" flag is not | ||
3281 | * set, so flip it down to guarantee we are in control. | ||
3282 | */ | ||
3283 | c->memcg_params->dead = false; | ||
3284 | cancel_work_sync(&c->memcg_params->destroy); | ||
3285 | kmem_cache_destroy(c); | ||
3286 | } | ||
3287 | mutex_unlock(&set_limit_mutex); | ||
3288 | } | ||
3289 | |||
3290 | struct create_work { | ||
3291 | struct mem_cgroup *memcg; | ||
3292 | struct kmem_cache *cachep; | ||
3293 | struct work_struct work; | ||
3294 | }; | ||
3295 | |||
3296 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | ||
3297 | { | ||
3298 | struct kmem_cache *cachep; | ||
3299 | struct memcg_cache_params *params; | ||
3300 | |||
3301 | if (!memcg_kmem_is_active(memcg)) | ||
3302 | return; | ||
3303 | |||
3304 | mutex_lock(&memcg->slab_caches_mutex); | ||
3305 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) { | ||
3306 | cachep = memcg_params_to_cache(params); | ||
3307 | cachep->memcg_params->dead = true; | ||
3308 | INIT_WORK(&cachep->memcg_params->destroy, | ||
3309 | kmem_cache_destroy_work_func); | ||
3310 | schedule_work(&cachep->memcg_params->destroy); | ||
3311 | } | ||
3312 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3313 | } | ||
3314 | |||
3315 | static void memcg_create_cache_work_func(struct work_struct *w) | ||
3316 | { | ||
3317 | struct create_work *cw; | ||
3318 | |||
3319 | cw = container_of(w, struct create_work, work); | ||
3320 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | ||
3321 | /* Drop the reference gotten when we enqueued. */ | ||
3322 | css_put(&cw->memcg->css); | ||
3323 | kfree(cw); | ||
3324 | } | ||
3325 | |||
3326 | /* | ||
3327 | * Enqueue the creation of a per-memcg kmem_cache. | ||
3328 | * Called with rcu_read_lock. | ||
3329 | */ | ||
3330 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | ||
3331 | struct kmem_cache *cachep) | ||
3332 | { | ||
3333 | struct create_work *cw; | ||
3334 | |||
3335 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); | ||
3336 | if (cw == NULL) | ||
3337 | return; | ||
3338 | |||
3339 | /* The corresponding put will be done in the workqueue. */ | ||
3340 | if (!css_tryget(&memcg->css)) { | ||
3341 | kfree(cw); | ||
3342 | return; | ||
3343 | } | ||
3344 | |||
3345 | cw->memcg = memcg; | ||
3346 | cw->cachep = cachep; | ||
3347 | |||
3348 | INIT_WORK(&cw->work, memcg_create_cache_work_func); | ||
3349 | schedule_work(&cw->work); | ||
3350 | } | ||
3351 | |||
3352 | static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | ||
3353 | struct kmem_cache *cachep) | ||
3354 | { | ||
3355 | /* | ||
3356 | * We need to stop accounting when we kmalloc, because if the | ||
3357 | * corresponding kmalloc cache is not yet created, the first allocation | ||
3358 | * in __memcg_create_cache_enqueue will recurse. | ||
3359 | * | ||
3360 | * However, it is better to enclose the whole function. Depending on | ||
3361 | * the debugging options enabled, INIT_WORK(), for instance, can | ||
3362 | * trigger an allocation. This too, will make us recurse. Because at | ||
3363 | * this point we can't allow ourselves back into memcg_kmem_get_cache, | ||
3364 | * the safest choice is to do it like this, wrapping the whole function. | ||
3365 | */ | ||
3366 | memcg_stop_kmem_account(); | ||
3367 | __memcg_create_cache_enqueue(memcg, cachep); | ||
3368 | memcg_resume_kmem_account(); | ||
3369 | } | ||
3370 | /* | ||
3371 | * Return the kmem_cache we're supposed to use for a slab allocation. | ||
3372 | * We try to use the current memcg's version of the cache. | ||
3373 | * | ||
3374 | * If the cache does not exist yet, if we are the first user of it, | ||
3375 | * we either create it immediately, if possible, or create it asynchronously | ||
3376 | * in a workqueue. | ||
3377 | * In the latter case, we will let the current allocation go through with | ||
3378 | * the original cache. | ||
3379 | * | ||
3380 | * Can't be called in interrupt context or from kernel threads. | ||
3381 | * This function needs to be called with rcu_read_lock() held. | ||
3382 | */ | ||
3383 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | ||
3384 | gfp_t gfp) | ||
3385 | { | ||
3386 | struct mem_cgroup *memcg; | ||
3387 | int idx; | ||
3388 | |||
3389 | VM_BUG_ON(!cachep->memcg_params); | ||
3390 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | ||
3391 | |||
3392 | if (!current->mm || current->memcg_kmem_skip_account) | ||
3393 | return cachep; | ||
3394 | |||
3395 | rcu_read_lock(); | ||
3396 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | ||
3397 | rcu_read_unlock(); | ||
3398 | |||
3399 | if (!memcg_can_account_kmem(memcg)) | ||
3400 | return cachep; | ||
3401 | |||
3402 | idx = memcg_cache_id(memcg); | ||
3403 | |||
3404 | /* | ||
3405 | * barrier to mare sure we're always seeing the up to date value. The | ||
3406 | * code updating memcg_caches will issue a write barrier to match this. | ||
3407 | */ | ||
3408 | read_barrier_depends(); | ||
3409 | if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { | ||
3410 | /* | ||
3411 | * If we are in a safe context (can wait, and not in interrupt | ||
3412 | * context), we could be be predictable and return right away. | ||
3413 | * This would guarantee that the allocation being performed | ||
3414 | * already belongs in the new cache. | ||
3415 | * | ||
3416 | * However, there are some clashes that can arrive from locking. | ||
3417 | * For instance, because we acquire the slab_mutex while doing | ||
3418 | * kmem_cache_dup, this means no further allocation could happen | ||
3419 | * with the slab_mutex held. | ||
3420 | * | ||
3421 | * Also, because cache creation issue get_online_cpus(), this | ||
3422 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3423 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3424 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3425 | * better to defer everything. | ||
3426 | */ | ||
3427 | memcg_create_cache_enqueue(memcg, cachep); | ||
3428 | return cachep; | ||
3429 | } | ||
3430 | |||
3431 | return cachep->memcg_params->memcg_caches[idx]; | ||
3432 | } | ||
3433 | EXPORT_SYMBOL(__memcg_kmem_get_cache); | ||
3434 | |||
3435 | /* | ||
3436 | * We need to verify if the allocation against current->mm->owner's memcg is | ||
3437 | * possible for the given order. But the page is not allocated yet, so we'll | ||
3438 | * need a further commit step to do the final arrangements. | ||
3439 | * | ||
3440 | * It is possible for the task to switch cgroups in this mean time, so at | ||
3441 | * commit time, we can't rely on task conversion any longer. We'll then use | ||
3442 | * the handle argument to return to the caller which cgroup we should commit | ||
3443 | * against. We could also return the memcg directly and avoid the pointer | ||
3444 | * passing, but a boolean return value gives better semantics considering | ||
3445 | * the compiled-out case as well. | ||
3446 | * | ||
3447 | * Returning true means the allocation is possible. | ||
3448 | */ | ||
3449 | bool | ||
3450 | __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | ||
3451 | { | ||
3452 | struct mem_cgroup *memcg; | ||
3453 | int ret; | ||
3454 | |||
3455 | *_memcg = NULL; | ||
3456 | memcg = try_get_mem_cgroup_from_mm(current->mm); | ||
3457 | |||
3458 | /* | ||
3459 | * very rare case described in mem_cgroup_from_task. Unfortunately there | ||
3460 | * isn't much we can do without complicating this too much, and it would | ||
3461 | * be gfp-dependent anyway. Just let it go | ||
3462 | */ | ||
3463 | if (unlikely(!memcg)) | ||
3464 | return true; | ||
3465 | |||
3466 | if (!memcg_can_account_kmem(memcg)) { | ||
3467 | css_put(&memcg->css); | ||
3468 | return true; | ||
3469 | } | ||
3470 | |||
3471 | ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); | ||
3472 | if (!ret) | ||
3473 | *_memcg = memcg; | ||
3474 | |||
3475 | css_put(&memcg->css); | ||
3476 | return (ret == 0); | ||
3477 | } | ||
3478 | |||
3479 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | ||
3480 | int order) | ||
3481 | { | ||
3482 | struct page_cgroup *pc; | ||
3483 | |||
3484 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | ||
3485 | |||
3486 | /* The page allocation failed. Revert */ | ||
3487 | if (!page) { | ||
3488 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | ||
3489 | return; | ||
3490 | } | ||
3491 | |||
3492 | pc = lookup_page_cgroup(page); | ||
3493 | lock_page_cgroup(pc); | ||
3494 | pc->mem_cgroup = memcg; | ||
3495 | SetPageCgroupUsed(pc); | ||
3496 | unlock_page_cgroup(pc); | ||
3497 | } | ||
3498 | |||
3499 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | ||
3500 | { | ||
3501 | struct mem_cgroup *memcg = NULL; | ||
3502 | struct page_cgroup *pc; | ||
3503 | |||
3504 | |||
3505 | pc = lookup_page_cgroup(page); | ||
3506 | /* | ||
3507 | * Fast unlocked return. Theoretically might have changed, have to | ||
3508 | * check again after locking. | ||
3509 | */ | ||
3510 | if (!PageCgroupUsed(pc)) | ||
3511 | return; | ||
3512 | |||
3513 | lock_page_cgroup(pc); | ||
3514 | if (PageCgroupUsed(pc)) { | ||
3515 | memcg = pc->mem_cgroup; | ||
3516 | ClearPageCgroupUsed(pc); | ||
3517 | } | ||
3518 | unlock_page_cgroup(pc); | ||
3519 | |||
3520 | /* | ||
3521 | * We trust that only if there is a memcg associated with the page, it | ||
3522 | * is a valid allocation | ||
3523 | */ | ||
3524 | if (!memcg) | ||
3525 | return; | ||
3526 | |||
3527 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | ||
3528 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | ||
3529 | } | ||
3530 | #else | ||
3531 | static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | ||
3532 | { | ||
3533 | } | ||
3534 | #endif /* CONFIG_MEMCG_KMEM */ | ||
3535 | |||
2629 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 3536 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2630 | 3537 | ||
2631 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) | 3538 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) |
@@ -2709,13 +3616,6 @@ static int mem_cgroup_move_account(struct page *page, | |||
2709 | /* caller should have done css_get */ | 3616 | /* caller should have done css_get */ |
2710 | pc->mem_cgroup = to; | 3617 | pc->mem_cgroup = to; |
2711 | mem_cgroup_charge_statistics(to, anon, nr_pages); | 3618 | mem_cgroup_charge_statistics(to, anon, nr_pages); |
2712 | /* | ||
2713 | * We charges against "to" which may not have any tasks. Then, "to" | ||
2714 | * can be under rmdir(). But in current implementation, caller of | ||
2715 | * this function is just force_empty() and move charge, so it's | ||
2716 | * guaranteed that "to" is never removed. So, we don't check rmdir | ||
2717 | * status here. | ||
2718 | */ | ||
2719 | move_unlock_mem_cgroup(from, &flags); | 3619 | move_unlock_mem_cgroup(from, &flags); |
2720 | ret = 0; | 3620 | ret = 0; |
2721 | unlock: | 3621 | unlock: |
@@ -2729,10 +3629,27 @@ out: | |||
2729 | return ret; | 3629 | return ret; |
2730 | } | 3630 | } |
2731 | 3631 | ||
2732 | /* | 3632 | /** |
2733 | * move charges to its parent. | 3633 | * mem_cgroup_move_parent - moves page to the parent group |
3634 | * @page: the page to move | ||
3635 | * @pc: page_cgroup of the page | ||
3636 | * @child: page's cgroup | ||
3637 | * | ||
3638 | * move charges to its parent or the root cgroup if the group has no | ||
3639 | * parent (aka use_hierarchy==0). | ||
3640 | * Although this might fail (get_page_unless_zero, isolate_lru_page or | ||
3641 | * mem_cgroup_move_account fails) the failure is always temporary and | ||
3642 | * it signals a race with a page removal/uncharge or migration. In the | ||
3643 | * first case the page is on the way out and it will vanish from the LRU | ||
3644 | * on the next attempt and the call should be retried later. | ||
3645 | * Isolation from the LRU fails only if page has been isolated from | ||
3646 | * the LRU since we looked at it and that usually means either global | ||
3647 | * reclaim or migration going on. The page will either get back to the | ||
3648 | * LRU or vanish. | ||
3649 | * Finaly mem_cgroup_move_account fails only if the page got uncharged | ||
3650 | * (!PageCgroupUsed) or moved to a different group. The page will | ||
3651 | * disappear in the next attempt. | ||
2734 | */ | 3652 | */ |
2735 | |||
2736 | static int mem_cgroup_move_parent(struct page *page, | 3653 | static int mem_cgroup_move_parent(struct page *page, |
2737 | struct page_cgroup *pc, | 3654 | struct page_cgroup *pc, |
2738 | struct mem_cgroup *child) | 3655 | struct mem_cgroup *child) |
@@ -2742,9 +3659,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2742 | unsigned long uninitialized_var(flags); | 3659 | unsigned long uninitialized_var(flags); |
2743 | int ret; | 3660 | int ret; |
2744 | 3661 | ||
2745 | /* Is ROOT ? */ | 3662 | VM_BUG_ON(mem_cgroup_is_root(child)); |
2746 | if (mem_cgroup_is_root(child)) | ||
2747 | return -EINVAL; | ||
2748 | 3663 | ||
2749 | ret = -EBUSY; | 3664 | ret = -EBUSY; |
2750 | if (!get_page_unless_zero(page)) | 3665 | if (!get_page_unless_zero(page)) |
@@ -2761,8 +3676,10 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2761 | if (!parent) | 3676 | if (!parent) |
2762 | parent = root_mem_cgroup; | 3677 | parent = root_mem_cgroup; |
2763 | 3678 | ||
2764 | if (nr_pages > 1) | 3679 | if (nr_pages > 1) { |
3680 | VM_BUG_ON(!PageTransHuge(page)); | ||
2765 | flags = compound_lock_irqsave(page); | 3681 | flags = compound_lock_irqsave(page); |
3682 | } | ||
2766 | 3683 | ||
2767 | ret = mem_cgroup_move_account(page, nr_pages, | 3684 | ret = mem_cgroup_move_account(page, nr_pages, |
2768 | pc, child, parent); | 3685 | pc, child, parent); |
@@ -2904,7 +3821,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2904 | return; | 3821 | return; |
2905 | if (!memcg) | 3822 | if (!memcg) |
2906 | return; | 3823 | return; |
2907 | cgroup_exclude_rmdir(&memcg->css); | ||
2908 | 3824 | ||
2909 | __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); | 3825 | __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); |
2910 | /* | 3826 | /* |
@@ -2918,12 +3834,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2918 | swp_entry_t ent = {.val = page_private(page)}; | 3834 | swp_entry_t ent = {.val = page_private(page)}; |
2919 | mem_cgroup_uncharge_swap(ent); | 3835 | mem_cgroup_uncharge_swap(ent); |
2920 | } | 3836 | } |
2921 | /* | ||
2922 | * At swapin, we may charge account against cgroup which has no tasks. | ||
2923 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
2924 | * In that case, we need to call pre_destroy() again. check it here. | ||
2925 | */ | ||
2926 | cgroup_release_and_wakeup_rmdir(&memcg->css); | ||
2927 | } | 3837 | } |
2928 | 3838 | ||
2929 | void mem_cgroup_commit_charge_swapin(struct page *page, | 3839 | void mem_cgroup_commit_charge_swapin(struct page *page, |
@@ -3288,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3288 | struct mem_cgroup **memcgp) | 4198 | struct mem_cgroup **memcgp) |
3289 | { | 4199 | { |
3290 | struct mem_cgroup *memcg = NULL; | 4200 | struct mem_cgroup *memcg = NULL; |
4201 | unsigned int nr_pages = 1; | ||
3291 | struct page_cgroup *pc; | 4202 | struct page_cgroup *pc; |
3292 | enum charge_type ctype; | 4203 | enum charge_type ctype; |
3293 | 4204 | ||
3294 | *memcgp = NULL; | 4205 | *memcgp = NULL; |
3295 | 4206 | ||
3296 | VM_BUG_ON(PageTransHuge(page)); | ||
3297 | if (mem_cgroup_disabled()) | 4207 | if (mem_cgroup_disabled()) |
3298 | return; | 4208 | return; |
3299 | 4209 | ||
4210 | if (PageTransHuge(page)) | ||
4211 | nr_pages <<= compound_order(page); | ||
4212 | |||
3300 | pc = lookup_page_cgroup(page); | 4213 | pc = lookup_page_cgroup(page); |
3301 | lock_page_cgroup(pc); | 4214 | lock_page_cgroup(pc); |
3302 | if (PageCgroupUsed(pc)) { | 4215 | if (PageCgroupUsed(pc)) { |
@@ -3358,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3358 | * charged to the res_counter since we plan on replacing the | 4271 | * charged to the res_counter since we plan on replacing the |
3359 | * old one and only one page is going to be left afterwards. | 4272 | * old one and only one page is going to be left afterwards. |
3360 | */ | 4273 | */ |
3361 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 4274 | __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); |
3362 | } | 4275 | } |
3363 | 4276 | ||
3364 | /* remove redundant charge if migration failed*/ | 4277 | /* remove redundant charge if migration failed*/ |
@@ -3371,8 +4284,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3371 | 4284 | ||
3372 | if (!memcg) | 4285 | if (!memcg) |
3373 | return; | 4286 | return; |
3374 | /* blocks rmdir() */ | 4287 | |
3375 | cgroup_exclude_rmdir(&memcg->css); | ||
3376 | if (!migration_ok) { | 4288 | if (!migration_ok) { |
3377 | used = oldpage; | 4289 | used = oldpage; |
3378 | unused = newpage; | 4290 | unused = newpage; |
@@ -3406,13 +4318,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3406 | */ | 4318 | */ |
3407 | if (anon) | 4319 | if (anon) |
3408 | mem_cgroup_uncharge_page(used); | 4320 | mem_cgroup_uncharge_page(used); |
3409 | /* | ||
3410 | * At migration, we may charge account against cgroup which has no | ||
3411 | * tasks. | ||
3412 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
3413 | * In that case, we need to call pre_destroy() again. check it here. | ||
3414 | */ | ||
3415 | cgroup_release_and_wakeup_rmdir(&memcg->css); | ||
3416 | } | 4321 | } |
3417 | 4322 | ||
3418 | /* | 4323 | /* |
@@ -3490,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
3490 | } | 4395 | } |
3491 | #endif | 4396 | #endif |
3492 | 4397 | ||
3493 | static DEFINE_MUTEX(set_limit_mutex); | ||
3494 | |||
3495 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 4398 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
3496 | unsigned long long val) | 4399 | unsigned long long val) |
3497 | { | 4400 | { |
@@ -3712,17 +4615,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3712 | return nr_reclaimed; | 4615 | return nr_reclaimed; |
3713 | } | 4616 | } |
3714 | 4617 | ||
3715 | /* | 4618 | /** |
4619 | * mem_cgroup_force_empty_list - clears LRU of a group | ||
4620 | * @memcg: group to clear | ||
4621 | * @node: NUMA node | ||
4622 | * @zid: zone id | ||
4623 | * @lru: lru to to clear | ||
4624 | * | ||
3716 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't | 4625 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3717 | * reclaim the pages page themselves - it just removes the page_cgroups. | 4626 | * reclaim the pages page themselves - pages are moved to the parent (or root) |
3718 | * Returns true if some page_cgroups were not freed, indicating that the caller | 4627 | * group. |
3719 | * must retry this operation. | ||
3720 | */ | 4628 | */ |
3721 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 4629 | static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3722 | int node, int zid, enum lru_list lru) | 4630 | int node, int zid, enum lru_list lru) |
3723 | { | 4631 | { |
3724 | struct lruvec *lruvec; | 4632 | struct lruvec *lruvec; |
3725 | unsigned long flags, loop; | 4633 | unsigned long flags; |
3726 | struct list_head *list; | 4634 | struct list_head *list; |
3727 | struct page *busy; | 4635 | struct page *busy; |
3728 | struct zone *zone; | 4636 | struct zone *zone; |
@@ -3731,11 +4639,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3731 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 4639 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
3732 | list = &lruvec->lists[lru]; | 4640 | list = &lruvec->lists[lru]; |
3733 | 4641 | ||
3734 | loop = mem_cgroup_get_lru_size(lruvec, lru); | ||
3735 | /* give some margin against EBUSY etc...*/ | ||
3736 | loop += 256; | ||
3737 | busy = NULL; | 4642 | busy = NULL; |
3738 | while (loop--) { | 4643 | do { |
3739 | struct page_cgroup *pc; | 4644 | struct page_cgroup *pc; |
3740 | struct page *page; | 4645 | struct page *page; |
3741 | 4646 | ||
@@ -3761,76 +4666,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3761 | cond_resched(); | 4666 | cond_resched(); |
3762 | } else | 4667 | } else |
3763 | busy = NULL; | 4668 | busy = NULL; |
3764 | } | 4669 | } while (!list_empty(list)); |
3765 | return !list_empty(list); | ||
3766 | } | 4670 | } |
3767 | 4671 | ||
3768 | /* | 4672 | /* |
3769 | * make mem_cgroup's charge to be 0 if there is no task. | 4673 | * make mem_cgroup's charge to be 0 if there is no task by moving |
4674 | * all the charges and pages to the parent. | ||
3770 | * This enables deleting this mem_cgroup. | 4675 | * This enables deleting this mem_cgroup. |
4676 | * | ||
4677 | * Caller is responsible for holding css reference on the memcg. | ||
3771 | */ | 4678 | */ |
3772 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) | 4679 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) |
3773 | { | 4680 | { |
3774 | int ret; | 4681 | int node, zid; |
3775 | int node, zid, shrink; | 4682 | u64 usage; |
3776 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
3777 | struct cgroup *cgrp = memcg->css.cgroup; | ||
3778 | |||
3779 | css_get(&memcg->css); | ||
3780 | 4683 | ||
3781 | shrink = 0; | ||
3782 | /* should free all ? */ | ||
3783 | if (free_all) | ||
3784 | goto try_to_free; | ||
3785 | move_account: | ||
3786 | do { | 4684 | do { |
3787 | ret = -EBUSY; | ||
3788 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | ||
3789 | goto out; | ||
3790 | /* This is for making all *used* pages to be on LRU. */ | 4685 | /* This is for making all *used* pages to be on LRU. */ |
3791 | lru_add_drain_all(); | 4686 | lru_add_drain_all(); |
3792 | drain_all_stock_sync(memcg); | 4687 | drain_all_stock_sync(memcg); |
3793 | ret = 0; | ||
3794 | mem_cgroup_start_move(memcg); | 4688 | mem_cgroup_start_move(memcg); |
3795 | for_each_node_state(node, N_HIGH_MEMORY) { | 4689 | for_each_node_state(node, N_MEMORY) { |
3796 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 4690 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
3797 | enum lru_list lru; | 4691 | enum lru_list lru; |
3798 | for_each_lru(lru) { | 4692 | for_each_lru(lru) { |
3799 | ret = mem_cgroup_force_empty_list(memcg, | 4693 | mem_cgroup_force_empty_list(memcg, |
3800 | node, zid, lru); | 4694 | node, zid, lru); |
3801 | if (ret) | ||
3802 | break; | ||
3803 | } | 4695 | } |
3804 | } | 4696 | } |
3805 | if (ret) | ||
3806 | break; | ||
3807 | } | 4697 | } |
3808 | mem_cgroup_end_move(memcg); | 4698 | mem_cgroup_end_move(memcg); |
3809 | memcg_oom_recover(memcg); | 4699 | memcg_oom_recover(memcg); |
3810 | cond_resched(); | 4700 | cond_resched(); |
3811 | /* "ret" should also be checked to ensure all lists are empty. */ | ||
3812 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); | ||
3813 | out: | ||
3814 | css_put(&memcg->css); | ||
3815 | return ret; | ||
3816 | 4701 | ||
3817 | try_to_free: | 4702 | /* |
4703 | * Kernel memory may not necessarily be trackable to a specific | ||
4704 | * process. So they are not migrated, and therefore we can't | ||
4705 | * expect their value to drop to 0 here. | ||
4706 | * Having res filled up with kmem only is enough. | ||
4707 | * | ||
4708 | * This is a safety check because mem_cgroup_force_empty_list | ||
4709 | * could have raced with mem_cgroup_replace_page_cache callers | ||
4710 | * so the lru seemed empty but the page could have been added | ||
4711 | * right after the check. RES_USAGE should be safe as we always | ||
4712 | * charge before adding to the LRU. | ||
4713 | */ | ||
4714 | usage = res_counter_read_u64(&memcg->res, RES_USAGE) - | ||
4715 | res_counter_read_u64(&memcg->kmem, RES_USAGE); | ||
4716 | } while (usage > 0); | ||
4717 | } | ||
4718 | |||
4719 | /* | ||
4720 | * Reclaims as many pages from the given memcg as possible and moves | ||
4721 | * the rest to the parent. | ||
4722 | * | ||
4723 | * Caller is responsible for holding css reference for memcg. | ||
4724 | */ | ||
4725 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | ||
4726 | { | ||
4727 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
4728 | struct cgroup *cgrp = memcg->css.cgroup; | ||
4729 | |||
3818 | /* returns EBUSY if there is a task or if we come here twice. */ | 4730 | /* returns EBUSY if there is a task or if we come here twice. */ |
3819 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { | 4731 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3820 | ret = -EBUSY; | 4732 | return -EBUSY; |
3821 | goto out; | 4733 | |
3822 | } | ||
3823 | /* we call try-to-free pages for make this cgroup empty */ | 4734 | /* we call try-to-free pages for make this cgroup empty */ |
3824 | lru_add_drain_all(); | 4735 | lru_add_drain_all(); |
3825 | /* try to free all pages in this cgroup */ | 4736 | /* try to free all pages in this cgroup */ |
3826 | shrink = 1; | ||
3827 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { | 4737 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { |
3828 | int progress; | 4738 | int progress; |
3829 | 4739 | ||
3830 | if (signal_pending(current)) { | 4740 | if (signal_pending(current)) |
3831 | ret = -EINTR; | 4741 | return -EINTR; |
3832 | goto out; | 4742 | |
3833 | } | ||
3834 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, | 4743 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, |
3835 | false); | 4744 | false); |
3836 | if (!progress) { | 4745 | if (!progress) { |
@@ -3841,13 +4750,23 @@ try_to_free: | |||
3841 | 4750 | ||
3842 | } | 4751 | } |
3843 | lru_add_drain(); | 4752 | lru_add_drain(); |
3844 | /* try move_account...there may be some *locked* pages. */ | 4753 | mem_cgroup_reparent_charges(memcg); |
3845 | goto move_account; | 4754 | |
4755 | return 0; | ||
3846 | } | 4756 | } |
3847 | 4757 | ||
3848 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 4758 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
3849 | { | 4759 | { |
3850 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | 4760 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4761 | int ret; | ||
4762 | |||
4763 | if (mem_cgroup_is_root(memcg)) | ||
4764 | return -EINVAL; | ||
4765 | css_get(&memcg->css); | ||
4766 | ret = mem_cgroup_force_empty(memcg); | ||
4767 | css_put(&memcg->css); | ||
4768 | |||
4769 | return ret; | ||
3851 | } | 4770 | } |
3852 | 4771 | ||
3853 | 4772 | ||
@@ -3938,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3938 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4857 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3939 | char str[64]; | 4858 | char str[64]; |
3940 | u64 val; | 4859 | u64 val; |
3941 | int type, name, len; | 4860 | int name, len; |
4861 | enum res_type type; | ||
3942 | 4862 | ||
3943 | type = MEMFILE_TYPE(cft->private); | 4863 | type = MEMFILE_TYPE(cft->private); |
3944 | name = MEMFILE_ATTR(cft->private); | 4864 | name = MEMFILE_ATTR(cft->private); |
@@ -3959,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3959 | else | 4879 | else |
3960 | val = res_counter_read_u64(&memcg->memsw, name); | 4880 | val = res_counter_read_u64(&memcg->memsw, name); |
3961 | break; | 4881 | break; |
4882 | case _KMEM: | ||
4883 | val = res_counter_read_u64(&memcg->kmem, name); | ||
4884 | break; | ||
3962 | default: | 4885 | default: |
3963 | BUG(); | 4886 | BUG(); |
3964 | } | 4887 | } |
@@ -3966,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3966 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | 4889 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); |
3967 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | 4890 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); |
3968 | } | 4891 | } |
4892 | |||
4893 | static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | ||
4894 | { | ||
4895 | int ret = -EINVAL; | ||
4896 | #ifdef CONFIG_MEMCG_KMEM | ||
4897 | bool must_inc_static_branch = false; | ||
4898 | |||
4899 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
4900 | /* | ||
4901 | * For simplicity, we won't allow this to be disabled. It also can't | ||
4902 | * be changed if the cgroup has children already, or if tasks had | ||
4903 | * already joined. | ||
4904 | * | ||
4905 | * If tasks join before we set the limit, a person looking at | ||
4906 | * kmem.usage_in_bytes will have no way to determine when it took | ||
4907 | * place, which makes the value quite meaningless. | ||
4908 | * | ||
4909 | * After it first became limited, changes in the value of the limit are | ||
4910 | * of course permitted. | ||
4911 | * | ||
4912 | * Taking the cgroup_lock is really offensive, but it is so far the only | ||
4913 | * way to guarantee that no children will appear. There are plenty of | ||
4914 | * other offenders, and they should all go away. Fine grained locking | ||
4915 | * is probably the way to go here. When we are fully hierarchical, we | ||
4916 | * can also get rid of the use_hierarchy check. | ||
4917 | */ | ||
4918 | cgroup_lock(); | ||
4919 | mutex_lock(&set_limit_mutex); | ||
4920 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | ||
4921 | if (cgroup_task_count(cont) || (memcg->use_hierarchy && | ||
4922 | !list_empty(&cont->children))) { | ||
4923 | ret = -EBUSY; | ||
4924 | goto out; | ||
4925 | } | ||
4926 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
4927 | VM_BUG_ON(ret); | ||
4928 | |||
4929 | ret = memcg_update_cache_sizes(memcg); | ||
4930 | if (ret) { | ||
4931 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | ||
4932 | goto out; | ||
4933 | } | ||
4934 | must_inc_static_branch = true; | ||
4935 | /* | ||
4936 | * kmem charges can outlive the cgroup. In the case of slab | ||
4937 | * pages, for instance, a page contain objects from various | ||
4938 | * processes, so it is unfeasible to migrate them away. We | ||
4939 | * need to reference count the memcg because of that. | ||
4940 | */ | ||
4941 | mem_cgroup_get(memcg); | ||
4942 | } else | ||
4943 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
4944 | out: | ||
4945 | mutex_unlock(&set_limit_mutex); | ||
4946 | cgroup_unlock(); | ||
4947 | |||
4948 | /* | ||
4949 | * We are by now familiar with the fact that we can't inc the static | ||
4950 | * branch inside cgroup_lock. See disarm functions for details. A | ||
4951 | * worker here is overkill, but also wrong: After the limit is set, we | ||
4952 | * must start accounting right away. Since this operation can't fail, | ||
4953 | * we can safely defer it to here - no rollback will be needed. | ||
4954 | * | ||
4955 | * The boolean used to control this is also safe, because | ||
4956 | * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be | ||
4957 | * able to set it to true; | ||
4958 | */ | ||
4959 | if (must_inc_static_branch) { | ||
4960 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
4961 | /* | ||
4962 | * setting the active bit after the inc will guarantee no one | ||
4963 | * starts accounting before all call sites are patched | ||
4964 | */ | ||
4965 | memcg_kmem_set_active(memcg); | ||
4966 | } | ||
4967 | |||
4968 | #endif | ||
4969 | return ret; | ||
4970 | } | ||
4971 | |||
4972 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | ||
4973 | { | ||
4974 | int ret = 0; | ||
4975 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | ||
4976 | if (!parent) | ||
4977 | goto out; | ||
4978 | |||
4979 | memcg->kmem_account_flags = parent->kmem_account_flags; | ||
4980 | #ifdef CONFIG_MEMCG_KMEM | ||
4981 | /* | ||
4982 | * When that happen, we need to disable the static branch only on those | ||
4983 | * memcgs that enabled it. To achieve this, we would be forced to | ||
4984 | * complicate the code by keeping track of which memcgs were the ones | ||
4985 | * that actually enabled limits, and which ones got it from its | ||
4986 | * parents. | ||
4987 | * | ||
4988 | * It is a lot simpler just to do static_key_slow_inc() on every child | ||
4989 | * that is accounted. | ||
4990 | */ | ||
4991 | if (!memcg_kmem_is_active(memcg)) | ||
4992 | goto out; | ||
4993 | |||
4994 | /* | ||
4995 | * destroy(), called if we fail, will issue static_key_slow_inc() and | ||
4996 | * mem_cgroup_put() if kmem is enabled. We have to either call them | ||
4997 | * unconditionally, or clear the KMEM_ACTIVE flag. I personally find | ||
4998 | * this more consistent, since it always leads to the same destroy path | ||
4999 | */ | ||
5000 | mem_cgroup_get(memcg); | ||
5001 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
5002 | |||
5003 | mutex_lock(&set_limit_mutex); | ||
5004 | ret = memcg_update_cache_sizes(memcg); | ||
5005 | mutex_unlock(&set_limit_mutex); | ||
5006 | #endif | ||
5007 | out: | ||
5008 | return ret; | ||
5009 | } | ||
5010 | |||
3969 | /* | 5011 | /* |
3970 | * The user of this function is... | 5012 | * The user of this function is... |
3971 | * RES_LIMIT. | 5013 | * RES_LIMIT. |
@@ -3974,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
3974 | const char *buffer) | 5016 | const char *buffer) |
3975 | { | 5017 | { |
3976 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5018 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3977 | int type, name; | 5019 | enum res_type type; |
5020 | int name; | ||
3978 | unsigned long long val; | 5021 | unsigned long long val; |
3979 | int ret; | 5022 | int ret; |
3980 | 5023 | ||
@@ -3996,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
3996 | break; | 5039 | break; |
3997 | if (type == _MEM) | 5040 | if (type == _MEM) |
3998 | ret = mem_cgroup_resize_limit(memcg, val); | 5041 | ret = mem_cgroup_resize_limit(memcg, val); |
3999 | else | 5042 | else if (type == _MEMSWAP) |
4000 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 5043 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
5044 | else if (type == _KMEM) | ||
5045 | ret = memcg_update_kmem_limit(cont, val); | ||
5046 | else | ||
5047 | return -EINVAL; | ||
4001 | break; | 5048 | break; |
4002 | case RES_SOFT_LIMIT: | 5049 | case RES_SOFT_LIMIT: |
4003 | ret = res_counter_memparse_write_strategy(buffer, &val); | 5050 | ret = res_counter_memparse_write_strategy(buffer, &val); |
@@ -4050,7 +5097,8 @@ out: | |||
4050 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 5097 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
4051 | { | 5098 | { |
4052 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5099 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4053 | int type, name; | 5100 | int name; |
5101 | enum res_type type; | ||
4054 | 5102 | ||
4055 | type = MEMFILE_TYPE(event); | 5103 | type = MEMFILE_TYPE(event); |
4056 | name = MEMFILE_ATTR(event); | 5104 | name = MEMFILE_ATTR(event); |
@@ -4062,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
4062 | case RES_MAX_USAGE: | 5110 | case RES_MAX_USAGE: |
4063 | if (type == _MEM) | 5111 | if (type == _MEM) |
4064 | res_counter_reset_max(&memcg->res); | 5112 | res_counter_reset_max(&memcg->res); |
4065 | else | 5113 | else if (type == _MEMSWAP) |
4066 | res_counter_reset_max(&memcg->memsw); | 5114 | res_counter_reset_max(&memcg->memsw); |
5115 | else if (type == _KMEM) | ||
5116 | res_counter_reset_max(&memcg->kmem); | ||
5117 | else | ||
5118 | return -EINVAL; | ||
4067 | break; | 5119 | break; |
4068 | case RES_FAILCNT: | 5120 | case RES_FAILCNT: |
4069 | if (type == _MEM) | 5121 | if (type == _MEM) |
4070 | res_counter_reset_failcnt(&memcg->res); | 5122 | res_counter_reset_failcnt(&memcg->res); |
4071 | else | 5123 | else if (type == _MEMSWAP) |
4072 | res_counter_reset_failcnt(&memcg->memsw); | 5124 | res_counter_reset_failcnt(&memcg->memsw); |
5125 | else if (type == _KMEM) | ||
5126 | res_counter_reset_failcnt(&memcg->kmem); | ||
5127 | else | ||
5128 | return -EINVAL; | ||
4073 | break; | 5129 | break; |
4074 | } | 5130 | } |
4075 | 5131 | ||
@@ -4120,7 +5176,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4120 | 5176 | ||
4121 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 5177 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
4122 | seq_printf(m, "total=%lu", total_nr); | 5178 | seq_printf(m, "total=%lu", total_nr); |
4123 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5179 | for_each_node_state(nid, N_MEMORY) { |
4124 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); | 5180 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); |
4125 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5181 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4126 | } | 5182 | } |
@@ -4128,7 +5184,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4128 | 5184 | ||
4129 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); | 5185 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); |
4130 | seq_printf(m, "file=%lu", file_nr); | 5186 | seq_printf(m, "file=%lu", file_nr); |
4131 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5187 | for_each_node_state(nid, N_MEMORY) { |
4132 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5188 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4133 | LRU_ALL_FILE); | 5189 | LRU_ALL_FILE); |
4134 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5190 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4137,7 +5193,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4137 | 5193 | ||
4138 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); | 5194 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); |
4139 | seq_printf(m, "anon=%lu", anon_nr); | 5195 | seq_printf(m, "anon=%lu", anon_nr); |
4140 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5196 | for_each_node_state(nid, N_MEMORY) { |
4141 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5197 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4142 | LRU_ALL_ANON); | 5198 | LRU_ALL_ANON); |
4143 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5199 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4146,7 +5202,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4146 | 5202 | ||
4147 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | 5203 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4148 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 5204 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4149 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5205 | for_each_node_state(nid, N_MEMORY) { |
4150 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5206 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4151 | BIT(LRU_UNEVICTABLE)); | 5207 | BIT(LRU_UNEVICTABLE)); |
4152 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5208 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4386,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | |||
4386 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5442 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4387 | struct mem_cgroup_thresholds *thresholds; | 5443 | struct mem_cgroup_thresholds *thresholds; |
4388 | struct mem_cgroup_threshold_ary *new; | 5444 | struct mem_cgroup_threshold_ary *new; |
4389 | int type = MEMFILE_TYPE(cft->private); | 5445 | enum res_type type = MEMFILE_TYPE(cft->private); |
4390 | u64 threshold, usage; | 5446 | u64 threshold, usage; |
4391 | int i, size, ret; | 5447 | int i, size, ret; |
4392 | 5448 | ||
@@ -4469,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4469 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5525 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4470 | struct mem_cgroup_thresholds *thresholds; | 5526 | struct mem_cgroup_thresholds *thresholds; |
4471 | struct mem_cgroup_threshold_ary *new; | 5527 | struct mem_cgroup_threshold_ary *new; |
4472 | int type = MEMFILE_TYPE(cft->private); | 5528 | enum res_type type = MEMFILE_TYPE(cft->private); |
4473 | u64 usage; | 5529 | u64 usage; |
4474 | int i, j, size; | 5530 | int i, j, size; |
4475 | 5531 | ||
@@ -4547,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4547 | { | 5603 | { |
4548 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5604 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4549 | struct mem_cgroup_eventfd_list *event; | 5605 | struct mem_cgroup_eventfd_list *event; |
4550 | int type = MEMFILE_TYPE(cft->private); | 5606 | enum res_type type = MEMFILE_TYPE(cft->private); |
4551 | 5607 | ||
4552 | BUG_ON(type != _OOM_TYPE); | 5608 | BUG_ON(type != _OOM_TYPE); |
4553 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 5609 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
@@ -4572,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4572 | { | 5628 | { |
4573 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5629 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4574 | struct mem_cgroup_eventfd_list *ev, *tmp; | 5630 | struct mem_cgroup_eventfd_list *ev, *tmp; |
4575 | int type = MEMFILE_TYPE(cft->private); | 5631 | enum res_type type = MEMFILE_TYPE(cft->private); |
4576 | 5632 | ||
4577 | BUG_ON(type != _OOM_TYPE); | 5633 | BUG_ON(type != _OOM_TYPE); |
4578 | 5634 | ||
@@ -4631,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4631 | #ifdef CONFIG_MEMCG_KMEM | 5687 | #ifdef CONFIG_MEMCG_KMEM |
4632 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 5688 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4633 | { | 5689 | { |
5690 | int ret; | ||
5691 | |||
5692 | memcg->kmemcg_id = -1; | ||
5693 | ret = memcg_propagate_kmem(memcg); | ||
5694 | if (ret) | ||
5695 | return ret; | ||
5696 | |||
4634 | return mem_cgroup_sockets_init(memcg, ss); | 5697 | return mem_cgroup_sockets_init(memcg, ss); |
4635 | }; | 5698 | }; |
4636 | 5699 | ||
4637 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) | 5700 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
4638 | { | 5701 | { |
4639 | mem_cgroup_sockets_destroy(memcg); | 5702 | mem_cgroup_sockets_destroy(memcg); |
5703 | |||
5704 | memcg_kmem_mark_dead(memcg); | ||
5705 | |||
5706 | if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) | ||
5707 | return; | ||
5708 | |||
5709 | /* | ||
5710 | * Charges already down to 0, undo mem_cgroup_get() done in the charge | ||
5711 | * path here, being careful not to race with memcg_uncharge_kmem: it is | ||
5712 | * possible that the charges went down to 0 between mark_dead and the | ||
5713 | * res_counter read, so in that case, we don't need the put | ||
5714 | */ | ||
5715 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
5716 | mem_cgroup_put(memcg); | ||
4640 | } | 5717 | } |
4641 | #else | 5718 | #else |
4642 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 5719 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
@@ -4745,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = { | |||
4745 | .read = mem_cgroup_read, | 5822 | .read = mem_cgroup_read, |
4746 | }, | 5823 | }, |
4747 | #endif | 5824 | #endif |
5825 | #ifdef CONFIG_MEMCG_KMEM | ||
5826 | { | ||
5827 | .name = "kmem.limit_in_bytes", | ||
5828 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | ||
5829 | .write_string = mem_cgroup_write, | ||
5830 | .read = mem_cgroup_read, | ||
5831 | }, | ||
5832 | { | ||
5833 | .name = "kmem.usage_in_bytes", | ||
5834 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | ||
5835 | .read = mem_cgroup_read, | ||
5836 | }, | ||
5837 | { | ||
5838 | .name = "kmem.failcnt", | ||
5839 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | ||
5840 | .trigger = mem_cgroup_reset, | ||
5841 | .read = mem_cgroup_read, | ||
5842 | }, | ||
5843 | { | ||
5844 | .name = "kmem.max_usage_in_bytes", | ||
5845 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | ||
5846 | .trigger = mem_cgroup_reset, | ||
5847 | .read = mem_cgroup_read, | ||
5848 | }, | ||
5849 | #ifdef CONFIG_SLABINFO | ||
5850 | { | ||
5851 | .name = "kmem.slabinfo", | ||
5852 | .read_seq_string = mem_cgroup_slabinfo_read, | ||
5853 | }, | ||
5854 | #endif | ||
5855 | #endif | ||
4748 | { }, /* terminate */ | 5856 | { }, /* terminate */ |
4749 | }; | 5857 | }; |
4750 | 5858 | ||
@@ -4812,16 +5920,29 @@ out_free: | |||
4812 | } | 5920 | } |
4813 | 5921 | ||
4814 | /* | 5922 | /* |
4815 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, | 5923 | * At destroying mem_cgroup, references from swap_cgroup can remain. |
4816 | * but in process context. The work_freeing structure is overlaid | 5924 | * (scanning all at force_empty is too costly...) |
4817 | * on the rcu_freeing structure, which itself is overlaid on memsw. | 5925 | * |
5926 | * Instead of clearing all references at force_empty, we remember | ||
5927 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
5928 | * it goes down to 0. | ||
5929 | * | ||
5930 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
4818 | */ | 5931 | */ |
4819 | static void free_work(struct work_struct *work) | 5932 | |
5933 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | ||
4820 | { | 5934 | { |
4821 | struct mem_cgroup *memcg; | 5935 | int node; |
4822 | int size = sizeof(struct mem_cgroup); | 5936 | int size = sizeof(struct mem_cgroup); |
4823 | 5937 | ||
4824 | memcg = container_of(work, struct mem_cgroup, work_freeing); | 5938 | mem_cgroup_remove_from_trees(memcg); |
5939 | free_css_id(&mem_cgroup_subsys, &memcg->css); | ||
5940 | |||
5941 | for_each_node(node) | ||
5942 | free_mem_cgroup_per_zone_info(memcg, node); | ||
5943 | |||
5944 | free_percpu(memcg->stat); | ||
5945 | |||
4825 | /* | 5946 | /* |
4826 | * We need to make sure that (at least for now), the jump label | 5947 | * We need to make sure that (at least for now), the jump label |
4827 | * destruction code runs outside of the cgroup lock. This is because | 5948 | * destruction code runs outside of the cgroup lock. This is because |
@@ -4833,45 +5954,34 @@ static void free_work(struct work_struct *work) | |||
4833 | * to move this code around, and make sure it is outside | 5954 | * to move this code around, and make sure it is outside |
4834 | * the cgroup_lock. | 5955 | * the cgroup_lock. |
4835 | */ | 5956 | */ |
4836 | disarm_sock_keys(memcg); | 5957 | disarm_static_keys(memcg); |
4837 | if (size < PAGE_SIZE) | 5958 | if (size < PAGE_SIZE) |
4838 | kfree(memcg); | 5959 | kfree(memcg); |
4839 | else | 5960 | else |
4840 | vfree(memcg); | 5961 | vfree(memcg); |
4841 | } | 5962 | } |
4842 | 5963 | ||
4843 | static void free_rcu(struct rcu_head *rcu_head) | ||
4844 | { | ||
4845 | struct mem_cgroup *memcg; | ||
4846 | |||
4847 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); | ||
4848 | INIT_WORK(&memcg->work_freeing, free_work); | ||
4849 | schedule_work(&memcg->work_freeing); | ||
4850 | } | ||
4851 | 5964 | ||
4852 | /* | 5965 | /* |
4853 | * At destroying mem_cgroup, references from swap_cgroup can remain. | 5966 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, |
4854 | * (scanning all at force_empty is too costly...) | 5967 | * but in process context. The work_freeing structure is overlaid |
4855 | * | 5968 | * on the rcu_freeing structure, which itself is overlaid on memsw. |
4856 | * Instead of clearing all references at force_empty, we remember | ||
4857 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
4858 | * it goes down to 0. | ||
4859 | * | ||
4860 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
4861 | */ | 5969 | */ |
4862 | 5970 | static void free_work(struct work_struct *work) | |
4863 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | ||
4864 | { | 5971 | { |
4865 | int node; | 5972 | struct mem_cgroup *memcg; |
4866 | 5973 | ||
4867 | mem_cgroup_remove_from_trees(memcg); | 5974 | memcg = container_of(work, struct mem_cgroup, work_freeing); |
4868 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5975 | __mem_cgroup_free(memcg); |
5976 | } | ||
4869 | 5977 | ||
4870 | for_each_node(node) | 5978 | static void free_rcu(struct rcu_head *rcu_head) |
4871 | free_mem_cgroup_per_zone_info(memcg, node); | 5979 | { |
5980 | struct mem_cgroup *memcg; | ||
4872 | 5981 | ||
4873 | free_percpu(memcg->stat); | 5982 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); |
4874 | call_rcu(&memcg->rcu_freeing, free_rcu); | 5983 | INIT_WORK(&memcg->work_freeing, free_work); |
5984 | schedule_work(&memcg->work_freeing); | ||
4875 | } | 5985 | } |
4876 | 5986 | ||
4877 | static void mem_cgroup_get(struct mem_cgroup *memcg) | 5987 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
@@ -4883,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) | |||
4883 | { | 5993 | { |
4884 | if (atomic_sub_and_test(count, &memcg->refcnt)) { | 5994 | if (atomic_sub_and_test(count, &memcg->refcnt)) { |
4885 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | 5995 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
4886 | __mem_cgroup_free(memcg); | 5996 | call_rcu(&memcg->rcu_freeing, free_rcu); |
4887 | if (parent) | 5997 | if (parent) |
4888 | mem_cgroup_put(parent); | 5998 | mem_cgroup_put(parent); |
4889 | } | 5999 | } |
@@ -4953,7 +6063,7 @@ err_cleanup: | |||
4953 | } | 6063 | } |
4954 | 6064 | ||
4955 | static struct cgroup_subsys_state * __ref | 6065 | static struct cgroup_subsys_state * __ref |
4956 | mem_cgroup_create(struct cgroup *cont) | 6066 | mem_cgroup_css_alloc(struct cgroup *cont) |
4957 | { | 6067 | { |
4958 | struct mem_cgroup *memcg, *parent; | 6068 | struct mem_cgroup *memcg, *parent; |
4959 | long error = -ENOMEM; | 6069 | long error = -ENOMEM; |
@@ -4980,7 +6090,6 @@ mem_cgroup_create(struct cgroup *cont) | |||
4980 | &per_cpu(memcg_stock, cpu); | 6090 | &per_cpu(memcg_stock, cpu); |
4981 | INIT_WORK(&stock->work, drain_local_stock); | 6091 | INIT_WORK(&stock->work, drain_local_stock); |
4982 | } | 6092 | } |
4983 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
4984 | } else { | 6093 | } else { |
4985 | parent = mem_cgroup_from_cont(cont->parent); | 6094 | parent = mem_cgroup_from_cont(cont->parent); |
4986 | memcg->use_hierarchy = parent->use_hierarchy; | 6095 | memcg->use_hierarchy = parent->use_hierarchy; |
@@ -4990,6 +6099,8 @@ mem_cgroup_create(struct cgroup *cont) | |||
4990 | if (parent && parent->use_hierarchy) { | 6099 | if (parent && parent->use_hierarchy) { |
4991 | res_counter_init(&memcg->res, &parent->res); | 6100 | res_counter_init(&memcg->res, &parent->res); |
4992 | res_counter_init(&memcg->memsw, &parent->memsw); | 6101 | res_counter_init(&memcg->memsw, &parent->memsw); |
6102 | res_counter_init(&memcg->kmem, &parent->kmem); | ||
6103 | |||
4993 | /* | 6104 | /* |
4994 | * We increment refcnt of the parent to ensure that we can | 6105 | * We increment refcnt of the parent to ensure that we can |
4995 | * safely access it on res_counter_charge/uncharge. | 6106 | * safely access it on res_counter_charge/uncharge. |
@@ -5000,6 +6111,7 @@ mem_cgroup_create(struct cgroup *cont) | |||
5000 | } else { | 6111 | } else { |
5001 | res_counter_init(&memcg->res, NULL); | 6112 | res_counter_init(&memcg->res, NULL); |
5002 | res_counter_init(&memcg->memsw, NULL); | 6113 | res_counter_init(&memcg->memsw, NULL); |
6114 | res_counter_init(&memcg->kmem, NULL); | ||
5003 | /* | 6115 | /* |
5004 | * Deeper hierachy with use_hierarchy == false doesn't make | 6116 | * Deeper hierachy with use_hierarchy == false doesn't make |
5005 | * much sense so let cgroup subsystem know about this | 6117 | * much sense so let cgroup subsystem know about this |
@@ -5034,14 +6146,15 @@ free_out: | |||
5034 | return ERR_PTR(error); | 6146 | return ERR_PTR(error); |
5035 | } | 6147 | } |
5036 | 6148 | ||
5037 | static int mem_cgroup_pre_destroy(struct cgroup *cont) | 6149 | static void mem_cgroup_css_offline(struct cgroup *cont) |
5038 | { | 6150 | { |
5039 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6151 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5040 | 6152 | ||
5041 | return mem_cgroup_force_empty(memcg, false); | 6153 | mem_cgroup_reparent_charges(memcg); |
6154 | mem_cgroup_destroy_all_caches(memcg); | ||
5042 | } | 6155 | } |
5043 | 6156 | ||
5044 | static void mem_cgroup_destroy(struct cgroup *cont) | 6157 | static void mem_cgroup_css_free(struct cgroup *cont) |
5045 | { | 6158 | { |
5046 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6159 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5047 | 6160 | ||
@@ -5631,18 +6744,30 @@ static void mem_cgroup_move_task(struct cgroup *cont, | |||
5631 | struct cgroup_subsys mem_cgroup_subsys = { | 6744 | struct cgroup_subsys mem_cgroup_subsys = { |
5632 | .name = "memory", | 6745 | .name = "memory", |
5633 | .subsys_id = mem_cgroup_subsys_id, | 6746 | .subsys_id = mem_cgroup_subsys_id, |
5634 | .create = mem_cgroup_create, | 6747 | .css_alloc = mem_cgroup_css_alloc, |
5635 | .pre_destroy = mem_cgroup_pre_destroy, | 6748 | .css_offline = mem_cgroup_css_offline, |
5636 | .destroy = mem_cgroup_destroy, | 6749 | .css_free = mem_cgroup_css_free, |
5637 | .can_attach = mem_cgroup_can_attach, | 6750 | .can_attach = mem_cgroup_can_attach, |
5638 | .cancel_attach = mem_cgroup_cancel_attach, | 6751 | .cancel_attach = mem_cgroup_cancel_attach, |
5639 | .attach = mem_cgroup_move_task, | 6752 | .attach = mem_cgroup_move_task, |
5640 | .base_cftypes = mem_cgroup_files, | 6753 | .base_cftypes = mem_cgroup_files, |
5641 | .early_init = 0, | 6754 | .early_init = 0, |
5642 | .use_id = 1, | 6755 | .use_id = 1, |
5643 | .__DEPRECATED_clear_css_refs = true, | ||
5644 | }; | 6756 | }; |
5645 | 6757 | ||
6758 | /* | ||
6759 | * The rest of init is performed during ->css_alloc() for root css which | ||
6760 | * happens before initcalls. hotcpu_notifier() can't be done together as | ||
6761 | * it would introduce circular locking by adding cgroup_lock -> cpu hotplug | ||
6762 | * dependency. Do it from a subsys_initcall(). | ||
6763 | */ | ||
6764 | static int __init mem_cgroup_init(void) | ||
6765 | { | ||
6766 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
6767 | return 0; | ||
6768 | } | ||
6769 | subsys_initcall(mem_cgroup_init); | ||
6770 | |||
5646 | #ifdef CONFIG_MEMCG_SWAP | 6771 | #ifdef CONFIG_MEMCG_SWAP |
5647 | static int __init enable_swap_account(char *s) | 6772 | static int __init enable_swap_account(char *s) |
5648 | { | 6773 | { |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6c5899b9034a..c6e4dd3e1c08 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | 403 | pgoff_t pgoff; |
404 | 404 | ||
405 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma_read(page); |
406 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
407 | return; | 407 | return; |
408 | 408 | ||
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | read_unlock(&tasklist_lock); | 425 | read_unlock(&tasklist_lock); |
426 | page_unlock_anon_vma(av); | 426 | page_unlock_anon_vma_read(av); |
427 | } | 427 | } |
428 | 428 | ||
429 | /* | 429 | /* |
@@ -781,16 +781,16 @@ static struct page_state { | |||
781 | { compound, compound, "huge", me_huge_page }, | 781 | { compound, compound, "huge", me_huge_page }, |
782 | #endif | 782 | #endif |
783 | 783 | ||
784 | { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, | 784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, |
785 | { sc|dirty, sc, "swapcache", me_swapcache_clean }, | 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
786 | 786 | ||
787 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | 787 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, |
788 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | 788 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, |
789 | 789 | ||
790 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | 790 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
791 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | 791 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, |
792 | 792 | ||
793 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
795 | 795 | ||
796 | /* | 796 | /* |
@@ -812,14 +812,14 @@ static struct page_state { | |||
812 | #undef slab | 812 | #undef slab |
813 | #undef reserved | 813 | #undef reserved |
814 | 814 | ||
815 | /* | ||
816 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | ||
817 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | ||
818 | */ | ||
815 | static void action_result(unsigned long pfn, char *msg, int result) | 819 | static void action_result(unsigned long pfn, char *msg, int result) |
816 | { | 820 | { |
817 | struct page *page = pfn_to_page(pfn); | 821 | pr_err("MCE %#lx: %s page recovery: %s\n", |
818 | 822 | pfn, msg, action_name[result]); | |
819 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | ||
820 | pfn, | ||
821 | PageDirty(page) ? "dirty " : "", | ||
822 | msg, action_name[result]); | ||
823 | } | 823 | } |
824 | 824 | ||
825 | static int page_action(struct page_state *ps, struct page *p, | 825 | static int page_action(struct page_state *ps, struct page *p, |
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1385 | * Isolate the page, so that it doesn't get reallocated if it | 1385 | * Isolate the page, so that it doesn't get reallocated if it |
1386 | * was free. | 1386 | * was free. |
1387 | */ | 1387 | */ |
1388 | set_migratetype_isolate(p); | 1388 | set_migratetype_isolate(p, true); |
1389 | /* | 1389 | /* |
1390 | * When the target page is a free hugepage, just remove it | 1390 | * When the target page is a free hugepage, just remove it |
1391 | * from free hugepage list. | 1391 | * from free hugepage list. |
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags) | |||
1476 | { | 1476 | { |
1477 | int ret; | 1477 | int ret; |
1478 | unsigned long pfn = page_to_pfn(page); | 1478 | unsigned long pfn = page_to_pfn(page); |
1479 | struct page *hpage = compound_trans_head(page); | ||
1479 | 1480 | ||
1480 | if (PageHuge(page)) | 1481 | if (PageHuge(page)) |
1481 | return soft_offline_huge_page(page, flags); | 1482 | return soft_offline_huge_page(page, flags); |
1483 | if (PageTransHuge(hpage)) { | ||
1484 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | ||
1485 | pr_info("soft offline: %#lx: failed to split THP\n", | ||
1486 | pfn); | ||
1487 | return -EBUSY; | ||
1488 | } | ||
1489 | } | ||
1482 | 1490 | ||
1483 | ret = get_any_page(page, pfn, flags); | 1491 | ret = get_any_page(page, pfn, flags); |
1484 | if (ret < 0) | 1492 | if (ret < 0) |
@@ -1558,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1558 | page_is_file_cache(page)); | 1566 | page_is_file_cache(page)); |
1559 | list_add(&page->lru, &pagelist); | 1567 | list_add(&page->lru, &pagelist); |
1560 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1561 | false, MIGRATE_SYNC); | 1569 | false, MIGRATE_SYNC, |
1570 | MR_MEMORY_FAILURE); | ||
1562 | if (ret) { | 1571 | if (ret) { |
1563 | putback_lru_pages(&pagelist); | 1572 | putback_lru_pages(&pagelist); |
1564 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
diff --git a/mm/memory.c b/mm/memory.c index 221fc9ffcab1..bb1369f7b9b4 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -57,6 +57,8 @@ | |||
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | ||
61 | #include <linux/string.h> | ||
60 | 62 | ||
61 | #include <asm/io.h> | 63 | #include <asm/io.h> |
62 | #include <asm/pgalloc.h> | 64 | #include <asm/pgalloc.h> |
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb) | |||
182 | return 1; | 184 | return 1; |
183 | } | 185 | } |
184 | 186 | ||
187 | if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) | ||
188 | return 0; | ||
189 | |||
185 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); | 190 | batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); |
186 | if (!batch) | 191 | if (!batch) |
187 | return 0; | 192 | return 0; |
188 | 193 | ||
194 | tlb->batch_count++; | ||
189 | batch->next = NULL; | 195 | batch->next = NULL; |
190 | batch->nr = 0; | 196 | batch->nr = 0; |
191 | batch->max = MAX_GATHER_BATCH; | 197 | batch->max = MAX_GATHER_BATCH; |
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) | |||
214 | tlb->local.nr = 0; | 220 | tlb->local.nr = 0; |
215 | tlb->local.max = ARRAY_SIZE(tlb->__pages); | 221 | tlb->local.max = ARRAY_SIZE(tlb->__pages); |
216 | tlb->active = &tlb->local; | 222 | tlb->active = &tlb->local; |
223 | tlb->batch_count = 0; | ||
217 | 224 | ||
218 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | 225 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE |
219 | tlb->batch = NULL; | 226 | tlb->batch = NULL; |
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags) | |||
717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 724 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
718 | } | 725 | } |
719 | 726 | ||
720 | #ifndef is_zero_pfn | ||
721 | static inline int is_zero_pfn(unsigned long pfn) | ||
722 | { | ||
723 | return pfn == zero_pfn; | ||
724 | } | ||
725 | #endif | ||
726 | |||
727 | #ifndef my_zero_pfn | ||
728 | static inline unsigned long my_zero_pfn(unsigned long addr) | ||
729 | { | ||
730 | return zero_pfn; | ||
731 | } | ||
732 | #endif | ||
733 | |||
734 | /* | 727 | /* |
735 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 728 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
736 | * | 729 | * |
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1250 | BUG(); | 1243 | BUG(); |
1251 | } | 1244 | } |
1252 | #endif | 1245 | #endif |
1253 | split_huge_page_pmd(vma->vm_mm, pmd); | 1246 | split_huge_page_pmd(vma, addr, pmd); |
1254 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1247 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1255 | goto next; | 1248 | goto next; |
1256 | /* fall through */ | 1249 | /* fall through */ |
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1517 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1510 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1518 | goto out; | 1511 | goto out; |
1519 | } | 1512 | } |
1513 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1514 | goto no_page_table; | ||
1520 | if (pmd_trans_huge(*pmd)) { | 1515 | if (pmd_trans_huge(*pmd)) { |
1521 | if (flags & FOLL_SPLIT) { | 1516 | if (flags & FOLL_SPLIT) { |
1522 | split_huge_page_pmd(mm, pmd); | 1517 | split_huge_page_pmd(vma, address, pmd); |
1523 | goto split_fallthrough; | 1518 | goto split_fallthrough; |
1524 | } | 1519 | } |
1525 | spin_lock(&mm->page_table_lock); | 1520 | spin_lock(&mm->page_table_lock); |
@@ -1546,6 +1541,8 @@ split_fallthrough: | |||
1546 | pte = *ptep; | 1541 | pte = *ptep; |
1547 | if (!pte_present(pte)) | 1542 | if (!pte_present(pte)) |
1548 | goto no_page; | 1543 | goto no_page; |
1544 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1545 | goto no_page; | ||
1549 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1546 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
1550 | goto unlock; | 1547 | goto unlock; |
1551 | 1548 | ||
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1697 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 1694 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
1698 | vm_flags &= (gup_flags & FOLL_FORCE) ? | 1695 | vm_flags &= (gup_flags & FOLL_FORCE) ? |
1699 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 1696 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
1697 | |||
1698 | /* | ||
1699 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1700 | * would be called on PROT_NONE ranges. We must never invoke | ||
1701 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1702 | * page faults would unprotect the PROT_NONE ranges if | ||
1703 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1704 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1705 | * FOLL_FORCE is set. | ||
1706 | */ | ||
1707 | if (!(gup_flags & FOLL_FORCE)) | ||
1708 | gup_flags |= FOLL_NUMA; | ||
1709 | |||
1700 | i = 0; | 1710 | i = 0; |
1701 | 1711 | ||
1702 | do { | 1712 | do { |
@@ -2794,13 +2804,8 @@ unlock: | |||
2794 | oom_free_new: | 2804 | oom_free_new: |
2795 | page_cache_release(new_page); | 2805 | page_cache_release(new_page); |
2796 | oom: | 2806 | oom: |
2797 | if (old_page) { | 2807 | if (old_page) |
2798 | if (page_mkwrite) { | ||
2799 | unlock_page(old_page); | ||
2800 | page_cache_release(old_page); | ||
2801 | } | ||
2802 | page_cache_release(old_page); | 2808 | page_cache_release(old_page); |
2803 | } | ||
2804 | return VM_FAULT_OOM; | 2809 | return VM_FAULT_OOM; |
2805 | 2810 | ||
2806 | unwritable_page: | 2811 | unwritable_page: |
@@ -3431,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3431 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3436 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3432 | } | 3437 | } |
3433 | 3438 | ||
3439 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | ||
3440 | unsigned long addr, int current_nid) | ||
3441 | { | ||
3442 | get_page(page); | ||
3443 | |||
3444 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
3445 | if (current_nid == numa_node_id()) | ||
3446 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
3447 | |||
3448 | return mpol_misplaced(page, vma, addr); | ||
3449 | } | ||
3450 | |||
3451 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3452 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3453 | { | ||
3454 | struct page *page = NULL; | ||
3455 | spinlock_t *ptl; | ||
3456 | int current_nid = -1; | ||
3457 | int target_nid; | ||
3458 | bool migrated = false; | ||
3459 | |||
3460 | /* | ||
3461 | * The "pte" at this point cannot be used safely without | ||
3462 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3463 | * the pfn may be screwed if the read is non atomic. | ||
3464 | * | ||
3465 | * ptep_modify_prot_start is not called as this is clearing | ||
3466 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3467 | * would be concurrent hardware modifications to the PTE. | ||
3468 | */ | ||
3469 | ptl = pte_lockptr(mm, pmd); | ||
3470 | spin_lock(ptl); | ||
3471 | if (unlikely(!pte_same(*ptep, pte))) { | ||
3472 | pte_unmap_unlock(ptep, ptl); | ||
3473 | goto out; | ||
3474 | } | ||
3475 | |||
3476 | pte = pte_mknonnuma(pte); | ||
3477 | set_pte_at(mm, addr, ptep, pte); | ||
3478 | update_mmu_cache(vma, addr, ptep); | ||
3479 | |||
3480 | page = vm_normal_page(vma, addr, pte); | ||
3481 | if (!page) { | ||
3482 | pte_unmap_unlock(ptep, ptl); | ||
3483 | return 0; | ||
3484 | } | ||
3485 | |||
3486 | current_nid = page_to_nid(page); | ||
3487 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | ||
3488 | pte_unmap_unlock(ptep, ptl); | ||
3489 | if (target_nid == -1) { | ||
3490 | /* | ||
3491 | * Account for the fault against the current node if it not | ||
3492 | * being replaced regardless of where the page is located. | ||
3493 | */ | ||
3494 | current_nid = numa_node_id(); | ||
3495 | put_page(page); | ||
3496 | goto out; | ||
3497 | } | ||
3498 | |||
3499 | /* Migrate to the requested node */ | ||
3500 | migrated = migrate_misplaced_page(page, target_nid); | ||
3501 | if (migrated) | ||
3502 | current_nid = target_nid; | ||
3503 | |||
3504 | out: | ||
3505 | if (current_nid != -1) | ||
3506 | task_numa_fault(current_nid, 1, migrated); | ||
3507 | return 0; | ||
3508 | } | ||
3509 | |||
3510 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3511 | #ifdef CONFIG_NUMA_BALANCING | ||
3512 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3513 | unsigned long addr, pmd_t *pmdp) | ||
3514 | { | ||
3515 | pmd_t pmd; | ||
3516 | pte_t *pte, *orig_pte; | ||
3517 | unsigned long _addr = addr & PMD_MASK; | ||
3518 | unsigned long offset; | ||
3519 | spinlock_t *ptl; | ||
3520 | bool numa = false; | ||
3521 | int local_nid = numa_node_id(); | ||
3522 | |||
3523 | spin_lock(&mm->page_table_lock); | ||
3524 | pmd = *pmdp; | ||
3525 | if (pmd_numa(pmd)) { | ||
3526 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3527 | numa = true; | ||
3528 | } | ||
3529 | spin_unlock(&mm->page_table_lock); | ||
3530 | |||
3531 | if (!numa) | ||
3532 | return 0; | ||
3533 | |||
3534 | /* we're in a page fault so some vma must be in the range */ | ||
3535 | BUG_ON(!vma); | ||
3536 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3537 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3538 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3539 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3540 | pte += offset >> PAGE_SHIFT; | ||
3541 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3542 | pte_t pteval = *pte; | ||
3543 | struct page *page; | ||
3544 | int curr_nid = local_nid; | ||
3545 | int target_nid; | ||
3546 | bool migrated; | ||
3547 | if (!pte_present(pteval)) | ||
3548 | continue; | ||
3549 | if (!pte_numa(pteval)) | ||
3550 | continue; | ||
3551 | if (addr >= vma->vm_end) { | ||
3552 | vma = find_vma(mm, addr); | ||
3553 | /* there's a pte present so there must be a vma */ | ||
3554 | BUG_ON(!vma); | ||
3555 | BUG_ON(addr < vma->vm_start); | ||
3556 | } | ||
3557 | if (pte_numa(pteval)) { | ||
3558 | pteval = pte_mknonnuma(pteval); | ||
3559 | set_pte_at(mm, addr, pte, pteval); | ||
3560 | } | ||
3561 | page = vm_normal_page(vma, addr, pteval); | ||
3562 | if (unlikely(!page)) | ||
3563 | continue; | ||
3564 | /* only check non-shared pages */ | ||
3565 | if (unlikely(page_mapcount(page) != 1)) | ||
3566 | continue; | ||
3567 | |||
3568 | /* | ||
3569 | * Note that the NUMA fault is later accounted to either | ||
3570 | * the node that is currently running or where the page is | ||
3571 | * migrated to. | ||
3572 | */ | ||
3573 | curr_nid = local_nid; | ||
3574 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3575 | page_to_nid(page)); | ||
3576 | if (target_nid == -1) { | ||
3577 | put_page(page); | ||
3578 | continue; | ||
3579 | } | ||
3580 | |||
3581 | /* Migrate to the requested node */ | ||
3582 | pte_unmap_unlock(pte, ptl); | ||
3583 | migrated = migrate_misplaced_page(page, target_nid); | ||
3584 | if (migrated) | ||
3585 | curr_nid = target_nid; | ||
3586 | task_numa_fault(curr_nid, 1, migrated); | ||
3587 | |||
3588 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3589 | } | ||
3590 | pte_unmap_unlock(orig_pte, ptl); | ||
3591 | |||
3592 | return 0; | ||
3593 | } | ||
3594 | #else | ||
3595 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3596 | unsigned long addr, pmd_t *pmdp) | ||
3597 | { | ||
3598 | BUG(); | ||
3599 | return 0; | ||
3600 | } | ||
3601 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3602 | |||
3434 | /* | 3603 | /* |
3435 | * These routines also need to handle stuff like marking pages dirty | 3604 | * These routines also need to handle stuff like marking pages dirty |
3436 | * and/or accessed for architectures that don't do it in hardware (most | 3605 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3469,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3469 | pte, pmd, flags, entry); | 3638 | pte, pmd, flags, entry); |
3470 | } | 3639 | } |
3471 | 3640 | ||
3641 | if (pte_numa(entry)) | ||
3642 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3643 | |||
3472 | ptl = pte_lockptr(mm, pmd); | 3644 | ptl = pte_lockptr(mm, pmd); |
3473 | spin_lock(ptl); | 3645 | spin_lock(ptl); |
3474 | if (unlikely(!pte_same(*pte, entry))) | 3646 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3537,9 +3709,21 @@ retry: | |||
3537 | 3709 | ||
3538 | barrier(); | 3710 | barrier(); |
3539 | if (pmd_trans_huge(orig_pmd)) { | 3711 | if (pmd_trans_huge(orig_pmd)) { |
3540 | if (flags & FAULT_FLAG_WRITE && | 3712 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3541 | !pmd_write(orig_pmd) && | 3713 | |
3542 | !pmd_trans_splitting(orig_pmd)) { | 3714 | /* |
3715 | * If the pmd is splitting, return and retry the | ||
3716 | * the fault. Alternative: wait until the split | ||
3717 | * is done, and goto retry. | ||
3718 | */ | ||
3719 | if (pmd_trans_splitting(orig_pmd)) | ||
3720 | return 0; | ||
3721 | |||
3722 | if (pmd_numa(orig_pmd)) | ||
3723 | return do_huge_pmd_numa_page(mm, vma, address, | ||
3724 | orig_pmd, pmd); | ||
3725 | |||
3726 | if (dirty && !pmd_write(orig_pmd)) { | ||
3543 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3727 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3544 | orig_pmd); | 3728 | orig_pmd); |
3545 | /* | 3729 | /* |
@@ -3550,17 +3734,25 @@ retry: | |||
3550 | if (unlikely(ret & VM_FAULT_OOM)) | 3734 | if (unlikely(ret & VM_FAULT_OOM)) |
3551 | goto retry; | 3735 | goto retry; |
3552 | return ret; | 3736 | return ret; |
3737 | } else { | ||
3738 | huge_pmd_set_accessed(mm, vma, address, pmd, | ||
3739 | orig_pmd, dirty); | ||
3553 | } | 3740 | } |
3741 | |||
3554 | return 0; | 3742 | return 0; |
3555 | } | 3743 | } |
3556 | } | 3744 | } |
3557 | 3745 | ||
3746 | if (pmd_numa(*pmd)) | ||
3747 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3748 | |||
3558 | /* | 3749 | /* |
3559 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3750 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3560 | * run pte_offset_map on the pmd, if an huge pmd could | 3751 | * run pte_offset_map on the pmd, if an huge pmd could |
3561 | * materialize from under us from a different thread. | 3752 | * materialize from under us from a different thread. |
3562 | */ | 3753 | */ |
3563 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) | 3754 | if (unlikely(pmd_none(*pmd)) && |
3755 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3564 | return VM_FAULT_OOM; | 3756 | return VM_FAULT_OOM; |
3565 | /* if an huge pmd materialized from under us just retry later */ | 3757 | /* if an huge pmd materialized from under us just retry later */ |
3566 | if (unlikely(pmd_trans_huge(*pmd))) | 3758 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -3940,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3940 | struct file *f = vma->vm_file; | 4132 | struct file *f = vma->vm_file; |
3941 | char *buf = (char *)__get_free_page(GFP_KERNEL); | 4133 | char *buf = (char *)__get_free_page(GFP_KERNEL); |
3942 | if (buf) { | 4134 | if (buf) { |
3943 | char *p, *s; | 4135 | char *p; |
3944 | 4136 | ||
3945 | p = d_path(&f->f_path, buf, PAGE_SIZE); | 4137 | p = d_path(&f->f_path, buf, PAGE_SIZE); |
3946 | if (IS_ERR(p)) | 4138 | if (IS_ERR(p)) |
3947 | p = "?"; | 4139 | p = "?"; |
3948 | s = strrchr(p, '/'); | 4140 | printk("%s%s[%lx+%lx]", prefix, kbasename(p), |
3949 | if (s) | ||
3950 | p = s+1; | ||
3951 | printk("%s%s[%lx+%lx]", prefix, p, | ||
3952 | vma->vm_start, | 4141 | vma->vm_start, |
3953 | vma->vm_end - vma->vm_start); | 4142 | vma->vm_end - vma->vm_start); |
3954 | free_page((unsigned long)buf); | 4143 | free_page((unsigned long)buf); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4eeacae2b91..d04ed87bfacb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
107 | { | 107 | { |
108 | unsigned long type; | 108 | unsigned long type; |
109 | static DEFINE_MUTEX(ppb_lock); | ||
109 | 110 | ||
110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page) | |||
115 | ClearPagePrivate(page); | 116 | ClearPagePrivate(page); |
116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
119 | |||
120 | /* | ||
121 | * Please refer to comment for __free_pages_bootmem() | ||
122 | * for why we serialize here. | ||
123 | */ | ||
124 | mutex_lock(&ppb_lock); | ||
118 | __free_pages_bootmem(page, 0); | 125 | __free_pages_bootmem(page, 0); |
126 | mutex_unlock(&ppb_lock); | ||
119 | } | 127 | } |
120 | 128 | ||
121 | } | 129 | } |
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
205 | zone_span_writelock(zone); | 213 | zone_span_writelock(zone); |
206 | 214 | ||
207 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 215 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; |
208 | if (start_pfn < zone->zone_start_pfn) | 216 | if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) |
209 | zone->zone_start_pfn = start_pfn; | 217 | zone->zone_start_pfn = start_pfn; |
210 | 218 | ||
211 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - | 219 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - |
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
214 | zone_span_writeunlock(zone); | 222 | zone_span_writeunlock(zone); |
215 | } | 223 | } |
216 | 224 | ||
225 | static void resize_zone(struct zone *zone, unsigned long start_pfn, | ||
226 | unsigned long end_pfn) | ||
227 | { | ||
228 | zone_span_writelock(zone); | ||
229 | |||
230 | if (end_pfn - start_pfn) { | ||
231 | zone->zone_start_pfn = start_pfn; | ||
232 | zone->spanned_pages = end_pfn - start_pfn; | ||
233 | } else { | ||
234 | /* | ||
235 | * make it consist as free_area_init_core(), | ||
236 | * if spanned_pages = 0, then keep start_pfn = 0 | ||
237 | */ | ||
238 | zone->zone_start_pfn = 0; | ||
239 | zone->spanned_pages = 0; | ||
240 | } | ||
241 | |||
242 | zone_span_writeunlock(zone); | ||
243 | } | ||
244 | |||
245 | static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | ||
246 | unsigned long end_pfn) | ||
247 | { | ||
248 | enum zone_type zid = zone_idx(zone); | ||
249 | int nid = zone->zone_pgdat->node_id; | ||
250 | unsigned long pfn; | ||
251 | |||
252 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
253 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); | ||
254 | } | ||
255 | |||
256 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | ||
257 | unsigned long start_pfn, unsigned long end_pfn) | ||
258 | { | ||
259 | int ret; | ||
260 | unsigned long flags; | ||
261 | unsigned long z1_start_pfn; | ||
262 | |||
263 | if (!z1->wait_table) { | ||
264 | ret = init_currently_empty_zone(z1, start_pfn, | ||
265 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | ||
266 | if (ret) | ||
267 | return ret; | ||
268 | } | ||
269 | |||
270 | pgdat_resize_lock(z1->zone_pgdat, &flags); | ||
271 | |||
272 | /* can't move pfns which are higher than @z2 */ | ||
273 | if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | ||
274 | goto out_fail; | ||
275 | /* the move out part mast at the left most of @z2 */ | ||
276 | if (start_pfn > z2->zone_start_pfn) | ||
277 | goto out_fail; | ||
278 | /* must included/overlap */ | ||
279 | if (end_pfn <= z2->zone_start_pfn) | ||
280 | goto out_fail; | ||
281 | |||
282 | /* use start_pfn for z1's start_pfn if z1 is empty */ | ||
283 | if (z1->spanned_pages) | ||
284 | z1_start_pfn = z1->zone_start_pfn; | ||
285 | else | ||
286 | z1_start_pfn = start_pfn; | ||
287 | |||
288 | resize_zone(z1, z1_start_pfn, end_pfn); | ||
289 | resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | ||
290 | |||
291 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
292 | |||
293 | fix_zone_id(z1, start_pfn, end_pfn); | ||
294 | |||
295 | return 0; | ||
296 | out_fail: | ||
297 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
298 | return -1; | ||
299 | } | ||
300 | |||
301 | static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | ||
302 | unsigned long start_pfn, unsigned long end_pfn) | ||
303 | { | ||
304 | int ret; | ||
305 | unsigned long flags; | ||
306 | unsigned long z2_end_pfn; | ||
307 | |||
308 | if (!z2->wait_table) { | ||
309 | ret = init_currently_empty_zone(z2, start_pfn, | ||
310 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | ||
311 | if (ret) | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | pgdat_resize_lock(z1->zone_pgdat, &flags); | ||
316 | |||
317 | /* can't move pfns which are lower than @z1 */ | ||
318 | if (z1->zone_start_pfn > start_pfn) | ||
319 | goto out_fail; | ||
320 | /* the move out part mast at the right most of @z1 */ | ||
321 | if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) | ||
322 | goto out_fail; | ||
323 | /* must included/overlap */ | ||
324 | if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | ||
325 | goto out_fail; | ||
326 | |||
327 | /* use end_pfn for z2's end_pfn if z2 is empty */ | ||
328 | if (z2->spanned_pages) | ||
329 | z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; | ||
330 | else | ||
331 | z2_end_pfn = end_pfn; | ||
332 | |||
333 | resize_zone(z1, z1->zone_start_pfn, start_pfn); | ||
334 | resize_zone(z2, start_pfn, z2_end_pfn); | ||
335 | |||
336 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
337 | |||
338 | fix_zone_id(z2, start_pfn, end_pfn); | ||
339 | |||
340 | return 0; | ||
341 | out_fail: | ||
342 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
343 | return -1; | ||
344 | } | ||
345 | |||
217 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | 346 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
218 | unsigned long end_pfn) | 347 | unsigned long end_pfn) |
219 | { | 348 | { |
220 | unsigned long old_pgdat_end_pfn = | 349 | unsigned long old_pgdat_end_pfn = |
221 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | 350 | pgdat->node_start_pfn + pgdat->node_spanned_pages; |
222 | 351 | ||
223 | if (start_pfn < pgdat->node_start_pfn) | 352 | if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) |
224 | pgdat->node_start_pfn = start_pfn; | 353 | pgdat->node_start_pfn = start_pfn; |
225 | 354 | ||
226 | pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - | 355 | pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - |
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
460 | return 0; | 589 | return 0; |
461 | } | 590 | } |
462 | 591 | ||
592 | #ifdef CONFIG_MOVABLE_NODE | ||
593 | /* | ||
594 | * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have | ||
595 | * normal memory. | ||
596 | */ | ||
597 | static bool can_online_high_movable(struct zone *zone) | ||
598 | { | ||
599 | return true; | ||
600 | } | ||
601 | #else /* CONFIG_MOVABLE_NODE */ | ||
602 | /* ensure every online node has NORMAL memory */ | ||
603 | static bool can_online_high_movable(struct zone *zone) | ||
604 | { | ||
605 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
606 | } | ||
607 | #endif /* CONFIG_MOVABLE_NODE */ | ||
463 | 608 | ||
464 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | 609 | /* check which state of node_states will be changed when online memory */ |
610 | static void node_states_check_changes_online(unsigned long nr_pages, | ||
611 | struct zone *zone, struct memory_notify *arg) | ||
612 | { | ||
613 | int nid = zone_to_nid(zone); | ||
614 | enum zone_type zone_last = ZONE_NORMAL; | ||
615 | |||
616 | /* | ||
617 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] | ||
618 | * contains nodes which have zones of 0...ZONE_NORMAL, | ||
619 | * set zone_last to ZONE_NORMAL. | ||
620 | * | ||
621 | * If we don't have HIGHMEM nor movable node, | ||
622 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of | ||
623 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
624 | */ | ||
625 | if (N_MEMORY == N_NORMAL_MEMORY) | ||
626 | zone_last = ZONE_MOVABLE; | ||
627 | |||
628 | /* | ||
629 | * if the memory to be online is in a zone of 0...zone_last, and | ||
630 | * the zones of 0...zone_last don't have memory before online, we will | ||
631 | * need to set the node to node_states[N_NORMAL_MEMORY] after | ||
632 | * the memory is online. | ||
633 | */ | ||
634 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) | ||
635 | arg->status_change_nid_normal = nid; | ||
636 | else | ||
637 | arg->status_change_nid_normal = -1; | ||
638 | |||
639 | #ifdef CONFIG_HIGHMEM | ||
640 | /* | ||
641 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
642 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
643 | * set zone_last to ZONE_HIGHMEM. | ||
644 | * | ||
645 | * If we don't have movable node, node_states[N_NORMAL_MEMORY] | ||
646 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
647 | * set zone_last to ZONE_MOVABLE. | ||
648 | */ | ||
649 | zone_last = ZONE_HIGHMEM; | ||
650 | if (N_MEMORY == N_HIGH_MEMORY) | ||
651 | zone_last = ZONE_MOVABLE; | ||
652 | |||
653 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) | ||
654 | arg->status_change_nid_high = nid; | ||
655 | else | ||
656 | arg->status_change_nid_high = -1; | ||
657 | #else | ||
658 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
659 | #endif | ||
660 | |||
661 | /* | ||
662 | * if the node don't have memory befor online, we will need to | ||
663 | * set the node to node_states[N_MEMORY] after the memory | ||
664 | * is online. | ||
665 | */ | ||
666 | if (!node_state(nid, N_MEMORY)) | ||
667 | arg->status_change_nid = nid; | ||
668 | else | ||
669 | arg->status_change_nid = -1; | ||
670 | } | ||
671 | |||
672 | static void node_states_set_node(int node, struct memory_notify *arg) | ||
673 | { | ||
674 | if (arg->status_change_nid_normal >= 0) | ||
675 | node_set_state(node, N_NORMAL_MEMORY); | ||
676 | |||
677 | if (arg->status_change_nid_high >= 0) | ||
678 | node_set_state(node, N_HIGH_MEMORY); | ||
679 | |||
680 | node_set_state(node, N_MEMORY); | ||
681 | } | ||
682 | |||
683 | |||
684 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | ||
465 | { | 685 | { |
466 | unsigned long onlined_pages = 0; | 686 | unsigned long onlined_pages = 0; |
467 | struct zone *zone; | 687 | struct zone *zone; |
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
471 | struct memory_notify arg; | 691 | struct memory_notify arg; |
472 | 692 | ||
473 | lock_memory_hotplug(); | 693 | lock_memory_hotplug(); |
694 | /* | ||
695 | * This doesn't need a lock to do pfn_to_page(). | ||
696 | * The section can't be removed here because of the | ||
697 | * memory_block->state_mutex. | ||
698 | */ | ||
699 | zone = page_zone(pfn_to_page(pfn)); | ||
700 | |||
701 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | ||
702 | !can_online_high_movable(zone)) { | ||
703 | unlock_memory_hotplug(); | ||
704 | return -1; | ||
705 | } | ||
706 | |||
707 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | ||
708 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | ||
709 | unlock_memory_hotplug(); | ||
710 | return -1; | ||
711 | } | ||
712 | } | ||
713 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | ||
714 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | ||
715 | unlock_memory_hotplug(); | ||
716 | return -1; | ||
717 | } | ||
718 | } | ||
719 | |||
720 | /* Previous code may changed the zone of the pfn range */ | ||
721 | zone = page_zone(pfn_to_page(pfn)); | ||
722 | |||
474 | arg.start_pfn = pfn; | 723 | arg.start_pfn = pfn; |
475 | arg.nr_pages = nr_pages; | 724 | arg.nr_pages = nr_pages; |
476 | arg.status_change_nid = -1; | 725 | node_states_check_changes_online(nr_pages, zone, &arg); |
477 | 726 | ||
478 | nid = page_to_nid(pfn_to_page(pfn)); | 727 | nid = page_to_nid(pfn_to_page(pfn)); |
479 | if (node_present_pages(nid) == 0) | ||
480 | arg.status_change_nid = nid; | ||
481 | 728 | ||
482 | ret = memory_notify(MEM_GOING_ONLINE, &arg); | 729 | ret = memory_notify(MEM_GOING_ONLINE, &arg); |
483 | ret = notifier_to_errno(ret); | 730 | ret = notifier_to_errno(ret); |
@@ -487,23 +734,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
487 | return ret; | 734 | return ret; |
488 | } | 735 | } |
489 | /* | 736 | /* |
490 | * This doesn't need a lock to do pfn_to_page(). | ||
491 | * The section can't be removed here because of the | ||
492 | * memory_block->state_mutex. | ||
493 | */ | ||
494 | zone = page_zone(pfn_to_page(pfn)); | ||
495 | /* | ||
496 | * If this zone is not populated, then it is not in zonelist. | 737 | * If this zone is not populated, then it is not in zonelist. |
497 | * This means the page allocator ignores this zone. | 738 | * This means the page allocator ignores this zone. |
498 | * So, zonelist must be updated after online. | 739 | * So, zonelist must be updated after online. |
499 | */ | 740 | */ |
500 | mutex_lock(&zonelists_mutex); | 741 | mutex_lock(&zonelists_mutex); |
501 | if (!populated_zone(zone)) | 742 | if (!populated_zone(zone)) { |
502 | need_zonelists_rebuild = 1; | 743 | need_zonelists_rebuild = 1; |
744 | build_all_zonelists(NULL, zone); | ||
745 | } | ||
503 | 746 | ||
504 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 747 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
505 | online_pages_range); | 748 | online_pages_range); |
506 | if (ret) { | 749 | if (ret) { |
750 | if (need_zonelists_rebuild) | ||
751 | zone_pcp_reset(zone); | ||
507 | mutex_unlock(&zonelists_mutex); | 752 | mutex_unlock(&zonelists_mutex); |
508 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", | 753 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", |
509 | (unsigned long long) pfn << PAGE_SHIFT, | 754 | (unsigned long long) pfn << PAGE_SHIFT, |
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
514 | return ret; | 759 | return ret; |
515 | } | 760 | } |
516 | 761 | ||
762 | zone->managed_pages += onlined_pages; | ||
517 | zone->present_pages += onlined_pages; | 763 | zone->present_pages += onlined_pages; |
518 | zone->zone_pgdat->node_present_pages += onlined_pages; | 764 | zone->zone_pgdat->node_present_pages += onlined_pages; |
519 | if (onlined_pages) { | 765 | if (onlined_pages) { |
520 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 766 | node_states_set_node(zone_to_nid(zone), &arg); |
521 | if (need_zonelists_rebuild) | 767 | if (need_zonelists_rebuild) |
522 | build_all_zonelists(NULL, zone); | 768 | build_all_zonelists(NULL, NULL); |
523 | else | 769 | else |
524 | zone_pcp_update(zone); | 770 | zone_pcp_update(zone); |
525 | } | 771 | } |
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
812 | * migrate_pages returns # of failed pages. | 1058 | * migrate_pages returns # of failed pages. |
813 | */ | 1059 | */ |
814 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1060 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
815 | true, MIGRATE_SYNC); | 1061 | true, MIGRATE_SYNC, |
1062 | MR_MEMORY_HOTPLUG); | ||
816 | if (ret) | 1063 | if (ret) |
817 | putback_lru_pages(&source); | 1064 | putback_lru_pages(&source); |
818 | } | 1065 | } |
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, | |||
847 | { | 1094 | { |
848 | int ret; | 1095 | int ret; |
849 | long offlined = *(long *)data; | 1096 | long offlined = *(long *)data; |
850 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); | 1097 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); |
851 | offlined = nr_pages; | 1098 | offlined = nr_pages; |
852 | if (!ret) | 1099 | if (!ret) |
853 | *(long *)data += offlined; | 1100 | *(long *)data += offlined; |
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
867 | return offlined; | 1114 | return offlined; |
868 | } | 1115 | } |
869 | 1116 | ||
1117 | #ifdef CONFIG_MOVABLE_NODE | ||
1118 | /* | ||
1119 | * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have | ||
1120 | * normal memory. | ||
1121 | */ | ||
1122 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | ||
1123 | { | ||
1124 | return true; | ||
1125 | } | ||
1126 | #else /* CONFIG_MOVABLE_NODE */ | ||
1127 | /* ensure the node has NORMAL memory if it is still online */ | ||
1128 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | ||
1129 | { | ||
1130 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1131 | unsigned long present_pages = 0; | ||
1132 | enum zone_type zt; | ||
1133 | |||
1134 | for (zt = 0; zt <= ZONE_NORMAL; zt++) | ||
1135 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1136 | |||
1137 | if (present_pages > nr_pages) | ||
1138 | return true; | ||
1139 | |||
1140 | present_pages = 0; | ||
1141 | for (; zt <= ZONE_MOVABLE; zt++) | ||
1142 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1143 | |||
1144 | /* | ||
1145 | * we can't offline the last normal memory until all | ||
1146 | * higher memory is offlined. | ||
1147 | */ | ||
1148 | return present_pages == 0; | ||
1149 | } | ||
1150 | #endif /* CONFIG_MOVABLE_NODE */ | ||
1151 | |||
1152 | /* check which state of node_states will be changed when offline memory */ | ||
1153 | static void node_states_check_changes_offline(unsigned long nr_pages, | ||
1154 | struct zone *zone, struct memory_notify *arg) | ||
1155 | { | ||
1156 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1157 | unsigned long present_pages = 0; | ||
1158 | enum zone_type zt, zone_last = ZONE_NORMAL; | ||
1159 | |||
1160 | /* | ||
1161 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] | ||
1162 | * contains nodes which have zones of 0...ZONE_NORMAL, | ||
1163 | * set zone_last to ZONE_NORMAL. | ||
1164 | * | ||
1165 | * If we don't have HIGHMEM nor movable node, | ||
1166 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of | ||
1167 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
1168 | */ | ||
1169 | if (N_MEMORY == N_NORMAL_MEMORY) | ||
1170 | zone_last = ZONE_MOVABLE; | ||
1171 | |||
1172 | /* | ||
1173 | * check whether node_states[N_NORMAL_MEMORY] will be changed. | ||
1174 | * If the memory to be offline is in a zone of 0...zone_last, | ||
1175 | * and it is the last present memory, 0...zone_last will | ||
1176 | * become empty after offline , thus we can determind we will | ||
1177 | * need to clear the node from node_states[N_NORMAL_MEMORY]. | ||
1178 | */ | ||
1179 | for (zt = 0; zt <= zone_last; zt++) | ||
1180 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1181 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
1182 | arg->status_change_nid_normal = zone_to_nid(zone); | ||
1183 | else | ||
1184 | arg->status_change_nid_normal = -1; | ||
1185 | |||
1186 | #ifdef CONFIG_HIGHMEM | ||
1187 | /* | ||
1188 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
1189 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
1190 | * set zone_last to ZONE_HIGHMEM. | ||
1191 | * | ||
1192 | * If we don't have movable node, node_states[N_NORMAL_MEMORY] | ||
1193 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
1194 | * set zone_last to ZONE_MOVABLE. | ||
1195 | */ | ||
1196 | zone_last = ZONE_HIGHMEM; | ||
1197 | if (N_MEMORY == N_HIGH_MEMORY) | ||
1198 | zone_last = ZONE_MOVABLE; | ||
1199 | |||
1200 | for (; zt <= zone_last; zt++) | ||
1201 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1202 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
1203 | arg->status_change_nid_high = zone_to_nid(zone); | ||
1204 | else | ||
1205 | arg->status_change_nid_high = -1; | ||
1206 | #else | ||
1207 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
1208 | #endif | ||
1209 | |||
1210 | /* | ||
1211 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE | ||
1212 | */ | ||
1213 | zone_last = ZONE_MOVABLE; | ||
1214 | |||
1215 | /* | ||
1216 | * check whether node_states[N_HIGH_MEMORY] will be changed | ||
1217 | * If we try to offline the last present @nr_pages from the node, | ||
1218 | * we can determind we will need to clear the node from | ||
1219 | * node_states[N_HIGH_MEMORY]. | ||
1220 | */ | ||
1221 | for (; zt <= zone_last; zt++) | ||
1222 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1223 | if (nr_pages >= present_pages) | ||
1224 | arg->status_change_nid = zone_to_nid(zone); | ||
1225 | else | ||
1226 | arg->status_change_nid = -1; | ||
1227 | } | ||
1228 | |||
1229 | static void node_states_clear_node(int node, struct memory_notify *arg) | ||
1230 | { | ||
1231 | if (arg->status_change_nid_normal >= 0) | ||
1232 | node_clear_state(node, N_NORMAL_MEMORY); | ||
1233 | |||
1234 | if ((N_MEMORY != N_NORMAL_MEMORY) && | ||
1235 | (arg->status_change_nid_high >= 0)) | ||
1236 | node_clear_state(node, N_HIGH_MEMORY); | ||
1237 | |||
1238 | if ((N_MEMORY != N_HIGH_MEMORY) && | ||
1239 | (arg->status_change_nid >= 0)) | ||
1240 | node_clear_state(node, N_MEMORY); | ||
1241 | } | ||
1242 | |||
870 | static int __ref __offline_pages(unsigned long start_pfn, | 1243 | static int __ref __offline_pages(unsigned long start_pfn, |
871 | unsigned long end_pfn, unsigned long timeout) | 1244 | unsigned long end_pfn, unsigned long timeout) |
872 | { | 1245 | { |
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
893 | node = zone_to_nid(zone); | 1266 | node = zone_to_nid(zone); |
894 | nr_pages = end_pfn - start_pfn; | 1267 | nr_pages = end_pfn - start_pfn; |
895 | 1268 | ||
1269 | ret = -EINVAL; | ||
1270 | if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) | ||
1271 | goto out; | ||
1272 | |||
896 | /* set above range as isolated */ | 1273 | /* set above range as isolated */ |
897 | ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1274 | ret = start_isolate_page_range(start_pfn, end_pfn, |
1275 | MIGRATE_MOVABLE, true); | ||
898 | if (ret) | 1276 | if (ret) |
899 | goto out; | 1277 | goto out; |
900 | 1278 | ||
901 | arg.start_pfn = start_pfn; | 1279 | arg.start_pfn = start_pfn; |
902 | arg.nr_pages = nr_pages; | 1280 | arg.nr_pages = nr_pages; |
903 | arg.status_change_nid = -1; | 1281 | node_states_check_changes_offline(nr_pages, zone, &arg); |
904 | if (nr_pages >= node_present_pages(node)) | ||
905 | arg.status_change_nid = node; | ||
906 | 1282 | ||
907 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); | 1283 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); |
908 | ret = notifier_to_errno(ret); | 1284 | ret = notifier_to_errno(ret); |
@@ -943,10 +1319,10 @@ repeat: | |||
943 | goto repeat; | 1319 | goto repeat; |
944 | } | 1320 | } |
945 | } | 1321 | } |
946 | /* drain all zone's lru pagevec, this is asyncronous... */ | 1322 | /* drain all zone's lru pagevec, this is asynchronous... */ |
947 | lru_add_drain_all(); | 1323 | lru_add_drain_all(); |
948 | yield(); | 1324 | yield(); |
949 | /* drain pcp pages , this is synchrouns. */ | 1325 | /* drain pcp pages, this is synchronous. */ |
950 | drain_all_pages(); | 1326 | drain_all_pages(); |
951 | /* check again */ | 1327 | /* check again */ |
952 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | 1328 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); |
@@ -955,12 +1331,13 @@ repeat: | |||
955 | goto failed_removal; | 1331 | goto failed_removal; |
956 | } | 1332 | } |
957 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); | 1333 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); |
958 | /* Ok, all of our target is islaoted. | 1334 | /* Ok, all of our target is isolated. |
959 | We cannot do rollback at this point. */ | 1335 | We cannot do rollback at this point. */ |
960 | offline_isolated_pages(start_pfn, end_pfn); | 1336 | offline_isolated_pages(start_pfn, end_pfn); |
961 | /* reset pagetype flags and makes migrate type to be MOVABLE */ | 1337 | /* reset pagetype flags and makes migrate type to be MOVABLE */ |
962 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1338 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
963 | /* removal success */ | 1339 | /* removal success */ |
1340 | zone->managed_pages -= offlined_pages; | ||
964 | zone->present_pages -= offlined_pages; | 1341 | zone->present_pages -= offlined_pages; |
965 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 1342 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
966 | totalram_pages -= offlined_pages; | 1343 | totalram_pages -= offlined_pages; |
@@ -975,10 +1352,9 @@ repeat: | |||
975 | } else | 1352 | } else |
976 | zone_pcp_update(zone); | 1353 | zone_pcp_update(zone); |
977 | 1354 | ||
978 | if (!node_present_pages(node)) { | 1355 | node_states_clear_node(node, &arg); |
979 | node_clear_state(node, N_HIGH_MEMORY); | 1356 | if (arg.status_change_nid >= 0) |
980 | kswapd_stop(node); | 1357 | kswapd_stop(node); |
981 | } | ||
982 | 1358 | ||
983 | vm_total_pages = nr_free_pagecache_pages(); | 1359 | vm_total_pages = nr_free_pagecache_pages(); |
984 | writeback_set_ratelimit(); | 1360 | writeback_set_ratelimit(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d04a8a54c294..e2df1c1fb41f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -90,6 +90,7 @@ | |||
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | ||
93 | 94 | ||
94 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = { | |||
117 | .flags = MPOL_F_LOCAL, | 118 | .flags = MPOL_F_LOCAL, |
118 | }; | 119 | }; |
119 | 120 | ||
121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | ||
122 | |||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | ||
124 | { | ||
125 | struct mempolicy *pol = p->mempolicy; | ||
126 | int node; | ||
127 | |||
128 | if (!pol) { | ||
129 | node = numa_node_id(); | ||
130 | if (node != -1) | ||
131 | pol = &preferred_node_policy[node]; | ||
132 | |||
133 | /* preferred_node_policy is not initialised early in boot */ | ||
134 | if (!pol->mode) | ||
135 | pol = NULL; | ||
136 | } | ||
137 | |||
138 | return pol; | ||
139 | } | ||
140 | |||
120 | static const struct mempolicy_operations { | 141 | static const struct mempolicy_operations { |
121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 142 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
122 | /* | 143 | /* |
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, | |||
212 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | 233 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ |
213 | if (pol == NULL) | 234 | if (pol == NULL) |
214 | return 0; | 235 | return 0; |
215 | /* Check N_HIGH_MEMORY */ | 236 | /* Check N_MEMORY */ |
216 | nodes_and(nsc->mask1, | 237 | nodes_and(nsc->mask1, |
217 | cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); | 238 | cpuset_current_mems_allowed, node_states[N_MEMORY]); |
218 | 239 | ||
219 | VM_BUG_ON(!nodes); | 240 | VM_BUG_ON(!nodes); |
220 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | 241 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) |
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
254 | if (mode == MPOL_DEFAULT) { | 275 | if (mode == MPOL_DEFAULT) { |
255 | if (nodes && !nodes_empty(*nodes)) | 276 | if (nodes && !nodes_empty(*nodes)) |
256 | return ERR_PTR(-EINVAL); | 277 | return ERR_PTR(-EINVAL); |
257 | return NULL; /* simply delete any existing policy */ | 278 | return NULL; |
258 | } | 279 | } |
259 | VM_BUG_ON(!nodes); | 280 | VM_BUG_ON(!nodes); |
260 | 281 | ||
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
269 | (flags & MPOL_F_RELATIVE_NODES))) | 290 | (flags & MPOL_F_RELATIVE_NODES))) |
270 | return ERR_PTR(-EINVAL); | 291 | return ERR_PTR(-EINVAL); |
271 | } | 292 | } |
293 | } else if (mode == MPOL_LOCAL) { | ||
294 | if (!nodes_empty(*nodes)) | ||
295 | return ERR_PTR(-EINVAL); | ||
296 | mode = MPOL_PREFERRED; | ||
272 | } else if (nodes_empty(*nodes)) | 297 | } else if (nodes_empty(*nodes)) |
273 | return ERR_PTR(-EINVAL); | 298 | return ERR_PTR(-EINVAL); |
274 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 299 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
511 | pmd = pmd_offset(pud, addr); | 536 | pmd = pmd_offset(pud, addr); |
512 | do { | 537 | do { |
513 | next = pmd_addr_end(addr, end); | 538 | next = pmd_addr_end(addr, end); |
514 | split_huge_page_pmd(vma->vm_mm, pmd); | 539 | split_huge_page_pmd(vma, addr, pmd); |
515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 540 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
516 | continue; | 541 | continue; |
517 | if (check_pte_range(vma, pmd, addr, next, nodes, | 542 | if (check_pte_range(vma, pmd, addr, next, nodes, |
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
561 | return 0; | 586 | return 0; |
562 | } | 587 | } |
563 | 588 | ||
589 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
590 | /* | ||
591 | * This is used to mark a range of virtual addresses to be inaccessible. | ||
592 | * These are later cleared by a NUMA hinting fault. Depending on these | ||
593 | * faults, pages may be migrated for better NUMA placement. | ||
594 | * | ||
595 | * This is assuming that NUMA faults are handled using PROT_NONE. If | ||
596 | * an architecture makes a different choice, it will need further | ||
597 | * changes to the core. | ||
598 | */ | ||
599 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
600 | unsigned long addr, unsigned long end) | ||
601 | { | ||
602 | int nr_updated; | ||
603 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
604 | |||
605 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | ||
606 | if (nr_updated) | ||
607 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | ||
608 | |||
609 | return nr_updated; | ||
610 | } | ||
611 | #else | ||
612 | static unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
613 | unsigned long addr, unsigned long end) | ||
614 | { | ||
615 | return 0; | ||
616 | } | ||
617 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
618 | |||
564 | /* | 619 | /* |
565 | * Check if all pages in a range are on a set of nodes. | 620 | * Check if all pages in a range are on a set of nodes. |
566 | * If pagelist != NULL then isolate pages from the LRU and | 621 | * If pagelist != NULL then isolate pages from the LRU and |
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
579 | return ERR_PTR(-EFAULT); | 634 | return ERR_PTR(-EFAULT); |
580 | prev = NULL; | 635 | prev = NULL; |
581 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 636 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
637 | unsigned long endvma = vma->vm_end; | ||
638 | |||
639 | if (endvma > end) | ||
640 | endvma = end; | ||
641 | if (vma->vm_start > start) | ||
642 | start = vma->vm_start; | ||
643 | |||
582 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 644 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
583 | if (!vma->vm_next && vma->vm_end < end) | 645 | if (!vma->vm_next && vma->vm_end < end) |
584 | return ERR_PTR(-EFAULT); | 646 | return ERR_PTR(-EFAULT); |
585 | if (prev && prev->vm_end < vma->vm_start) | 647 | if (prev && prev->vm_end < vma->vm_start) |
586 | return ERR_PTR(-EFAULT); | 648 | return ERR_PTR(-EFAULT); |
587 | } | 649 | } |
588 | if (!is_vm_hugetlb_page(vma) && | 650 | |
589 | ((flags & MPOL_MF_STRICT) || | 651 | if (is_vm_hugetlb_page(vma)) |
652 | goto next; | ||
653 | |||
654 | if (flags & MPOL_MF_LAZY) { | ||
655 | change_prot_numa(vma, start, endvma); | ||
656 | goto next; | ||
657 | } | ||
658 | |||
659 | if ((flags & MPOL_MF_STRICT) || | ||
590 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 660 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
591 | vma_migratable(vma)))) { | 661 | vma_migratable(vma))) { |
592 | unsigned long endvma = vma->vm_end; | ||
593 | 662 | ||
594 | if (endvma > end) | ||
595 | endvma = end; | ||
596 | if (vma->vm_start > start) | ||
597 | start = vma->vm_start; | ||
598 | err = check_pgd_range(vma, start, endvma, nodes, | 663 | err = check_pgd_range(vma, start, endvma, nodes, |
599 | flags, private); | 664 | flags, private); |
600 | if (err) { | 665 | if (err) { |
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
602 | break; | 667 | break; |
603 | } | 668 | } |
604 | } | 669 | } |
670 | next: | ||
605 | prev = vma; | 671 | prev = vma; |
606 | } | 672 | } |
607 | return first; | 673 | return first; |
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
961 | 1027 | ||
962 | if (!list_empty(&pagelist)) { | 1028 | if (!list_empty(&pagelist)) { |
963 | err = migrate_pages(&pagelist, new_node_page, dest, | 1029 | err = migrate_pages(&pagelist, new_node_page, dest, |
964 | false, MIGRATE_SYNC); | 1030 | false, MIGRATE_SYNC, |
1031 | MR_SYSCALL); | ||
965 | if (err) | 1032 | if (err) |
966 | putback_lru_pages(&pagelist); | 1033 | putback_lru_pages(&pagelist); |
967 | } | 1034 | } |
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1133 | int err; | 1200 | int err; |
1134 | LIST_HEAD(pagelist); | 1201 | LIST_HEAD(pagelist); |
1135 | 1202 | ||
1136 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | | 1203 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1137 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
1138 | return -EINVAL; | 1204 | return -EINVAL; |
1139 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1205 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1140 | return -EPERM; | 1206 | return -EPERM; |
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1157 | if (IS_ERR(new)) | 1223 | if (IS_ERR(new)) |
1158 | return PTR_ERR(new); | 1224 | return PTR_ERR(new); |
1159 | 1225 | ||
1226 | if (flags & MPOL_MF_LAZY) | ||
1227 | new->flags |= MPOL_F_MOF; | ||
1228 | |||
1160 | /* | 1229 | /* |
1161 | * If we are using the default policy then operation | 1230 | * If we are using the default policy then operation |
1162 | * on discontinuous address spaces is okay after all | 1231 | * on discontinuous address spaces is okay after all |
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1193 | vma = check_range(mm, start, end, nmask, | 1262 | vma = check_range(mm, start, end, nmask, |
1194 | flags | MPOL_MF_INVERT, &pagelist); | 1263 | flags | MPOL_MF_INVERT, &pagelist); |
1195 | 1264 | ||
1196 | err = PTR_ERR(vma); | 1265 | err = PTR_ERR(vma); /* maybe ... */ |
1197 | if (!IS_ERR(vma)) { | 1266 | if (!IS_ERR(vma)) |
1198 | int nr_failed = 0; | ||
1199 | |||
1200 | err = mbind_range(mm, start, end, new); | 1267 | err = mbind_range(mm, start, end, new); |
1201 | 1268 | ||
1269 | if (!err) { | ||
1270 | int nr_failed = 0; | ||
1271 | |||
1202 | if (!list_empty(&pagelist)) { | 1272 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | ||
1203 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1204 | (unsigned long)vma, | 1275 | (unsigned long)vma, |
1205 | false, MIGRATE_SYNC); | 1276 | false, MIGRATE_SYNC, |
1277 | MR_MEMPOLICY_MBIND); | ||
1206 | if (nr_failed) | 1278 | if (nr_failed) |
1207 | putback_lru_pages(&pagelist); | 1279 | putback_lru_pages(&pagelist); |
1208 | } | 1280 | } |
1209 | 1281 | ||
1210 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1282 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1211 | err = -EIO; | 1283 | err = -EIO; |
1212 | } else | 1284 | } else |
1213 | putback_lru_pages(&pagelist); | 1285 | putback_lru_pages(&pagelist); |
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1388 | goto out_put; | 1460 | goto out_put; |
1389 | } | 1461 | } |
1390 | 1462 | ||
1391 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { | 1463 | if (!nodes_subset(*new, node_states[N_MEMORY])) { |
1392 | err = -EINVAL; | 1464 | err = -EINVAL; |
1393 | goto out_put; | 1465 | goto out_put; |
1394 | } | 1466 | } |
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1546 | struct mempolicy *get_vma_policy(struct task_struct *task, | 1618 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1547 | struct vm_area_struct *vma, unsigned long addr) | 1619 | struct vm_area_struct *vma, unsigned long addr) |
1548 | { | 1620 | { |
1549 | struct mempolicy *pol = task->mempolicy; | 1621 | struct mempolicy *pol = get_task_policy(task); |
1550 | 1622 | ||
1551 | if (vma) { | 1623 | if (vma) { |
1552 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1624 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1907 | unsigned long addr, int node) | 1979 | unsigned long addr, int node) |
1908 | { | 1980 | { |
1909 | struct mempolicy *pol; | 1981 | struct mempolicy *pol; |
1910 | struct zonelist *zl; | ||
1911 | struct page *page; | 1982 | struct page *page; |
1912 | unsigned int cpuset_mems_cookie; | 1983 | unsigned int cpuset_mems_cookie; |
1913 | 1984 | ||
@@ -1926,23 +1997,11 @@ retry_cpuset: | |||
1926 | 1997 | ||
1927 | return page; | 1998 | return page; |
1928 | } | 1999 | } |
1929 | zl = policy_zonelist(gfp, pol, node); | 2000 | page = __alloc_pages_nodemask(gfp, order, |
1930 | if (unlikely(mpol_needs_cond_ref(pol))) { | 2001 | policy_zonelist(gfp, pol, node), |
1931 | /* | ||
1932 | * slow path: ref counted shared policy | ||
1933 | */ | ||
1934 | struct page *page = __alloc_pages_nodemask(gfp, order, | ||
1935 | zl, policy_nodemask(gfp, pol)); | ||
1936 | __mpol_put(pol); | ||
1937 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1938 | goto retry_cpuset; | ||
1939 | return page; | ||
1940 | } | ||
1941 | /* | ||
1942 | * fast path: default or task policy | ||
1943 | */ | ||
1944 | page = __alloc_pages_nodemask(gfp, order, zl, | ||
1945 | policy_nodemask(gfp, pol)); | 2002 | policy_nodemask(gfp, pol)); |
2003 | if (unlikely(mpol_needs_cond_ref(pol))) | ||
2004 | __mpol_put(pol); | ||
1946 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2005 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1947 | goto retry_cpuset; | 2006 | goto retry_cpuset; |
1948 | return page; | 2007 | return page; |
@@ -1969,7 +2028,7 @@ retry_cpuset: | |||
1969 | */ | 2028 | */ |
1970 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2029 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
1971 | { | 2030 | { |
1972 | struct mempolicy *pol = current->mempolicy; | 2031 | struct mempolicy *pol = get_task_policy(current); |
1973 | struct page *page; | 2032 | struct page *page; |
1974 | unsigned int cpuset_mems_cookie; | 2033 | unsigned int cpuset_mems_cookie; |
1975 | 2034 | ||
@@ -2037,28 +2096,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
2037 | return new; | 2096 | return new; |
2038 | } | 2097 | } |
2039 | 2098 | ||
2040 | /* | ||
2041 | * If *frompol needs [has] an extra ref, copy *frompol to *tompol , | ||
2042 | * eliminate the * MPOL_F_* flags that require conditional ref and | ||
2043 | * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly | ||
2044 | * after return. Use the returned value. | ||
2045 | * | ||
2046 | * Allows use of a mempolicy for, e.g., multiple allocations with a single | ||
2047 | * policy lookup, even if the policy needs/has extra ref on lookup. | ||
2048 | * shmem_readahead needs this. | ||
2049 | */ | ||
2050 | struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | ||
2051 | struct mempolicy *frompol) | ||
2052 | { | ||
2053 | if (!mpol_needs_cond_ref(frompol)) | ||
2054 | return frompol; | ||
2055 | |||
2056 | *tompol = *frompol; | ||
2057 | tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ | ||
2058 | __mpol_put(frompol); | ||
2059 | return tompol; | ||
2060 | } | ||
2061 | |||
2062 | /* Slow path of a mempolicy comparison */ | 2099 | /* Slow path of a mempolicy comparison */ |
2063 | bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) | 2100 | bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) |
2064 | { | 2101 | { |
@@ -2095,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
2095 | */ | 2132 | */ |
2096 | 2133 | ||
2097 | /* lookup first element intersecting start-end */ | 2134 | /* lookup first element intersecting start-end */ |
2098 | /* Caller holds sp->mutex */ | 2135 | /* Caller holds sp->lock */ |
2099 | static struct sp_node * | 2136 | static struct sp_node * |
2100 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) | 2137 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) |
2101 | { | 2138 | { |
@@ -2159,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | |||
2159 | 2196 | ||
2160 | if (!sp->root.rb_node) | 2197 | if (!sp->root.rb_node) |
2161 | return NULL; | 2198 | return NULL; |
2162 | mutex_lock(&sp->mutex); | 2199 | spin_lock(&sp->lock); |
2163 | sn = sp_lookup(sp, idx, idx+1); | 2200 | sn = sp_lookup(sp, idx, idx+1); |
2164 | if (sn) { | 2201 | if (sn) { |
2165 | mpol_get(sn->policy); | 2202 | mpol_get(sn->policy); |
2166 | pol = sn->policy; | 2203 | pol = sn->policy; |
2167 | } | 2204 | } |
2168 | mutex_unlock(&sp->mutex); | 2205 | spin_unlock(&sp->lock); |
2169 | return pol; | 2206 | return pol; |
2170 | } | 2207 | } |
2171 | 2208 | ||
@@ -2175,6 +2212,115 @@ static void sp_free(struct sp_node *n) | |||
2175 | kmem_cache_free(sn_cache, n); | 2212 | kmem_cache_free(sn_cache, n); |
2176 | } | 2213 | } |
2177 | 2214 | ||
2215 | /** | ||
2216 | * mpol_misplaced - check whether current page node is valid in policy | ||
2217 | * | ||
2218 | * @page - page to be checked | ||
2219 | * @vma - vm area where page mapped | ||
2220 | * @addr - virtual address where page mapped | ||
2221 | * | ||
2222 | * Lookup current policy node id for vma,addr and "compare to" page's | ||
2223 | * node id. | ||
2224 | * | ||
2225 | * Returns: | ||
2226 | * -1 - not misplaced, page is in the right node | ||
2227 | * node - node id where the page should be | ||
2228 | * | ||
2229 | * Policy determination "mimics" alloc_page_vma(). | ||
2230 | * Called from fault path where we know the vma and faulting address. | ||
2231 | */ | ||
2232 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) | ||
2233 | { | ||
2234 | struct mempolicy *pol; | ||
2235 | struct zone *zone; | ||
2236 | int curnid = page_to_nid(page); | ||
2237 | unsigned long pgoff; | ||
2238 | int polnid = -1; | ||
2239 | int ret = -1; | ||
2240 | |||
2241 | BUG_ON(!vma); | ||
2242 | |||
2243 | pol = get_vma_policy(current, vma, addr); | ||
2244 | if (!(pol->flags & MPOL_F_MOF)) | ||
2245 | goto out; | ||
2246 | |||
2247 | switch (pol->mode) { | ||
2248 | case MPOL_INTERLEAVE: | ||
2249 | BUG_ON(addr >= vma->vm_end); | ||
2250 | BUG_ON(addr < vma->vm_start); | ||
2251 | |||
2252 | pgoff = vma->vm_pgoff; | ||
2253 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
2254 | polnid = offset_il_node(pol, vma, pgoff); | ||
2255 | break; | ||
2256 | |||
2257 | case MPOL_PREFERRED: | ||
2258 | if (pol->flags & MPOL_F_LOCAL) | ||
2259 | polnid = numa_node_id(); | ||
2260 | else | ||
2261 | polnid = pol->v.preferred_node; | ||
2262 | break; | ||
2263 | |||
2264 | case MPOL_BIND: | ||
2265 | /* | ||
2266 | * allows binding to multiple nodes. | ||
2267 | * use current page if in policy nodemask, | ||
2268 | * else select nearest allowed node, if any. | ||
2269 | * If no allowed nodes, use current [!misplaced]. | ||
2270 | */ | ||
2271 | if (node_isset(curnid, pol->v.nodes)) | ||
2272 | goto out; | ||
2273 | (void)first_zones_zonelist( | ||
2274 | node_zonelist(numa_node_id(), GFP_HIGHUSER), | ||
2275 | gfp_zone(GFP_HIGHUSER), | ||
2276 | &pol->v.nodes, &zone); | ||
2277 | polnid = zone->node; | ||
2278 | break; | ||
2279 | |||
2280 | default: | ||
2281 | BUG(); | ||
2282 | } | ||
2283 | |||
2284 | /* Migrate the page towards the node whose CPU is referencing it */ | ||
2285 | if (pol->flags & MPOL_F_MORON) { | ||
2286 | int last_nid; | ||
2287 | |||
2288 | polnid = numa_node_id(); | ||
2289 | |||
2290 | /* | ||
2291 | * Multi-stage node selection is used in conjunction | ||
2292 | * with a periodic migration fault to build a temporal | ||
2293 | * task<->page relation. By using a two-stage filter we | ||
2294 | * remove short/unlikely relations. | ||
2295 | * | ||
2296 | * Using P(p) ~ n_p / n_t as per frequentist | ||
2297 | * probability, we can equate a task's usage of a | ||
2298 | * particular page (n_p) per total usage of this | ||
2299 | * page (n_t) (in a given time-span) to a probability. | ||
2300 | * | ||
2301 | * Our periodic faults will sample this probability and | ||
2302 | * getting the same result twice in a row, given these | ||
2303 | * samples are fully independent, is then given by | ||
2304 | * P(n)^2, provided our sample period is sufficiently | ||
2305 | * short compared to the usage pattern. | ||
2306 | * | ||
2307 | * This quadric squishes small probabilities, making | ||
2308 | * it less likely we act on an unlikely task<->page | ||
2309 | * relation. | ||
2310 | */ | ||
2311 | last_nid = page_xchg_last_nid(page, polnid); | ||
2312 | if (last_nid != polnid) | ||
2313 | goto out; | ||
2314 | } | ||
2315 | |||
2316 | if (curnid != polnid) | ||
2317 | ret = polnid; | ||
2318 | out: | ||
2319 | mpol_cond_put(pol); | ||
2320 | |||
2321 | return ret; | ||
2322 | } | ||
2323 | |||
2178 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2324 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2179 | { | 2325 | { |
2180 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2326 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
@@ -2182,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) | |||
2182 | sp_free(n); | 2328 | sp_free(n); |
2183 | } | 2329 | } |
2184 | 2330 | ||
2331 | static void sp_node_init(struct sp_node *node, unsigned long start, | ||
2332 | unsigned long end, struct mempolicy *pol) | ||
2333 | { | ||
2334 | node->start = start; | ||
2335 | node->end = end; | ||
2336 | node->policy = pol; | ||
2337 | } | ||
2338 | |||
2185 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | 2339 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
2186 | struct mempolicy *pol) | 2340 | struct mempolicy *pol) |
2187 | { | 2341 | { |
@@ -2198,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | |||
2198 | return NULL; | 2352 | return NULL; |
2199 | } | 2353 | } |
2200 | newpol->flags |= MPOL_F_SHARED; | 2354 | newpol->flags |= MPOL_F_SHARED; |
2201 | 2355 | sp_node_init(n, start, end, newpol); | |
2202 | n->start = start; | ||
2203 | n->end = end; | ||
2204 | n->policy = newpol; | ||
2205 | 2356 | ||
2206 | return n; | 2357 | return n; |
2207 | } | 2358 | } |
@@ -2211,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | |||
2211 | unsigned long end, struct sp_node *new) | 2362 | unsigned long end, struct sp_node *new) |
2212 | { | 2363 | { |
2213 | struct sp_node *n; | 2364 | struct sp_node *n; |
2365 | struct sp_node *n_new = NULL; | ||
2366 | struct mempolicy *mpol_new = NULL; | ||
2214 | int ret = 0; | 2367 | int ret = 0; |
2215 | 2368 | ||
2216 | mutex_lock(&sp->mutex); | 2369 | restart: |
2370 | spin_lock(&sp->lock); | ||
2217 | n = sp_lookup(sp, start, end); | 2371 | n = sp_lookup(sp, start, end); |
2218 | /* Take care of old policies in the same range. */ | 2372 | /* Take care of old policies in the same range. */ |
2219 | while (n && n->start < end) { | 2373 | while (n && n->start < end) { |
@@ -2226,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | |||
2226 | } else { | 2380 | } else { |
2227 | /* Old policy spanning whole new range. */ | 2381 | /* Old policy spanning whole new range. */ |
2228 | if (n->end > end) { | 2382 | if (n->end > end) { |
2229 | struct sp_node *new2; | 2383 | if (!n_new) |
2230 | new2 = sp_alloc(end, n->end, n->policy); | 2384 | goto alloc_new; |
2231 | if (!new2) { | 2385 | |
2232 | ret = -ENOMEM; | 2386 | *mpol_new = *n->policy; |
2233 | goto out; | 2387 | atomic_set(&mpol_new->refcnt, 1); |
2234 | } | 2388 | sp_node_init(n_new, n->end, end, mpol_new); |
2389 | sp_insert(sp, n_new); | ||
2235 | n->end = start; | 2390 | n->end = start; |
2236 | sp_insert(sp, new2); | 2391 | n_new = NULL; |
2392 | mpol_new = NULL; | ||
2237 | break; | 2393 | break; |
2238 | } else | 2394 | } else |
2239 | n->end = start; | 2395 | n->end = start; |
@@ -2244,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | |||
2244 | } | 2400 | } |
2245 | if (new) | 2401 | if (new) |
2246 | sp_insert(sp, new); | 2402 | sp_insert(sp, new); |
2247 | out: | 2403 | spin_unlock(&sp->lock); |
2248 | mutex_unlock(&sp->mutex); | 2404 | ret = 0; |
2405 | |||
2406 | err_out: | ||
2407 | if (mpol_new) | ||
2408 | mpol_put(mpol_new); | ||
2409 | if (n_new) | ||
2410 | kmem_cache_free(sn_cache, n_new); | ||
2411 | |||
2249 | return ret; | 2412 | return ret; |
2413 | |||
2414 | alloc_new: | ||
2415 | spin_unlock(&sp->lock); | ||
2416 | ret = -ENOMEM; | ||
2417 | n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); | ||
2418 | if (!n_new) | ||
2419 | goto err_out; | ||
2420 | mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); | ||
2421 | if (!mpol_new) | ||
2422 | goto err_out; | ||
2423 | goto restart; | ||
2250 | } | 2424 | } |
2251 | 2425 | ||
2252 | /** | 2426 | /** |
@@ -2264,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
2264 | int ret; | 2438 | int ret; |
2265 | 2439 | ||
2266 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 2440 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
2267 | mutex_init(&sp->mutex); | 2441 | spin_lock_init(&sp->lock); |
2268 | 2442 | ||
2269 | if (mpol) { | 2443 | if (mpol) { |
2270 | struct vm_area_struct pvma; | 2444 | struct vm_area_struct pvma; |
@@ -2330,16 +2504,60 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2330 | 2504 | ||
2331 | if (!p->root.rb_node) | 2505 | if (!p->root.rb_node) |
2332 | return; | 2506 | return; |
2333 | mutex_lock(&p->mutex); | 2507 | spin_lock(&p->lock); |
2334 | next = rb_first(&p->root); | 2508 | next = rb_first(&p->root); |
2335 | while (next) { | 2509 | while (next) { |
2336 | n = rb_entry(next, struct sp_node, nd); | 2510 | n = rb_entry(next, struct sp_node, nd); |
2337 | next = rb_next(&n->nd); | 2511 | next = rb_next(&n->nd); |
2338 | sp_delete(p, n); | 2512 | sp_delete(p, n); |
2339 | } | 2513 | } |
2340 | mutex_unlock(&p->mutex); | 2514 | spin_unlock(&p->lock); |
2515 | } | ||
2516 | |||
2517 | #ifdef CONFIG_NUMA_BALANCING | ||
2518 | static bool __initdata numabalancing_override; | ||
2519 | |||
2520 | static void __init check_numabalancing_enable(void) | ||
2521 | { | ||
2522 | bool numabalancing_default = false; | ||
2523 | |||
2524 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | ||
2525 | numabalancing_default = true; | ||
2526 | |||
2527 | if (nr_node_ids > 1 && !numabalancing_override) { | ||
2528 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | ||
2529 | "Configure with numa_balancing= or sysctl"); | ||
2530 | set_numabalancing_state(numabalancing_default); | ||
2531 | } | ||
2341 | } | 2532 | } |
2342 | 2533 | ||
2534 | static int __init setup_numabalancing(char *str) | ||
2535 | { | ||
2536 | int ret = 0; | ||
2537 | if (!str) | ||
2538 | goto out; | ||
2539 | numabalancing_override = true; | ||
2540 | |||
2541 | if (!strcmp(str, "enable")) { | ||
2542 | set_numabalancing_state(true); | ||
2543 | ret = 1; | ||
2544 | } else if (!strcmp(str, "disable")) { | ||
2545 | set_numabalancing_state(false); | ||
2546 | ret = 1; | ||
2547 | } | ||
2548 | out: | ||
2549 | if (!ret) | ||
2550 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | ||
2551 | |||
2552 | return ret; | ||
2553 | } | ||
2554 | __setup("numa_balancing=", setup_numabalancing); | ||
2555 | #else | ||
2556 | static inline void __init check_numabalancing_enable(void) | ||
2557 | { | ||
2558 | } | ||
2559 | #endif /* CONFIG_NUMA_BALANCING */ | ||
2560 | |||
2343 | /* assumes fs == KERNEL_DS */ | 2561 | /* assumes fs == KERNEL_DS */ |
2344 | void __init numa_policy_init(void) | 2562 | void __init numa_policy_init(void) |
2345 | { | 2563 | { |
@@ -2355,13 +2573,22 @@ void __init numa_policy_init(void) | |||
2355 | sizeof(struct sp_node), | 2573 | sizeof(struct sp_node), |
2356 | 0, SLAB_PANIC, NULL); | 2574 | 0, SLAB_PANIC, NULL); |
2357 | 2575 | ||
2576 | for_each_node(nid) { | ||
2577 | preferred_node_policy[nid] = (struct mempolicy) { | ||
2578 | .refcnt = ATOMIC_INIT(1), | ||
2579 | .mode = MPOL_PREFERRED, | ||
2580 | .flags = MPOL_F_MOF | MPOL_F_MORON, | ||
2581 | .v = { .preferred_node = nid, }, | ||
2582 | }; | ||
2583 | } | ||
2584 | |||
2358 | /* | 2585 | /* |
2359 | * Set interleaving policy for system init. Interleaving is only | 2586 | * Set interleaving policy for system init. Interleaving is only |
2360 | * enabled across suitably sized nodes (default is >= 16MB), or | 2587 | * enabled across suitably sized nodes (default is >= 16MB), or |
2361 | * fall back to the largest node if they're all smaller. | 2588 | * fall back to the largest node if they're all smaller. |
2362 | */ | 2589 | */ |
2363 | nodes_clear(interleave_nodes); | 2590 | nodes_clear(interleave_nodes); |
2364 | for_each_node_state(nid, N_HIGH_MEMORY) { | 2591 | for_each_node_state(nid, N_MEMORY) { |
2365 | unsigned long total_pages = node_present_pages(nid); | 2592 | unsigned long total_pages = node_present_pages(nid); |
2366 | 2593 | ||
2367 | /* Preserve the largest node */ | 2594 | /* Preserve the largest node */ |
@@ -2381,6 +2608,8 @@ void __init numa_policy_init(void) | |||
2381 | 2608 | ||
2382 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) | 2609 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
2383 | printk("numa_policy_init: interleaving failed\n"); | 2610 | printk("numa_policy_init: interleaving failed\n"); |
2611 | |||
2612 | check_numabalancing_enable(); | ||
2384 | } | 2613 | } |
2385 | 2614 | ||
2386 | /* Reset policy of current process to default */ | 2615 | /* Reset policy of current process to default */ |
@@ -2394,44 +2623,34 @@ void numa_default_policy(void) | |||
2394 | */ | 2623 | */ |
2395 | 2624 | ||
2396 | /* | 2625 | /* |
2397 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag | 2626 | * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. |
2398 | * Used only for mpol_parse_str() and mpol_to_str() | ||
2399 | */ | 2627 | */ |
2400 | #define MPOL_LOCAL MPOL_MAX | ||
2401 | static const char * const policy_modes[] = | 2628 | static const char * const policy_modes[] = |
2402 | { | 2629 | { |
2403 | [MPOL_DEFAULT] = "default", | 2630 | [MPOL_DEFAULT] = "default", |
2404 | [MPOL_PREFERRED] = "prefer", | 2631 | [MPOL_PREFERRED] = "prefer", |
2405 | [MPOL_BIND] = "bind", | 2632 | [MPOL_BIND] = "bind", |
2406 | [MPOL_INTERLEAVE] = "interleave", | 2633 | [MPOL_INTERLEAVE] = "interleave", |
2407 | [MPOL_LOCAL] = "local" | 2634 | [MPOL_LOCAL] = "local", |
2408 | }; | 2635 | }; |
2409 | 2636 | ||
2410 | 2637 | ||
2411 | #ifdef CONFIG_TMPFS | 2638 | #ifdef CONFIG_TMPFS |
2412 | /** | 2639 | /** |
2413 | * mpol_parse_str - parse string to mempolicy | 2640 | * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. |
2414 | * @str: string containing mempolicy to parse | 2641 | * @str: string containing mempolicy to parse |
2415 | * @mpol: pointer to struct mempolicy pointer, returned on success. | 2642 | * @mpol: pointer to struct mempolicy pointer, returned on success. |
2416 | * @no_context: flag whether to "contextualize" the mempolicy | ||
2417 | * | 2643 | * |
2418 | * Format of input: | 2644 | * Format of input: |
2419 | * <mode>[=<flags>][:<nodelist>] | 2645 | * <mode>[=<flags>][:<nodelist>] |
2420 | * | 2646 | * |
2421 | * if @no_context is true, save the input nodemask in w.user_nodemask in | ||
2422 | * the returned mempolicy. This will be used to "clone" the mempolicy in | ||
2423 | * a specific context [cpuset] at a later time. Used to parse tmpfs mpol | ||
2424 | * mount option. Note that if 'static' or 'relative' mode flags were | ||
2425 | * specified, the input nodemask will already have been saved. Saving | ||
2426 | * it again is redundant, but safe. | ||
2427 | * | ||
2428 | * On success, returns 0, else 1 | 2647 | * On success, returns 0, else 1 |
2429 | */ | 2648 | */ |
2430 | int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | 2649 | int mpol_parse_str(char *str, struct mempolicy **mpol) |
2431 | { | 2650 | { |
2432 | struct mempolicy *new = NULL; | 2651 | struct mempolicy *new = NULL; |
2433 | unsigned short mode; | 2652 | unsigned short mode; |
2434 | unsigned short uninitialized_var(mode_flags); | 2653 | unsigned short mode_flags; |
2435 | nodemask_t nodes; | 2654 | nodemask_t nodes; |
2436 | char *nodelist = strchr(str, ':'); | 2655 | char *nodelist = strchr(str, ':'); |
2437 | char *flags = strchr(str, '='); | 2656 | char *flags = strchr(str, '='); |
@@ -2442,7 +2661,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2442 | *nodelist++ = '\0'; | 2661 | *nodelist++ = '\0'; |
2443 | if (nodelist_parse(nodelist, nodes)) | 2662 | if (nodelist_parse(nodelist, nodes)) |
2444 | goto out; | 2663 | goto out; |
2445 | if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) | 2664 | if (!nodes_subset(nodes, node_states[N_MEMORY])) |
2446 | goto out; | 2665 | goto out; |
2447 | } else | 2666 | } else |
2448 | nodes_clear(nodes); | 2667 | nodes_clear(nodes); |
@@ -2450,12 +2669,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2450 | if (flags) | 2669 | if (flags) |
2451 | *flags++ = '\0'; /* terminate mode string */ | 2670 | *flags++ = '\0'; /* terminate mode string */ |
2452 | 2671 | ||
2453 | for (mode = 0; mode <= MPOL_LOCAL; mode++) { | 2672 | for (mode = 0; mode < MPOL_MAX; mode++) { |
2454 | if (!strcmp(str, policy_modes[mode])) { | 2673 | if (!strcmp(str, policy_modes[mode])) { |
2455 | break; | 2674 | break; |
2456 | } | 2675 | } |
2457 | } | 2676 | } |
2458 | if (mode > MPOL_LOCAL) | 2677 | if (mode >= MPOL_MAX) |
2459 | goto out; | 2678 | goto out; |
2460 | 2679 | ||
2461 | switch (mode) { | 2680 | switch (mode) { |
@@ -2476,7 +2695,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2476 | * Default to online nodes with memory if no nodelist | 2695 | * Default to online nodes with memory if no nodelist |
2477 | */ | 2696 | */ |
2478 | if (!nodelist) | 2697 | if (!nodelist) |
2479 | nodes = node_states[N_HIGH_MEMORY]; | 2698 | nodes = node_states[N_MEMORY]; |
2480 | break; | 2699 | break; |
2481 | case MPOL_LOCAL: | 2700 | case MPOL_LOCAL: |
2482 | /* | 2701 | /* |
@@ -2519,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2519 | if (IS_ERR(new)) | 2738 | if (IS_ERR(new)) |
2520 | goto out; | 2739 | goto out; |
2521 | 2740 | ||
2522 | if (no_context) { | 2741 | /* |
2523 | /* save for contextualization */ | 2742 | * Save nodes for mpol_to_str() to show the tmpfs mount options |
2524 | new->w.user_nodemask = nodes; | 2743 | * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. |
2525 | } else { | 2744 | */ |
2526 | int ret; | 2745 | if (mode != MPOL_PREFERRED) |
2527 | NODEMASK_SCRATCH(scratch); | 2746 | new->v.nodes = nodes; |
2528 | if (scratch) { | 2747 | else if (nodelist) |
2529 | task_lock(current); | 2748 | new->v.preferred_node = first_node(nodes); |
2530 | ret = mpol_set_nodemask(new, &nodes, scratch); | 2749 | else |
2531 | task_unlock(current); | 2750 | new->flags |= MPOL_F_LOCAL; |
2532 | } else | 2751 | |
2533 | ret = -ENOMEM; | 2752 | /* |
2534 | NODEMASK_SCRATCH_FREE(scratch); | 2753 | * Save nodes for contextualization: this will be used to "clone" |
2535 | if (ret) { | 2754 | * the mempolicy in a specific context [cpuset] at a later time. |
2536 | mpol_put(new); | 2755 | */ |
2537 | goto out; | 2756 | new->w.user_nodemask = nodes; |
2538 | } | 2757 | |
2539 | } | ||
2540 | err = 0; | 2758 | err = 0; |
2541 | 2759 | ||
2542 | out: | 2760 | out: |
@@ -2556,13 +2774,12 @@ out: | |||
2556 | * @buffer: to contain formatted mempolicy string | 2774 | * @buffer: to contain formatted mempolicy string |
2557 | * @maxlen: length of @buffer | 2775 | * @maxlen: length of @buffer |
2558 | * @pol: pointer to mempolicy to be formatted | 2776 | * @pol: pointer to mempolicy to be formatted |
2559 | * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask | ||
2560 | * | 2777 | * |
2561 | * Convert a mempolicy into a string. | 2778 | * Convert a mempolicy into a string. |
2562 | * Returns the number of characters in buffer (if positive) | 2779 | * Returns the number of characters in buffer (if positive) |
2563 | * or an error (negative) | 2780 | * or an error (negative) |
2564 | */ | 2781 | */ |
2565 | int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) | 2782 | int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) |
2566 | { | 2783 | { |
2567 | char *p = buffer; | 2784 | char *p = buffer; |
2568 | int l; | 2785 | int l; |
@@ -2588,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) | |||
2588 | case MPOL_PREFERRED: | 2805 | case MPOL_PREFERRED: |
2589 | nodes_clear(nodes); | 2806 | nodes_clear(nodes); |
2590 | if (flags & MPOL_F_LOCAL) | 2807 | if (flags & MPOL_F_LOCAL) |
2591 | mode = MPOL_LOCAL; /* pseudo-policy */ | 2808 | mode = MPOL_LOCAL; |
2592 | else | 2809 | else |
2593 | node_set(pol->v.preferred_node, nodes); | 2810 | node_set(pol->v.preferred_node, nodes); |
2594 | break; | 2811 | break; |
@@ -2596,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) | |||
2596 | case MPOL_BIND: | 2813 | case MPOL_BIND: |
2597 | /* Fall through */ | 2814 | /* Fall through */ |
2598 | case MPOL_INTERLEAVE: | 2815 | case MPOL_INTERLEAVE: |
2599 | if (no_context) | 2816 | nodes = pol->v.nodes; |
2600 | nodes = pol->w.user_nodemask; | ||
2601 | else | ||
2602 | nodes = pol->v.nodes; | ||
2603 | break; | 2817 | break; |
2604 | 2818 | ||
2605 | default: | 2819 | default: |
diff --git a/mm/migrate.c b/mm/migrate.c index 77ed2d773705..c38778610aa8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -35,9 +35,13 @@ | |||
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | 36 | #include <linux/hugetlb_cgroup.h> |
37 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
38 | #include <linux/balloon_compaction.h> | ||
38 | 39 | ||
39 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
40 | 41 | ||
42 | #define CREATE_TRACE_POINTS | ||
43 | #include <trace/events/migrate.h> | ||
44 | |||
41 | #include "internal.h" | 45 | #include "internal.h" |
42 | 46 | ||
43 | /* | 47 | /* |
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l) | |||
79 | list_del(&page->lru); | 83 | list_del(&page->lru); |
80 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 84 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
81 | page_is_file_cache(page)); | 85 | page_is_file_cache(page)); |
82 | putback_lru_page(page); | 86 | putback_lru_page(page); |
87 | } | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * Put previously isolated pages back onto the appropriate lists | ||
92 | * from where they were once taken off for compaction/migration. | ||
93 | * | ||
94 | * This function shall be used instead of putback_lru_pages(), | ||
95 | * whenever the isolated pageset has been built by isolate_migratepages_range() | ||
96 | */ | ||
97 | void putback_movable_pages(struct list_head *l) | ||
98 | { | ||
99 | struct page *page; | ||
100 | struct page *page2; | ||
101 | |||
102 | list_for_each_entry_safe(page, page2, l, lru) { | ||
103 | list_del(&page->lru); | ||
104 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
105 | page_is_file_cache(page)); | ||
106 | if (unlikely(balloon_page_movable(page))) | ||
107 | balloon_page_putback(page); | ||
108 | else | ||
109 | putback_lru_page(page); | ||
83 | } | 110 | } |
84 | } | 111 | } |
85 | 112 | ||
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
91 | { | 118 | { |
92 | struct mm_struct *mm = vma->vm_mm; | 119 | struct mm_struct *mm = vma->vm_mm; |
93 | swp_entry_t entry; | 120 | swp_entry_t entry; |
94 | pgd_t *pgd; | ||
95 | pud_t *pud; | ||
96 | pmd_t *pmd; | 121 | pmd_t *pmd; |
97 | pte_t *ptep, pte; | 122 | pte_t *ptep, pte; |
98 | spinlock_t *ptl; | 123 | spinlock_t *ptl; |
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
103 | goto out; | 128 | goto out; |
104 | ptl = &mm->page_table_lock; | 129 | ptl = &mm->page_table_lock; |
105 | } else { | 130 | } else { |
106 | pgd = pgd_offset(mm, addr); | 131 | pmd = mm_find_pmd(mm, addr); |
107 | if (!pgd_present(*pgd)) | 132 | if (!pmd) |
108 | goto out; | ||
109 | |||
110 | pud = pud_offset(pgd, addr); | ||
111 | if (!pud_present(*pud)) | ||
112 | goto out; | 133 | goto out; |
113 | |||
114 | pmd = pmd_offset(pud, addr); | ||
115 | if (pmd_trans_huge(*pmd)) | 134 | if (pmd_trans_huge(*pmd)) |
116 | goto out; | 135 | goto out; |
117 | if (!pmd_present(*pmd)) | ||
118 | goto out; | ||
119 | 136 | ||
120 | ptep = pte_offset_map(pmd, addr); | 137 | ptep = pte_offset_map(pmd, addr); |
121 | 138 | ||
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
279 | struct page *newpage, struct page *page, | 296 | struct page *newpage, struct page *page, |
280 | struct buffer_head *head, enum migrate_mode mode) | 297 | struct buffer_head *head, enum migrate_mode mode) |
281 | { | 298 | { |
282 | int expected_count; | 299 | int expected_count = 0; |
283 | void **pslot; | 300 | void **pslot; |
284 | 301 | ||
285 | if (!mapping) { | 302 | if (!mapping) { |
286 | /* Anonymous page without mapping */ | 303 | /* Anonymous page without mapping */ |
287 | if (page_count(page) != 1) | 304 | if (page_count(page) != 1) |
288 | return -EAGAIN; | 305 | return -EAGAIN; |
289 | return 0; | 306 | return MIGRATEPAGE_SUCCESS; |
290 | } | 307 | } |
291 | 308 | ||
292 | spin_lock_irq(&mapping->tree_lock); | 309 | spin_lock_irq(&mapping->tree_lock); |
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
356 | } | 373 | } |
357 | spin_unlock_irq(&mapping->tree_lock); | 374 | spin_unlock_irq(&mapping->tree_lock); |
358 | 375 | ||
359 | return 0; | 376 | return MIGRATEPAGE_SUCCESS; |
360 | } | 377 | } |
361 | 378 | ||
362 | /* | 379 | /* |
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
372 | if (!mapping) { | 389 | if (!mapping) { |
373 | if (page_count(page) != 1) | 390 | if (page_count(page) != 1) |
374 | return -EAGAIN; | 391 | return -EAGAIN; |
375 | return 0; | 392 | return MIGRATEPAGE_SUCCESS; |
376 | } | 393 | } |
377 | 394 | ||
378 | spin_lock_irq(&mapping->tree_lock); | 395 | spin_lock_irq(&mapping->tree_lock); |
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
399 | page_unfreeze_refs(page, expected_count - 1); | 416 | page_unfreeze_refs(page, expected_count - 1); |
400 | 417 | ||
401 | spin_unlock_irq(&mapping->tree_lock); | 418 | spin_unlock_irq(&mapping->tree_lock); |
402 | return 0; | 419 | return MIGRATEPAGE_SUCCESS; |
403 | } | 420 | } |
404 | 421 | ||
405 | /* | 422 | /* |
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
407 | */ | 424 | */ |
408 | void migrate_page_copy(struct page *newpage, struct page *page) | 425 | void migrate_page_copy(struct page *newpage, struct page *page) |
409 | { | 426 | { |
410 | if (PageHuge(page)) | 427 | if (PageHuge(page) || PageTransHuge(page)) |
411 | copy_huge_page(newpage, page); | 428 | copy_huge_page(newpage, page); |
412 | else | 429 | else |
413 | copy_highpage(newpage, page); | 430 | copy_highpage(newpage, page); |
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping, | |||
486 | 503 | ||
487 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); | 504 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); |
488 | 505 | ||
489 | if (rc) | 506 | if (rc != MIGRATEPAGE_SUCCESS) |
490 | return rc; | 507 | return rc; |
491 | 508 | ||
492 | migrate_page_copy(newpage, page); | 509 | migrate_page_copy(newpage, page); |
493 | return 0; | 510 | return MIGRATEPAGE_SUCCESS; |
494 | } | 511 | } |
495 | EXPORT_SYMBOL(migrate_page); | 512 | EXPORT_SYMBOL(migrate_page); |
496 | 513 | ||
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping, | |||
513 | 530 | ||
514 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); | 531 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); |
515 | 532 | ||
516 | if (rc) | 533 | if (rc != MIGRATEPAGE_SUCCESS) |
517 | return rc; | 534 | return rc; |
518 | 535 | ||
519 | /* | 536 | /* |
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping, | |||
549 | 566 | ||
550 | } while (bh != head); | 567 | } while (bh != head); |
551 | 568 | ||
552 | return 0; | 569 | return MIGRATEPAGE_SUCCESS; |
553 | } | 570 | } |
554 | EXPORT_SYMBOL(buffer_migrate_page); | 571 | EXPORT_SYMBOL(buffer_migrate_page); |
555 | #endif | 572 | #endif |
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
628 | * | 645 | * |
629 | * Return value: | 646 | * Return value: |
630 | * < 0 - error code | 647 | * < 0 - error code |
631 | * == 0 - success | 648 | * MIGRATEPAGE_SUCCESS - success |
632 | */ | 649 | */ |
633 | static int move_to_new_page(struct page *newpage, struct page *page, | 650 | static int move_to_new_page(struct page *newpage, struct page *page, |
634 | int remap_swapcache, enum migrate_mode mode) | 651 | int remap_swapcache, enum migrate_mode mode) |
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
665 | else | 682 | else |
666 | rc = fallback_migrate_page(mapping, newpage, page, mode); | 683 | rc = fallback_migrate_page(mapping, newpage, page, mode); |
667 | 684 | ||
668 | if (rc) { | 685 | if (rc != MIGRATEPAGE_SUCCESS) { |
669 | newpage->mapping = NULL; | 686 | newpage->mapping = NULL; |
670 | } else { | 687 | } else { |
671 | if (remap_swapcache) | 688 | if (remap_swapcache) |
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
751 | */ | 768 | */ |
752 | if (PageAnon(page)) { | 769 | if (PageAnon(page)) { |
753 | /* | 770 | /* |
754 | * Only page_lock_anon_vma() understands the subtleties of | 771 | * Only page_lock_anon_vma_read() understands the subtleties of |
755 | * getting a hold on an anon_vma from outside one of its mms. | 772 | * getting a hold on an anon_vma from outside one of its mms. |
756 | */ | 773 | */ |
757 | anon_vma = page_get_anon_vma(page); | 774 | anon_vma = page_get_anon_vma(page); |
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
778 | } | 795 | } |
779 | } | 796 | } |
780 | 797 | ||
798 | if (unlikely(balloon_page_movable(page))) { | ||
799 | /* | ||
800 | * A ballooned page does not need any special attention from | ||
801 | * physical to virtual reverse mapping procedures. | ||
802 | * Skip any attempt to unmap PTEs or to remap swap cache, | ||
803 | * in order to avoid burning cycles at rmap level, and perform | ||
804 | * the page migration right away (proteced by page lock). | ||
805 | */ | ||
806 | rc = balloon_page_migrate(newpage, page, mode); | ||
807 | goto uncharge; | ||
808 | } | ||
809 | |||
781 | /* | 810 | /* |
782 | * Corner case handling: | 811 | * Corner case handling: |
783 | * 1. When a new swap-cache page is read into, it is added to the LRU | 812 | * 1. When a new swap-cache page is read into, it is added to the LRU |
@@ -814,7 +843,9 @@ skip_unmap: | |||
814 | put_anon_vma(anon_vma); | 843 | put_anon_vma(anon_vma); |
815 | 844 | ||
816 | uncharge: | 845 | uncharge: |
817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | 846 | mem_cgroup_end_migration(mem, page, newpage, |
847 | (rc == MIGRATEPAGE_SUCCESS || | ||
848 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); | ||
818 | unlock: | 849 | unlock: |
819 | unlock_page(page); | 850 | unlock_page(page); |
820 | out: | 851 | out: |
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
846 | goto out; | 877 | goto out; |
847 | 878 | ||
848 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | 879 | rc = __unmap_and_move(page, newpage, force, offlining, mode); |
880 | |||
881 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | ||
882 | /* | ||
883 | * A ballooned page has been migrated already. | ||
884 | * Now, it's the time to wrap-up counters, | ||
885 | * handle the page back to Buddy and return. | ||
886 | */ | ||
887 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
888 | page_is_file_cache(page)); | ||
889 | balloon_page_free(page); | ||
890 | return MIGRATEPAGE_SUCCESS; | ||
891 | } | ||
849 | out: | 892 | out: |
850 | if (rc != -EAGAIN) { | 893 | if (rc != -EAGAIN) { |
851 | /* | 894 | /* |
@@ -958,10 +1001,11 @@ out: | |||
958 | */ | 1001 | */ |
959 | int migrate_pages(struct list_head *from, | 1002 | int migrate_pages(struct list_head *from, |
960 | new_page_t get_new_page, unsigned long private, bool offlining, | 1003 | new_page_t get_new_page, unsigned long private, bool offlining, |
961 | enum migrate_mode mode) | 1004 | enum migrate_mode mode, int reason) |
962 | { | 1005 | { |
963 | int retry = 1; | 1006 | int retry = 1; |
964 | int nr_failed = 0; | 1007 | int nr_failed = 0; |
1008 | int nr_succeeded = 0; | ||
965 | int pass = 0; | 1009 | int pass = 0; |
966 | struct page *page; | 1010 | struct page *page; |
967 | struct page *page2; | 1011 | struct page *page2; |
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from, | |||
987 | case -EAGAIN: | 1031 | case -EAGAIN: |
988 | retry++; | 1032 | retry++; |
989 | break; | 1033 | break; |
990 | case 0: | 1034 | case MIGRATEPAGE_SUCCESS: |
1035 | nr_succeeded++; | ||
991 | break; | 1036 | break; |
992 | default: | 1037 | default: |
993 | /* Permanent failure */ | 1038 | /* Permanent failure */ |
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from, | |||
996 | } | 1041 | } |
997 | } | 1042 | } |
998 | } | 1043 | } |
999 | rc = 0; | 1044 | rc = nr_failed + retry; |
1000 | out: | 1045 | out: |
1046 | if (nr_succeeded) | ||
1047 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); | ||
1048 | if (nr_failed) | ||
1049 | count_vm_events(PGMIGRATE_FAIL, nr_failed); | ||
1050 | trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); | ||
1051 | |||
1001 | if (!swapwrite) | 1052 | if (!swapwrite) |
1002 | current->flags &= ~PF_SWAPWRITE; | 1053 | current->flags &= ~PF_SWAPWRITE; |
1003 | 1054 | ||
1004 | if (rc) | 1055 | return rc; |
1005 | return rc; | ||
1006 | |||
1007 | return nr_failed + retry; | ||
1008 | } | 1056 | } |
1009 | 1057 | ||
1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | 1058 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | |||
1024 | /* try again */ | 1072 | /* try again */ |
1025 | cond_resched(); | 1073 | cond_resched(); |
1026 | break; | 1074 | break; |
1027 | case 0: | 1075 | case MIGRATEPAGE_SUCCESS: |
1028 | goto out; | 1076 | goto out; |
1029 | default: | 1077 | default: |
1030 | rc = -EIO; | 1078 | rc = -EIO; |
@@ -1139,7 +1187,8 @@ set_status: | |||
1139 | err = 0; | 1187 | err = 0; |
1140 | if (!list_empty(&pagelist)) { | 1188 | if (!list_empty(&pagelist)) { |
1141 | err = migrate_pages(&pagelist, new_page_node, | 1189 | err = migrate_pages(&pagelist, new_page_node, |
1142 | (unsigned long)pm, 0, MIGRATE_SYNC); | 1190 | (unsigned long)pm, 0, MIGRATE_SYNC, |
1191 | MR_SYSCALL); | ||
1143 | if (err) | 1192 | if (err) |
1144 | putback_lru_pages(&pagelist); | 1193 | putback_lru_pages(&pagelist); |
1145 | } | 1194 | } |
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, | |||
1201 | if (node < 0 || node >= MAX_NUMNODES) | 1250 | if (node < 0 || node >= MAX_NUMNODES) |
1202 | goto out_pm; | 1251 | goto out_pm; |
1203 | 1252 | ||
1204 | if (!node_state(node, N_HIGH_MEMORY)) | 1253 | if (!node_state(node, N_MEMORY)) |
1205 | goto out_pm; | 1254 | goto out_pm; |
1206 | 1255 | ||
1207 | err = -EACCES; | 1256 | err = -EACCES; |
@@ -1403,4 +1452,329 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1403 | } | 1452 | } |
1404 | return err; | 1453 | return err; |
1405 | } | 1454 | } |
1406 | #endif | 1455 | |
1456 | #ifdef CONFIG_NUMA_BALANCING | ||
1457 | /* | ||
1458 | * Returns true if this is a safe migration target node for misplaced NUMA | ||
1459 | * pages. Currently it only checks the watermarks which crude | ||
1460 | */ | ||
1461 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | ||
1462 | int nr_migrate_pages) | ||
1463 | { | ||
1464 | int z; | ||
1465 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | ||
1466 | struct zone *zone = pgdat->node_zones + z; | ||
1467 | |||
1468 | if (!populated_zone(zone)) | ||
1469 | continue; | ||
1470 | |||
1471 | if (zone->all_unreclaimable) | ||
1472 | continue; | ||
1473 | |||
1474 | /* Avoid waking kswapd by allocating pages_to_migrate pages. */ | ||
1475 | if (!zone_watermark_ok(zone, 0, | ||
1476 | high_wmark_pages(zone) + | ||
1477 | nr_migrate_pages, | ||
1478 | 0, 0)) | ||
1479 | continue; | ||
1480 | return true; | ||
1481 | } | ||
1482 | return false; | ||
1483 | } | ||
1484 | |||
1485 | static struct page *alloc_misplaced_dst_page(struct page *page, | ||
1486 | unsigned long data, | ||
1487 | int **result) | ||
1488 | { | ||
1489 | int nid = (int) data; | ||
1490 | struct page *newpage; | ||
1491 | |||
1492 | newpage = alloc_pages_exact_node(nid, | ||
1493 | (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | | ||
1494 | __GFP_NOMEMALLOC | __GFP_NORETRY | | ||
1495 | __GFP_NOWARN) & | ||
1496 | ~GFP_IOFS, 0); | ||
1497 | if (newpage) | ||
1498 | page_xchg_last_nid(newpage, page_last_nid(page)); | ||
1499 | |||
1500 | return newpage; | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * page migration rate limiting control. | ||
1505 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs | ||
1506 | * window of time. Default here says do not migrate more than 1280M per second. | ||
1507 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However | ||
1508 | * as it is faults that reset the window, pte updates will happen unconditionally | ||
1509 | * if there has not been a fault since @pteupdate_interval_millisecs after the | ||
1510 | * throttle window closed. | ||
1511 | */ | ||
1512 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | ||
1513 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
1514 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | ||
1515 | |||
1516 | /* Returns true if NUMA migration is currently rate limited */ | ||
1517 | bool migrate_ratelimited(int node) | ||
1518 | { | ||
1519 | pg_data_t *pgdat = NODE_DATA(node); | ||
1520 | |||
1521 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
1522 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
1523 | return false; | ||
1524 | |||
1525 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
1526 | return false; | ||
1527 | |||
1528 | return true; | ||
1529 | } | ||
1530 | |||
1531 | /* Returns true if the node is migrate rate-limited after the update */ | ||
1532 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | ||
1533 | { | ||
1534 | bool rate_limited = false; | ||
1535 | |||
1536 | /* | ||
1537 | * Rate-limit the amount of data that is being migrated to a node. | ||
1538 | * Optimal placement is no good if the memory bus is saturated and | ||
1539 | * all the time is being spent migrating! | ||
1540 | */ | ||
1541 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1542 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | ||
1543 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
1544 | pgdat->numabalancing_migrate_next_window = jiffies + | ||
1545 | msecs_to_jiffies(migrate_interval_millisecs); | ||
1546 | } | ||
1547 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | ||
1548 | rate_limited = true; | ||
1549 | else | ||
1550 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1551 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1552 | |||
1553 | return rate_limited; | ||
1554 | } | ||
1555 | |||
1556 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | ||
1557 | { | ||
1558 | int ret = 0; | ||
1559 | |||
1560 | /* Avoid migrating to a node that is nearly full */ | ||
1561 | if (migrate_balanced_pgdat(pgdat, 1)) { | ||
1562 | int page_lru; | ||
1563 | |||
1564 | if (isolate_lru_page(page)) { | ||
1565 | put_page(page); | ||
1566 | return 0; | ||
1567 | } | ||
1568 | |||
1569 | /* Page is isolated */ | ||
1570 | ret = 1; | ||
1571 | page_lru = page_is_file_cache(page); | ||
1572 | if (!PageTransHuge(page)) | ||
1573 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | ||
1574 | else | ||
1575 | mod_zone_page_state(page_zone(page), | ||
1576 | NR_ISOLATED_ANON + page_lru, | ||
1577 | HPAGE_PMD_NR); | ||
1578 | } | ||
1579 | |||
1580 | /* | ||
1581 | * Page is either isolated or there is not enough space on the target | ||
1582 | * node. If isolated, then it has taken a reference count and the | ||
1583 | * callers reference can be safely dropped without the page | ||
1584 | * disappearing underneath us during migration. Otherwise the page is | ||
1585 | * not to be migrated but the callers reference should still be | ||
1586 | * dropped so it does not leak. | ||
1587 | */ | ||
1588 | put_page(page); | ||
1589 | |||
1590 | return ret; | ||
1591 | } | ||
1592 | |||
1593 | /* | ||
1594 | * Attempt to migrate a misplaced page to the specified destination | ||
1595 | * node. Caller is expected to have an elevated reference count on | ||
1596 | * the page that will be dropped by this function before returning. | ||
1597 | */ | ||
1598 | int migrate_misplaced_page(struct page *page, int node) | ||
1599 | { | ||
1600 | pg_data_t *pgdat = NODE_DATA(node); | ||
1601 | int isolated = 0; | ||
1602 | int nr_remaining; | ||
1603 | LIST_HEAD(migratepages); | ||
1604 | |||
1605 | /* | ||
1606 | * Don't migrate pages that are mapped in multiple processes. | ||
1607 | * TODO: Handle false sharing detection instead of this hammer | ||
1608 | */ | ||
1609 | if (page_mapcount(page) != 1) { | ||
1610 | put_page(page); | ||
1611 | goto out; | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * Rate-limit the amount of data that is being migrated to a node. | ||
1616 | * Optimal placement is no good if the memory bus is saturated and | ||
1617 | * all the time is being spent migrating! | ||
1618 | */ | ||
1619 | if (numamigrate_update_ratelimit(pgdat, 1)) { | ||
1620 | put_page(page); | ||
1621 | goto out; | ||
1622 | } | ||
1623 | |||
1624 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1625 | if (!isolated) | ||
1626 | goto out; | ||
1627 | |||
1628 | list_add(&page->lru, &migratepages); | ||
1629 | nr_remaining = migrate_pages(&migratepages, | ||
1630 | alloc_misplaced_dst_page, | ||
1631 | node, false, MIGRATE_ASYNC, | ||
1632 | MR_NUMA_MISPLACED); | ||
1633 | if (nr_remaining) { | ||
1634 | putback_lru_pages(&migratepages); | ||
1635 | isolated = 0; | ||
1636 | } else | ||
1637 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | ||
1638 | BUG_ON(!list_empty(&migratepages)); | ||
1639 | out: | ||
1640 | return isolated; | ||
1641 | } | ||
1642 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1643 | |||
1644 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
1645 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
1646 | struct vm_area_struct *vma, | ||
1647 | pmd_t *pmd, pmd_t entry, | ||
1648 | unsigned long address, | ||
1649 | struct page *page, int node) | ||
1650 | { | ||
1651 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
1652 | pg_data_t *pgdat = NODE_DATA(node); | ||
1653 | int isolated = 0; | ||
1654 | struct page *new_page = NULL; | ||
1655 | struct mem_cgroup *memcg = NULL; | ||
1656 | int page_lru = page_is_file_cache(page); | ||
1657 | |||
1658 | /* | ||
1659 | * Don't migrate pages that are mapped in multiple processes. | ||
1660 | * TODO: Handle false sharing detection instead of this hammer | ||
1661 | */ | ||
1662 | if (page_mapcount(page) != 1) | ||
1663 | goto out_dropref; | ||
1664 | |||
1665 | /* | ||
1666 | * Rate-limit the amount of data that is being migrated to a node. | ||
1667 | * Optimal placement is no good if the memory bus is saturated and | ||
1668 | * all the time is being spent migrating! | ||
1669 | */ | ||
1670 | if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) | ||
1671 | goto out_dropref; | ||
1672 | |||
1673 | new_page = alloc_pages_node(node, | ||
1674 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | ||
1675 | if (!new_page) { | ||
1676 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1677 | goto out_dropref; | ||
1678 | } | ||
1679 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1680 | |||
1681 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1682 | |||
1683 | /* | ||
1684 | * Failing to isolate or a GUP pin prevents migration. The expected | ||
1685 | * page count is 2. 1 for anonymous pages without a mapping and 1 | ||
1686 | * for the callers pin. If the page was isolated, the page will | ||
1687 | * need to be put back on the LRU. | ||
1688 | */ | ||
1689 | if (!isolated || page_count(page) != 2) { | ||
1690 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1691 | put_page(new_page); | ||
1692 | if (isolated) { | ||
1693 | putback_lru_page(page); | ||
1694 | isolated = 0; | ||
1695 | goto out; | ||
1696 | } | ||
1697 | goto out_keep_locked; | ||
1698 | } | ||
1699 | |||
1700 | /* Prepare a page as a migration target */ | ||
1701 | __set_page_locked(new_page); | ||
1702 | SetPageSwapBacked(new_page); | ||
1703 | |||
1704 | /* anon mapping, we can simply copy page->mapping to the new page: */ | ||
1705 | new_page->mapping = page->mapping; | ||
1706 | new_page->index = page->index; | ||
1707 | migrate_page_copy(new_page, page); | ||
1708 | WARN_ON(PageLRU(new_page)); | ||
1709 | |||
1710 | /* Recheck the target PMD */ | ||
1711 | spin_lock(&mm->page_table_lock); | ||
1712 | if (unlikely(!pmd_same(*pmd, entry))) { | ||
1713 | spin_unlock(&mm->page_table_lock); | ||
1714 | |||
1715 | /* Reverse changes made by migrate_page_copy() */ | ||
1716 | if (TestClearPageActive(new_page)) | ||
1717 | SetPageActive(page); | ||
1718 | if (TestClearPageUnevictable(new_page)) | ||
1719 | SetPageUnevictable(page); | ||
1720 | mlock_migrate_page(page, new_page); | ||
1721 | |||
1722 | unlock_page(new_page); | ||
1723 | put_page(new_page); /* Free it */ | ||
1724 | |||
1725 | unlock_page(page); | ||
1726 | putback_lru_page(page); | ||
1727 | |||
1728 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1729 | goto out; | ||
1730 | } | ||
1731 | |||
1732 | /* | ||
1733 | * Traditional migration needs to prepare the memcg charge | ||
1734 | * transaction early to prevent the old page from being | ||
1735 | * uncharged when installing migration entries. Here we can | ||
1736 | * save the potential rollback and start the charge transfer | ||
1737 | * only when migration is already known to end successfully. | ||
1738 | */ | ||
1739 | mem_cgroup_prepare_migration(page, new_page, &memcg); | ||
1740 | |||
1741 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
1742 | entry = pmd_mknonnuma(entry); | ||
1743 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1744 | entry = pmd_mkhuge(entry); | ||
1745 | |||
1746 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1747 | |||
1748 | set_pmd_at(mm, haddr, pmd, entry); | ||
1749 | update_mmu_cache_pmd(vma, address, &entry); | ||
1750 | page_remove_rmap(page); | ||
1751 | /* | ||
1752 | * Finish the charge transaction under the page table lock to | ||
1753 | * prevent split_huge_page() from dividing up the charge | ||
1754 | * before it's fully transferred to the new page. | ||
1755 | */ | ||
1756 | mem_cgroup_end_migration(memcg, page, new_page, true); | ||
1757 | spin_unlock(&mm->page_table_lock); | ||
1758 | |||
1759 | unlock_page(new_page); | ||
1760 | unlock_page(page); | ||
1761 | put_page(page); /* Drop the rmap reference */ | ||
1762 | put_page(page); /* Drop the LRU isolation reference */ | ||
1763 | |||
1764 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | ||
1765 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | ||
1766 | |||
1767 | out: | ||
1768 | mod_zone_page_state(page_zone(page), | ||
1769 | NR_ISOLATED_ANON + page_lru, | ||
1770 | -HPAGE_PMD_NR); | ||
1771 | return isolated; | ||
1772 | |||
1773 | out_dropref: | ||
1774 | put_page(page); | ||
1775 | out_keep_locked: | ||
1776 | return 0; | ||
1777 | } | ||
1778 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1779 | |||
1780 | #endif /* CONFIG_NUMA */ | ||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/khugepaged.h> | 32 | #include <linux/khugepaged.h> |
33 | #include <linux/uprobes.h> | 33 | #include <linux/uprobes.h> |
34 | #include <linux/rbtree_augmented.h> | ||
34 | 35 | ||
35 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
36 | #include <asm/cacheflush.h> | 37 | #include <asm/cacheflush.h> |
@@ -89,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | |||
89 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; | 90 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; |
90 | 91 | ||
91 | /* | 92 | /* |
93 | * The global memory commitment made in the system can be a metric | ||
94 | * that can be used to drive ballooning decisions when Linux is hosted | ||
95 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically | ||
96 | * balancing memory across competing virtual machines that are hosted. | ||
97 | * Several metrics drive this policy engine including the guest reported | ||
98 | * memory commitment. | ||
99 | */ | ||
100 | unsigned long vm_memory_committed(void) | ||
101 | { | ||
102 | return percpu_counter_read_positive(&vm_committed_as); | ||
103 | } | ||
104 | EXPORT_SYMBOL_GPL(vm_memory_committed); | ||
105 | |||
106 | /* | ||
92 | * Check that a process has enough memory to allocate a new virtual | 107 | * Check that a process has enough memory to allocate a new virtual |
93 | * mapping. 0 means there is enough memory for the allocation to | 108 | * mapping. 0 means there is enough memory for the allocation to |
94 | * succeed and -ENOMEM implies there is not. | 109 | * succeed and -ENOMEM implies there is not. |
@@ -297,40 +312,88 @@ out: | |||
297 | return retval; | 312 | return retval; |
298 | } | 313 | } |
299 | 314 | ||
315 | static long vma_compute_subtree_gap(struct vm_area_struct *vma) | ||
316 | { | ||
317 | unsigned long max, subtree_gap; | ||
318 | max = vma->vm_start; | ||
319 | if (vma->vm_prev) | ||
320 | max -= vma->vm_prev->vm_end; | ||
321 | if (vma->vm_rb.rb_left) { | ||
322 | subtree_gap = rb_entry(vma->vm_rb.rb_left, | ||
323 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | ||
324 | if (subtree_gap > max) | ||
325 | max = subtree_gap; | ||
326 | } | ||
327 | if (vma->vm_rb.rb_right) { | ||
328 | subtree_gap = rb_entry(vma->vm_rb.rb_right, | ||
329 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | ||
330 | if (subtree_gap > max) | ||
331 | max = subtree_gap; | ||
332 | } | ||
333 | return max; | ||
334 | } | ||
335 | |||
300 | #ifdef CONFIG_DEBUG_VM_RB | 336 | #ifdef CONFIG_DEBUG_VM_RB |
301 | static int browse_rb(struct rb_root *root) | 337 | static int browse_rb(struct rb_root *root) |
302 | { | 338 | { |
303 | int i = 0, j; | 339 | int i = 0, j, bug = 0; |
304 | struct rb_node *nd, *pn = NULL; | 340 | struct rb_node *nd, *pn = NULL; |
305 | unsigned long prev = 0, pend = 0; | 341 | unsigned long prev = 0, pend = 0; |
306 | 342 | ||
307 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 343 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
308 | struct vm_area_struct *vma; | 344 | struct vm_area_struct *vma; |
309 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 345 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
310 | if (vma->vm_start < prev) | 346 | if (vma->vm_start < prev) { |
311 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; | 347 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev); |
312 | if (vma->vm_start < pend) | 348 | bug = 1; |
349 | } | ||
350 | if (vma->vm_start < pend) { | ||
313 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); | 351 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); |
314 | if (vma->vm_start > vma->vm_end) | 352 | bug = 1; |
315 | printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); | 353 | } |
354 | if (vma->vm_start > vma->vm_end) { | ||
355 | printk("vm_end %lx < vm_start %lx\n", | ||
356 | vma->vm_end, vma->vm_start); | ||
357 | bug = 1; | ||
358 | } | ||
359 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { | ||
360 | printk("free gap %lx, correct %lx\n", | ||
361 | vma->rb_subtree_gap, | ||
362 | vma_compute_subtree_gap(vma)); | ||
363 | bug = 1; | ||
364 | } | ||
316 | i++; | 365 | i++; |
317 | pn = nd; | 366 | pn = nd; |
318 | prev = vma->vm_start; | 367 | prev = vma->vm_start; |
319 | pend = vma->vm_end; | 368 | pend = vma->vm_end; |
320 | } | 369 | } |
321 | j = 0; | 370 | j = 0; |
322 | for (nd = pn; nd; nd = rb_prev(nd)) { | 371 | for (nd = pn; nd; nd = rb_prev(nd)) |
323 | j++; | 372 | j++; |
373 | if (i != j) { | ||
374 | printk("backwards %d, forwards %d\n", j, i); | ||
375 | bug = 1; | ||
376 | } | ||
377 | return bug ? -1 : i; | ||
378 | } | ||
379 | |||
380 | static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) | ||
381 | { | ||
382 | struct rb_node *nd; | ||
383 | |||
384 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | ||
385 | struct vm_area_struct *vma; | ||
386 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | ||
387 | BUG_ON(vma != ignore && | ||
388 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); | ||
324 | } | 389 | } |
325 | if (i != j) | ||
326 | printk("backwards %d, forwards %d\n", j, i), i = 0; | ||
327 | return i; | ||
328 | } | 390 | } |
329 | 391 | ||
330 | void validate_mm(struct mm_struct *mm) | 392 | void validate_mm(struct mm_struct *mm) |
331 | { | 393 | { |
332 | int bug = 0; | 394 | int bug = 0; |
333 | int i = 0; | 395 | int i = 0; |
396 | unsigned long highest_address = 0; | ||
334 | struct vm_area_struct *vma = mm->mmap; | 397 | struct vm_area_struct *vma = mm->mmap; |
335 | while (vma) { | 398 | while (vma) { |
336 | struct anon_vma_chain *avc; | 399 | struct anon_vma_chain *avc; |
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm) | |||
338 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 401 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
339 | anon_vma_interval_tree_verify(avc); | 402 | anon_vma_interval_tree_verify(avc); |
340 | vma_unlock_anon_vma(vma); | 403 | vma_unlock_anon_vma(vma); |
404 | highest_address = vma->vm_end; | ||
341 | vma = vma->vm_next; | 405 | vma = vma->vm_next; |
342 | i++; | 406 | i++; |
343 | } | 407 | } |
344 | if (i != mm->map_count) | 408 | if (i != mm->map_count) { |
345 | printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; | 409 | printk("map_count %d vm_next %d\n", mm->map_count, i); |
410 | bug = 1; | ||
411 | } | ||
412 | if (highest_address != mm->highest_vm_end) { | ||
413 | printk("mm->highest_vm_end %lx, found %lx\n", | ||
414 | mm->highest_vm_end, highest_address); | ||
415 | bug = 1; | ||
416 | } | ||
346 | i = browse_rb(&mm->mm_rb); | 417 | i = browse_rb(&mm->mm_rb); |
347 | if (i != mm->map_count) | 418 | if (i != mm->map_count) { |
348 | printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; | 419 | printk("map_count %d rb %d\n", mm->map_count, i); |
420 | bug = 1; | ||
421 | } | ||
349 | BUG_ON(bug); | 422 | BUG_ON(bug); |
350 | } | 423 | } |
351 | #else | 424 | #else |
425 | #define validate_mm_rb(root, ignore) do { } while (0) | ||
352 | #define validate_mm(mm) do { } while (0) | 426 | #define validate_mm(mm) do { } while (0) |
353 | #endif | 427 | #endif |
354 | 428 | ||
429 | RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, | ||
430 | unsigned long, rb_subtree_gap, vma_compute_subtree_gap) | ||
431 | |||
432 | /* | ||
433 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or | ||
434 | * vma->vm_prev->vm_end values changed, without modifying the vma's position | ||
435 | * in the rbtree. | ||
436 | */ | ||
437 | static void vma_gap_update(struct vm_area_struct *vma) | ||
438 | { | ||
439 | /* | ||
440 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback | ||
441 | * function that does exacltly what we want. | ||
442 | */ | ||
443 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); | ||
444 | } | ||
445 | |||
446 | static inline void vma_rb_insert(struct vm_area_struct *vma, | ||
447 | struct rb_root *root) | ||
448 | { | ||
449 | /* All rb_subtree_gap values must be consistent prior to insertion */ | ||
450 | validate_mm_rb(root, NULL); | ||
451 | |||
452 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | ||
453 | } | ||
454 | |||
455 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) | ||
456 | { | ||
457 | /* | ||
458 | * All rb_subtree_gap values must be consistent prior to erase, | ||
459 | * with the possible exception of the vma being erased. | ||
460 | */ | ||
461 | validate_mm_rb(root, vma); | ||
462 | |||
463 | /* | ||
464 | * Note rb_erase_augmented is a fairly large inline function, | ||
465 | * so make sure we instantiate it only once with our desired | ||
466 | * augmented rbtree callbacks. | ||
467 | */ | ||
468 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | ||
469 | } | ||
470 | |||
355 | /* | 471 | /* |
356 | * vma has some anon_vma assigned, and is already inserted on that | 472 | * vma has some anon_vma assigned, and is already inserted on that |
357 | * anon_vma's interval trees. | 473 | * anon_vma's interval trees. |
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, | |||
421 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 537 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
422 | struct rb_node **rb_link, struct rb_node *rb_parent) | 538 | struct rb_node **rb_link, struct rb_node *rb_parent) |
423 | { | 539 | { |
540 | /* Update tracking information for the gap following the new vma. */ | ||
541 | if (vma->vm_next) | ||
542 | vma_gap_update(vma->vm_next); | ||
543 | else | ||
544 | mm->highest_vm_end = vma->vm_end; | ||
545 | |||
546 | /* | ||
547 | * vma->vm_prev wasn't known when we followed the rbtree to find the | ||
548 | * correct insertion point for that vma. As a result, we could not | ||
549 | * update the vma vm_rb parents rb_subtree_gap values on the way down. | ||
550 | * So, we first insert the vma with a zero rb_subtree_gap value | ||
551 | * (to be consistent with what we did on the way down), and then | ||
552 | * immediately update the gap to the correct value. Finally we | ||
553 | * rebalance the rbtree after all augmented values have been set. | ||
554 | */ | ||
424 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); | 555 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); |
425 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); | 556 | vma->rb_subtree_gap = 0; |
557 | vma_gap_update(vma); | ||
558 | vma_rb_insert(vma, &mm->mm_rb); | ||
426 | } | 559 | } |
427 | 560 | ||
428 | static void __vma_link_file(struct vm_area_struct *vma) | 561 | static void __vma_link_file(struct vm_area_struct *vma) |
@@ -498,12 +631,12 @@ static inline void | |||
498 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | 631 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, |
499 | struct vm_area_struct *prev) | 632 | struct vm_area_struct *prev) |
500 | { | 633 | { |
501 | struct vm_area_struct *next = vma->vm_next; | 634 | struct vm_area_struct *next; |
502 | 635 | ||
503 | prev->vm_next = next; | 636 | vma_rb_erase(vma, &mm->mm_rb); |
637 | prev->vm_next = next = vma->vm_next; | ||
504 | if (next) | 638 | if (next) |
505 | next->vm_prev = prev; | 639 | next->vm_prev = prev; |
506 | rb_erase(&vma->vm_rb, &mm->mm_rb); | ||
507 | if (mm->mmap_cache == vma) | 640 | if (mm->mmap_cache == vma) |
508 | mm->mmap_cache = prev; | 641 | mm->mmap_cache = prev; |
509 | } | 642 | } |
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
525 | struct rb_root *root = NULL; | 658 | struct rb_root *root = NULL; |
526 | struct anon_vma *anon_vma = NULL; | 659 | struct anon_vma *anon_vma = NULL; |
527 | struct file *file = vma->vm_file; | 660 | struct file *file = vma->vm_file; |
661 | bool start_changed = false, end_changed = false; | ||
528 | long adjust_next = 0; | 662 | long adjust_next = 0; |
529 | int remove_next = 0; | 663 | int remove_next = 0; |
530 | 664 | ||
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
602 | if (anon_vma) { | 736 | if (anon_vma) { |
603 | VM_BUG_ON(adjust_next && next->anon_vma && | 737 | VM_BUG_ON(adjust_next && next->anon_vma && |
604 | anon_vma != next->anon_vma); | 738 | anon_vma != next->anon_vma); |
605 | anon_vma_lock(anon_vma); | 739 | anon_vma_lock_write(anon_vma); |
606 | anon_vma_interval_tree_pre_update_vma(vma); | 740 | anon_vma_interval_tree_pre_update_vma(vma); |
607 | if (adjust_next) | 741 | if (adjust_next) |
608 | anon_vma_interval_tree_pre_update_vma(next); | 742 | anon_vma_interval_tree_pre_update_vma(next); |
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end); | |||
615 | vma_interval_tree_remove(next, root); | 749 | vma_interval_tree_remove(next, root); |
616 | } | 750 | } |
617 | 751 | ||
618 | vma->vm_start = start; | 752 | if (start != vma->vm_start) { |
619 | vma->vm_end = end; | 753 | vma->vm_start = start; |
754 | start_changed = true; | ||
755 | } | ||
756 | if (end != vma->vm_end) { | ||
757 | vma->vm_end = end; | ||
758 | end_changed = true; | ||
759 | } | ||
620 | vma->vm_pgoff = pgoff; | 760 | vma->vm_pgoff = pgoff; |
621 | if (adjust_next) { | 761 | if (adjust_next) { |
622 | next->vm_start += adjust_next << PAGE_SHIFT; | 762 | next->vm_start += adjust_next << PAGE_SHIFT; |
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end); | |||
645 | * (it may either follow vma or precede it). | 785 | * (it may either follow vma or precede it). |
646 | */ | 786 | */ |
647 | __insert_vm_struct(mm, insert); | 787 | __insert_vm_struct(mm, insert); |
788 | } else { | ||
789 | if (start_changed) | ||
790 | vma_gap_update(vma); | ||
791 | if (end_changed) { | ||
792 | if (!next) | ||
793 | mm->highest_vm_end = end; | ||
794 | else if (!adjust_next) | ||
795 | vma_gap_update(next); | ||
796 | } | ||
648 | } | 797 | } |
649 | 798 | ||
650 | if (anon_vma) { | 799 | if (anon_vma) { |
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end); | |||
678 | * we must remove another next too. It would clutter | 827 | * we must remove another next too. It would clutter |
679 | * up the code too much to do both in one go. | 828 | * up the code too much to do both in one go. |
680 | */ | 829 | */ |
681 | if (remove_next == 2) { | 830 | next = vma->vm_next; |
682 | next = vma->vm_next; | 831 | if (remove_next == 2) |
683 | goto again; | 832 | goto again; |
684 | } | 833 | else if (next) |
834 | vma_gap_update(next); | ||
835 | else | ||
836 | mm->highest_vm_end = end; | ||
685 | } | 837 | } |
686 | if (insert && file) | 838 | if (insert && file) |
687 | uprobe_mmap(insert); | 839 | uprobe_mmap(insert); |
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1153 | * memory so no accounting is necessary | 1305 | * memory so no accounting is necessary |
1154 | */ | 1306 | */ |
1155 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, | 1307 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, |
1156 | VM_NORESERVE, &user, | 1308 | VM_NORESERVE, |
1157 | HUGETLB_ANONHUGE_INODE); | 1309 | &user, HUGETLB_ANONHUGE_INODE, |
1310 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); | ||
1158 | if (IS_ERR(file)) | 1311 | if (IS_ERR(file)) |
1159 | return PTR_ERR(file); | 1312 | return PTR_ERR(file); |
1160 | } | 1313 | } |
@@ -1335,7 +1488,11 @@ munmap_back: | |||
1335 | * | 1488 | * |
1336 | * Answer: Yes, several device drivers can do it in their | 1489 | * Answer: Yes, several device drivers can do it in their |
1337 | * f_op->mmap method. -DaveM | 1490 | * f_op->mmap method. -DaveM |
1491 | * Bug: If addr is changed, prev, rb_link, rb_parent should | ||
1492 | * be updated for vma_link() | ||
1338 | */ | 1493 | */ |
1494 | WARN_ON_ONCE(addr != vma->vm_start); | ||
1495 | |||
1339 | addr = vma->vm_start; | 1496 | addr = vma->vm_start; |
1340 | pgoff = vma->vm_pgoff; | 1497 | pgoff = vma->vm_pgoff; |
1341 | vm_flags = vma->vm_flags; | 1498 | vm_flags = vma->vm_flags; |
@@ -1400,6 +1557,206 @@ unacct_error: | |||
1400 | return error; | 1557 | return error; |
1401 | } | 1558 | } |
1402 | 1559 | ||
1560 | unsigned long unmapped_area(struct vm_unmapped_area_info *info) | ||
1561 | { | ||
1562 | /* | ||
1563 | * We implement the search by looking for an rbtree node that | ||
1564 | * immediately follows a suitable gap. That is, | ||
1565 | * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; | ||
1566 | * - gap_end = vma->vm_start >= info->low_limit + length; | ||
1567 | * - gap_end - gap_start >= length | ||
1568 | */ | ||
1569 | |||
1570 | struct mm_struct *mm = current->mm; | ||
1571 | struct vm_area_struct *vma; | ||
1572 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | ||
1573 | |||
1574 | /* Adjust search length to account for worst case alignment overhead */ | ||
1575 | length = info->length + info->align_mask; | ||
1576 | if (length < info->length) | ||
1577 | return -ENOMEM; | ||
1578 | |||
1579 | /* Adjust search limits by the desired length */ | ||
1580 | if (info->high_limit < length) | ||
1581 | return -ENOMEM; | ||
1582 | high_limit = info->high_limit - length; | ||
1583 | |||
1584 | if (info->low_limit > high_limit) | ||
1585 | return -ENOMEM; | ||
1586 | low_limit = info->low_limit + length; | ||
1587 | |||
1588 | /* Check if rbtree root looks promising */ | ||
1589 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | ||
1590 | goto check_highest; | ||
1591 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | ||
1592 | if (vma->rb_subtree_gap < length) | ||
1593 | goto check_highest; | ||
1594 | |||
1595 | while (true) { | ||
1596 | /* Visit left subtree if it looks promising */ | ||
1597 | gap_end = vma->vm_start; | ||
1598 | if (gap_end >= low_limit && vma->vm_rb.rb_left) { | ||
1599 | struct vm_area_struct *left = | ||
1600 | rb_entry(vma->vm_rb.rb_left, | ||
1601 | struct vm_area_struct, vm_rb); | ||
1602 | if (left->rb_subtree_gap >= length) { | ||
1603 | vma = left; | ||
1604 | continue; | ||
1605 | } | ||
1606 | } | ||
1607 | |||
1608 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | ||
1609 | check_current: | ||
1610 | /* Check if current node has a suitable gap */ | ||
1611 | if (gap_start > high_limit) | ||
1612 | return -ENOMEM; | ||
1613 | if (gap_end >= low_limit && gap_end - gap_start >= length) | ||
1614 | goto found; | ||
1615 | |||
1616 | /* Visit right subtree if it looks promising */ | ||
1617 | if (vma->vm_rb.rb_right) { | ||
1618 | struct vm_area_struct *right = | ||
1619 | rb_entry(vma->vm_rb.rb_right, | ||
1620 | struct vm_area_struct, vm_rb); | ||
1621 | if (right->rb_subtree_gap >= length) { | ||
1622 | vma = right; | ||
1623 | continue; | ||
1624 | } | ||
1625 | } | ||
1626 | |||
1627 | /* Go back up the rbtree to find next candidate node */ | ||
1628 | while (true) { | ||
1629 | struct rb_node *prev = &vma->vm_rb; | ||
1630 | if (!rb_parent(prev)) | ||
1631 | goto check_highest; | ||
1632 | vma = rb_entry(rb_parent(prev), | ||
1633 | struct vm_area_struct, vm_rb); | ||
1634 | if (prev == vma->vm_rb.rb_left) { | ||
1635 | gap_start = vma->vm_prev->vm_end; | ||
1636 | gap_end = vma->vm_start; | ||
1637 | goto check_current; | ||
1638 | } | ||
1639 | } | ||
1640 | } | ||
1641 | |||
1642 | check_highest: | ||
1643 | /* Check highest gap, which does not precede any rbtree node */ | ||
1644 | gap_start = mm->highest_vm_end; | ||
1645 | gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ | ||
1646 | if (gap_start > high_limit) | ||
1647 | return -ENOMEM; | ||
1648 | |||
1649 | found: | ||
1650 | /* We found a suitable gap. Clip it with the original low_limit. */ | ||
1651 | if (gap_start < info->low_limit) | ||
1652 | gap_start = info->low_limit; | ||
1653 | |||
1654 | /* Adjust gap address to the desired alignment */ | ||
1655 | gap_start += (info->align_offset - gap_start) & info->align_mask; | ||
1656 | |||
1657 | VM_BUG_ON(gap_start + info->length > info->high_limit); | ||
1658 | VM_BUG_ON(gap_start + info->length > gap_end); | ||
1659 | return gap_start; | ||
1660 | } | ||
1661 | |||
1662 | unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) | ||
1663 | { | ||
1664 | struct mm_struct *mm = current->mm; | ||
1665 | struct vm_area_struct *vma; | ||
1666 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | ||
1667 | |||
1668 | /* Adjust search length to account for worst case alignment overhead */ | ||
1669 | length = info->length + info->align_mask; | ||
1670 | if (length < info->length) | ||
1671 | return -ENOMEM; | ||
1672 | |||
1673 | /* | ||
1674 | * Adjust search limits by the desired length. | ||
1675 | * See implementation comment at top of unmapped_area(). | ||
1676 | */ | ||
1677 | gap_end = info->high_limit; | ||
1678 | if (gap_end < length) | ||
1679 | return -ENOMEM; | ||
1680 | high_limit = gap_end - length; | ||
1681 | |||
1682 | if (info->low_limit > high_limit) | ||
1683 | return -ENOMEM; | ||
1684 | low_limit = info->low_limit + length; | ||
1685 | |||
1686 | /* Check highest gap, which does not precede any rbtree node */ | ||
1687 | gap_start = mm->highest_vm_end; | ||
1688 | if (gap_start <= high_limit) | ||
1689 | goto found_highest; | ||
1690 | |||
1691 | /* Check if rbtree root looks promising */ | ||
1692 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | ||
1693 | return -ENOMEM; | ||
1694 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | ||
1695 | if (vma->rb_subtree_gap < length) | ||
1696 | return -ENOMEM; | ||
1697 | |||
1698 | while (true) { | ||
1699 | /* Visit right subtree if it looks promising */ | ||
1700 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | ||
1701 | if (gap_start <= high_limit && vma->vm_rb.rb_right) { | ||
1702 | struct vm_area_struct *right = | ||
1703 | rb_entry(vma->vm_rb.rb_right, | ||
1704 | struct vm_area_struct, vm_rb); | ||
1705 | if (right->rb_subtree_gap >= length) { | ||
1706 | vma = right; | ||
1707 | continue; | ||
1708 | } | ||
1709 | } | ||
1710 | |||
1711 | check_current: | ||
1712 | /* Check if current node has a suitable gap */ | ||
1713 | gap_end = vma->vm_start; | ||
1714 | if (gap_end < low_limit) | ||
1715 | return -ENOMEM; | ||
1716 | if (gap_start <= high_limit && gap_end - gap_start >= length) | ||
1717 | goto found; | ||
1718 | |||
1719 | /* Visit left subtree if it looks promising */ | ||
1720 | if (vma->vm_rb.rb_left) { | ||
1721 | struct vm_area_struct *left = | ||
1722 | rb_entry(vma->vm_rb.rb_left, | ||
1723 | struct vm_area_struct, vm_rb); | ||
1724 | if (left->rb_subtree_gap >= length) { | ||
1725 | vma = left; | ||
1726 | continue; | ||
1727 | } | ||
1728 | } | ||
1729 | |||
1730 | /* Go back up the rbtree to find next candidate node */ | ||
1731 | while (true) { | ||
1732 | struct rb_node *prev = &vma->vm_rb; | ||
1733 | if (!rb_parent(prev)) | ||
1734 | return -ENOMEM; | ||
1735 | vma = rb_entry(rb_parent(prev), | ||
1736 | struct vm_area_struct, vm_rb); | ||
1737 | if (prev == vma->vm_rb.rb_right) { | ||
1738 | gap_start = vma->vm_prev ? | ||
1739 | vma->vm_prev->vm_end : 0; | ||
1740 | goto check_current; | ||
1741 | } | ||
1742 | } | ||
1743 | } | ||
1744 | |||
1745 | found: | ||
1746 | /* We found a suitable gap. Clip it with the original high_limit. */ | ||
1747 | if (gap_end > info->high_limit) | ||
1748 | gap_end = info->high_limit; | ||
1749 | |||
1750 | found_highest: | ||
1751 | /* Compute highest gap address at the desired alignment */ | ||
1752 | gap_end -= info->length; | ||
1753 | gap_end -= (gap_end - info->align_offset) & info->align_mask; | ||
1754 | |||
1755 | VM_BUG_ON(gap_end < info->low_limit); | ||
1756 | VM_BUG_ON(gap_end < gap_start); | ||
1757 | return gap_end; | ||
1758 | } | ||
1759 | |||
1403 | /* Get an address range which is currently unmapped. | 1760 | /* Get an address range which is currently unmapped. |
1404 | * For shmat() with addr=0. | 1761 | * For shmat() with addr=0. |
1405 | * | 1762 | * |
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1418 | { | 1775 | { |
1419 | struct mm_struct *mm = current->mm; | 1776 | struct mm_struct *mm = current->mm; |
1420 | struct vm_area_struct *vma; | 1777 | struct vm_area_struct *vma; |
1421 | unsigned long start_addr; | 1778 | struct vm_unmapped_area_info info; |
1422 | 1779 | ||
1423 | if (len > TASK_SIZE) | 1780 | if (len > TASK_SIZE) |
1424 | return -ENOMEM; | 1781 | return -ENOMEM; |
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1433 | (!vma || addr + len <= vma->vm_start)) | 1790 | (!vma || addr + len <= vma->vm_start)) |
1434 | return addr; | 1791 | return addr; |
1435 | } | 1792 | } |
1436 | if (len > mm->cached_hole_size) { | ||
1437 | start_addr = addr = mm->free_area_cache; | ||
1438 | } else { | ||
1439 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
1440 | mm->cached_hole_size = 0; | ||
1441 | } | ||
1442 | 1793 | ||
1443 | full_search: | 1794 | info.flags = 0; |
1444 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | 1795 | info.length = len; |
1445 | /* At this point: (!vma || addr < vma->vm_end). */ | 1796 | info.low_limit = TASK_UNMAPPED_BASE; |
1446 | if (TASK_SIZE - len < addr) { | 1797 | info.high_limit = TASK_SIZE; |
1447 | /* | 1798 | info.align_mask = 0; |
1448 | * Start a new search - just in case we missed | 1799 | return vm_unmapped_area(&info); |
1449 | * some holes. | ||
1450 | */ | ||
1451 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
1452 | addr = TASK_UNMAPPED_BASE; | ||
1453 | start_addr = addr; | ||
1454 | mm->cached_hole_size = 0; | ||
1455 | goto full_search; | ||
1456 | } | ||
1457 | return -ENOMEM; | ||
1458 | } | ||
1459 | if (!vma || addr + len <= vma->vm_start) { | ||
1460 | /* | ||
1461 | * Remember the place where we stopped the search: | ||
1462 | */ | ||
1463 | mm->free_area_cache = addr + len; | ||
1464 | return addr; | ||
1465 | } | ||
1466 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1467 | mm->cached_hole_size = vma->vm_start - addr; | ||
1468 | addr = vma->vm_end; | ||
1469 | } | ||
1470 | } | 1800 | } |
1471 | #endif | 1801 | #endif |
1472 | 1802 | ||
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1491 | { | 1821 | { |
1492 | struct vm_area_struct *vma; | 1822 | struct vm_area_struct *vma; |
1493 | struct mm_struct *mm = current->mm; | 1823 | struct mm_struct *mm = current->mm; |
1494 | unsigned long addr = addr0, start_addr; | 1824 | unsigned long addr = addr0; |
1825 | struct vm_unmapped_area_info info; | ||
1495 | 1826 | ||
1496 | /* requested length too big for entire address space */ | 1827 | /* requested length too big for entire address space */ |
1497 | if (len > TASK_SIZE) | 1828 | if (len > TASK_SIZE) |
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1509 | return addr; | 1840 | return addr; |
1510 | } | 1841 | } |
1511 | 1842 | ||
1512 | /* check if free_area_cache is useful for us */ | 1843 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
1513 | if (len <= mm->cached_hole_size) { | 1844 | info.length = len; |
1514 | mm->cached_hole_size = 0; | 1845 | info.low_limit = PAGE_SIZE; |
1515 | mm->free_area_cache = mm->mmap_base; | 1846 | info.high_limit = mm->mmap_base; |
1516 | } | 1847 | info.align_mask = 0; |
1517 | 1848 | addr = vm_unmapped_area(&info); | |
1518 | try_again: | ||
1519 | /* either no address requested or can't fit in requested address hole */ | ||
1520 | start_addr = addr = mm->free_area_cache; | ||
1521 | |||
1522 | if (addr < len) | ||
1523 | goto fail; | ||
1524 | |||
1525 | addr -= len; | ||
1526 | do { | ||
1527 | /* | ||
1528 | * Lookup failure means no vma is above this address, | ||
1529 | * else if new region fits below vma->vm_start, | ||
1530 | * return with success: | ||
1531 | */ | ||
1532 | vma = find_vma(mm, addr); | ||
1533 | if (!vma || addr+len <= vma->vm_start) | ||
1534 | /* remember the address as a hint for next time */ | ||
1535 | return (mm->free_area_cache = addr); | ||
1536 | |||
1537 | /* remember the largest hole we saw so far */ | ||
1538 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1539 | mm->cached_hole_size = vma->vm_start - addr; | ||
1540 | |||
1541 | /* try just below the current vma->vm_start */ | ||
1542 | addr = vma->vm_start-len; | ||
1543 | } while (len < vma->vm_start); | ||
1544 | |||
1545 | fail: | ||
1546 | /* | ||
1547 | * if hint left us with no space for the requested | ||
1548 | * mapping then try again: | ||
1549 | * | ||
1550 | * Note: this is different with the case of bottomup | ||
1551 | * which does the fully line-search, but we use find_vma | ||
1552 | * here that causes some holes skipped. | ||
1553 | */ | ||
1554 | if (start_addr != mm->mmap_base) { | ||
1555 | mm->free_area_cache = mm->mmap_base; | ||
1556 | mm->cached_hole_size = 0; | ||
1557 | goto try_again; | ||
1558 | } | ||
1559 | 1849 | ||
1560 | /* | 1850 | /* |
1561 | * A failed mmap() very likely causes application failure, | 1851 | * A failed mmap() very likely causes application failure, |
@@ -1563,14 +1853,13 @@ fail: | |||
1563 | * can happen with large stack limits and large mmap() | 1853 | * can happen with large stack limits and large mmap() |
1564 | * allocations. | 1854 | * allocations. |
1565 | */ | 1855 | */ |
1566 | mm->cached_hole_size = ~0UL; | 1856 | if (addr & ~PAGE_MASK) { |
1567 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 1857 | VM_BUG_ON(addr != -ENOMEM); |
1568 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 1858 | info.flags = 0; |
1569 | /* | 1859 | info.low_limit = TASK_UNMAPPED_BASE; |
1570 | * Restore the topdown base: | 1860 | info.high_limit = TASK_SIZE; |
1571 | */ | 1861 | addr = vm_unmapped_area(&info); |
1572 | mm->free_area_cache = mm->mmap_base; | 1862 | } |
1573 | mm->cached_hole_size = ~0UL; | ||
1574 | 1863 | ||
1575 | return addr; | 1864 | return addr; |
1576 | } | 1865 | } |
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1780 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 2069 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
1781 | error = acct_stack_growth(vma, size, grow); | 2070 | error = acct_stack_growth(vma, size, grow); |
1782 | if (!error) { | 2071 | if (!error) { |
2072 | /* | ||
2073 | * vma_gap_update() doesn't support concurrent | ||
2074 | * updates, but we only hold a shared mmap_sem | ||
2075 | * lock here, so we need to protect against | ||
2076 | * concurrent vma expansions. | ||
2077 | * vma_lock_anon_vma() doesn't help here, as | ||
2078 | * we don't guarantee that all growable vmas | ||
2079 | * in a mm share the same root anon vma. | ||
2080 | * So, we reuse mm->page_table_lock to guard | ||
2081 | * against concurrent vma expansions. | ||
2082 | */ | ||
2083 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1783 | anon_vma_interval_tree_pre_update_vma(vma); | 2084 | anon_vma_interval_tree_pre_update_vma(vma); |
1784 | vma->vm_end = address; | 2085 | vma->vm_end = address; |
1785 | anon_vma_interval_tree_post_update_vma(vma); | 2086 | anon_vma_interval_tree_post_update_vma(vma); |
2087 | if (vma->vm_next) | ||
2088 | vma_gap_update(vma->vm_next); | ||
2089 | else | ||
2090 | vma->vm_mm->highest_vm_end = address; | ||
2091 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
2092 | |||
1786 | perf_event_mmap(vma); | 2093 | perf_event_mmap(vma); |
1787 | } | 2094 | } |
1788 | } | 2095 | } |
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1833 | if (grow <= vma->vm_pgoff) { | 2140 | if (grow <= vma->vm_pgoff) { |
1834 | error = acct_stack_growth(vma, size, grow); | 2141 | error = acct_stack_growth(vma, size, grow); |
1835 | if (!error) { | 2142 | if (!error) { |
2143 | /* | ||
2144 | * vma_gap_update() doesn't support concurrent | ||
2145 | * updates, but we only hold a shared mmap_sem | ||
2146 | * lock here, so we need to protect against | ||
2147 | * concurrent vma expansions. | ||
2148 | * vma_lock_anon_vma() doesn't help here, as | ||
2149 | * we don't guarantee that all growable vmas | ||
2150 | * in a mm share the same root anon vma. | ||
2151 | * So, we reuse mm->page_table_lock to guard | ||
2152 | * against concurrent vma expansions. | ||
2153 | */ | ||
2154 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1836 | anon_vma_interval_tree_pre_update_vma(vma); | 2155 | anon_vma_interval_tree_pre_update_vma(vma); |
1837 | vma->vm_start = address; | 2156 | vma->vm_start = address; |
1838 | vma->vm_pgoff -= grow; | 2157 | vma->vm_pgoff -= grow; |
1839 | anon_vma_interval_tree_post_update_vma(vma); | 2158 | anon_vma_interval_tree_post_update_vma(vma); |
2159 | vma_gap_update(vma); | ||
2160 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
2161 | |||
1840 | perf_event_mmap(vma); | 2162 | perf_event_mmap(vma); |
1841 | } | 2163 | } |
1842 | } | 2164 | } |
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1959 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); | 2281 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); |
1960 | vma->vm_prev = NULL; | 2282 | vma->vm_prev = NULL; |
1961 | do { | 2283 | do { |
1962 | rb_erase(&vma->vm_rb, &mm->mm_rb); | 2284 | vma_rb_erase(vma, &mm->mm_rb); |
1963 | mm->map_count--; | 2285 | mm->map_count--; |
1964 | tail_vma = vma; | 2286 | tail_vma = vma; |
1965 | vma = vma->vm_next; | 2287 | vma = vma->vm_next; |
1966 | } while (vma && vma->vm_start < end); | 2288 | } while (vma && vma->vm_start < end); |
1967 | *insertion_point = vma; | 2289 | *insertion_point = vma; |
1968 | if (vma) | 2290 | if (vma) { |
1969 | vma->vm_prev = prev; | 2291 | vma->vm_prev = prev; |
2292 | vma_gap_update(vma); | ||
2293 | } else | ||
2294 | mm->highest_vm_end = prev ? prev->vm_end : 0; | ||
1970 | tail_vma->vm_next = NULL; | 2295 | tail_vma->vm_next = NULL; |
1971 | if (mm->unmap_area == arch_unmap_area) | 2296 | if (mm->unmap_area == arch_unmap_area) |
1972 | addr = prev ? prev->vm_end : mm->mmap_base; | 2297 | addr = prev ? prev->vm_end : mm->mmap_base; |
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2561 | * The LSB of head.next can't change from under us | 2886 | * The LSB of head.next can't change from under us |
2562 | * because we hold the mm_all_locks_mutex. | 2887 | * because we hold the mm_all_locks_mutex. |
2563 | */ | 2888 | */ |
2564 | mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); | 2889 | down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); |
2565 | /* | 2890 | /* |
2566 | * We can safely modify head.next after taking the | 2891 | * We can safely modify head.next after taking the |
2567 | * anon_vma->root->mutex. If some other vma in this mm shares | 2892 | * anon_vma->root->rwsem. If some other vma in this mm shares |
2568 | * the same anon_vma we won't take it again. | 2893 | * the same anon_vma we won't take it again. |
2569 | * | 2894 | * |
2570 | * No need of atomic instructions here, head.next | 2895 | * No need of atomic instructions here, head.next |
2571 | * can't change from under us thanks to the | 2896 | * can't change from under us thanks to the |
2572 | * anon_vma->root->mutex. | 2897 | * anon_vma->root->rwsem. |
2573 | */ | 2898 | */ |
2574 | if (__test_and_set_bit(0, (unsigned long *) | 2899 | if (__test_and_set_bit(0, (unsigned long *) |
2575 | &anon_vma->root->rb_root.rb_node)) | 2900 | &anon_vma->root->rb_root.rb_node)) |
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2671 | * | 2996 | * |
2672 | * No need of atomic instructions here, head.next | 2997 | * No need of atomic instructions here, head.next |
2673 | * can't change from under us until we release the | 2998 | * can't change from under us until we release the |
2674 | * anon_vma->root->mutex. | 2999 | * anon_vma->root->rwsem. |
2675 | */ | 3000 | */ |
2676 | if (!__test_and_clear_bit(0, (unsigned long *) | 3001 | if (!__test_and_clear_bit(0, (unsigned long *) |
2677 | &anon_vma->root->rb_root.rb_node)) | 3002 | &anon_vma->root->rb_root.rb_node)) |
diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab6..94722a4d6b43 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
35 | } | 35 | } |
36 | #endif | 36 | #endif |
37 | 37 | ||
38 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable) | 40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | ||
42 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
43 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | ||
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
44 | 48 | ||
45 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
46 | arch_enter_lazy_mmu_mode(); | 50 | arch_enter_lazy_mmu_mode(); |
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
48 | oldpte = *pte; | 52 | oldpte = *pte; |
49 | if (pte_present(oldpte)) { | 53 | if (pte_present(oldpte)) { |
50 | pte_t ptent; | 54 | pte_t ptent; |
55 | bool updated = false; | ||
51 | 56 | ||
52 | ptent = ptep_modify_prot_start(mm, addr, pte); | 57 | ptent = ptep_modify_prot_start(mm, addr, pte); |
53 | ptent = pte_modify(ptent, newprot); | 58 | if (!prot_numa) { |
59 | ptent = pte_modify(ptent, newprot); | ||
60 | updated = true; | ||
61 | } else { | ||
62 | struct page *page; | ||
63 | |||
64 | page = vm_normal_page(vma, addr, oldpte); | ||
65 | if (page) { | ||
66 | int this_nid = page_to_nid(page); | ||
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | ||
76 | updated = true; | ||
77 | } | ||
78 | } | ||
79 | } | ||
54 | 80 | ||
55 | /* | 81 | /* |
56 | * Avoid taking write faults for pages we know to be | 82 | * Avoid taking write faults for pages we know to be |
57 | * dirty. | 83 | * dirty. |
58 | */ | 84 | */ |
59 | if (dirty_accountable && pte_dirty(ptent)) | 85 | if (dirty_accountable && pte_dirty(ptent)) { |
60 | ptent = pte_mkwrite(ptent); | 86 | ptent = pte_mkwrite(ptent); |
87 | updated = true; | ||
88 | } | ||
61 | 89 | ||
90 | if (updated) | ||
91 | pages++; | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 92 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 93 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 102 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 103 | swp_entry_to_pte(entry)); |
74 | } | 104 | } |
105 | pages++; | ||
75 | } | 106 | } |
76 | } while (pte++, addr += PAGE_SIZE, addr != end); | 107 | } while (pte++, addr += PAGE_SIZE, addr != end); |
77 | arch_leave_lazy_mmu_mode(); | 108 | arch_leave_lazy_mmu_mode(); |
78 | pte_unmap_unlock(pte - 1, ptl); | 109 | pte_unmap_unlock(pte - 1, ptl); |
110 | |||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | ||
79 | } | 113 | } |
80 | 114 | ||
81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 115 | #ifdef CONFIG_NUMA_BALANCING |
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, |
83 | int dirty_accountable) | 117 | pmd_t *pmd) |
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | ||
132 | pud_t *pud, unsigned long addr, unsigned long end, | ||
133 | pgprot_t newprot, int dirty_accountable, int prot_numa) | ||
84 | { | 134 | { |
85 | pmd_t *pmd; | 135 | pmd_t *pmd; |
86 | unsigned long next; | 136 | unsigned long next; |
137 | unsigned long pages = 0; | ||
138 | bool all_same_node; | ||
87 | 139 | ||
88 | pmd = pmd_offset(pud, addr); | 140 | pmd = pmd_offset(pud, addr); |
89 | do { | 141 | do { |
90 | next = pmd_addr_end(addr, end); | 142 | next = pmd_addr_end(addr, end); |
91 | if (pmd_trans_huge(*pmd)) { | 143 | if (pmd_trans_huge(*pmd)) { |
92 | if (next - addr != HPAGE_PMD_SIZE) | 144 | if (next - addr != HPAGE_PMD_SIZE) |
93 | split_huge_page_pmd(vma->vm_mm, pmd); | 145 | split_huge_page_pmd(vma, addr, pmd); |
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 146 | else if (change_huge_pmd(vma, pmd, addr, newprot, |
147 | prot_numa)) { | ||
148 | pages += HPAGE_PMD_NR; | ||
95 | continue; | 149 | continue; |
150 | } | ||
96 | /* fall through */ | 151 | /* fall through */ |
97 | } | 152 | } |
98 | if (pmd_none_or_clear_bad(pmd)) | 153 | if (pmd_none_or_clear_bad(pmd)) |
99 | continue; | 154 | continue; |
100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, | 155 | pages += change_pte_range(vma, pmd, addr, next, newprot, |
101 | dirty_accountable); | 156 | dirty_accountable, prot_numa, &all_same_node); |
157 | |||
158 | /* | ||
159 | * If we are changing protections for NUMA hinting faults then | ||
160 | * set pmd_numa if the examined pages were all on the same | ||
161 | * node. This allows a regular PMD to be handled as one fault | ||
162 | * and effectively batches the taking of the PTL | ||
163 | */ | ||
164 | if (prot_numa && all_same_node) | ||
165 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
102 | } while (pmd++, addr = next, addr != end); | 166 | } while (pmd++, addr = next, addr != end); |
167 | |||
168 | return pages; | ||
103 | } | 169 | } |
104 | 170 | ||
105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 171 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, |
106 | unsigned long addr, unsigned long end, pgprot_t newprot, | 172 | pgd_t *pgd, unsigned long addr, unsigned long end, |
107 | int dirty_accountable) | 173 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
108 | { | 174 | { |
109 | pud_t *pud; | 175 | pud_t *pud; |
110 | unsigned long next; | 176 | unsigned long next; |
177 | unsigned long pages = 0; | ||
111 | 178 | ||
112 | pud = pud_offset(pgd, addr); | 179 | pud = pud_offset(pgd, addr); |
113 | do { | 180 | do { |
114 | next = pud_addr_end(addr, end); | 181 | next = pud_addr_end(addr, end); |
115 | if (pud_none_or_clear_bad(pud)) | 182 | if (pud_none_or_clear_bad(pud)) |
116 | continue; | 183 | continue; |
117 | change_pmd_range(vma, pud, addr, next, newprot, | 184 | pages += change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | 185 | dirty_accountable, prot_numa); |
119 | } while (pud++, addr = next, addr != end); | 186 | } while (pud++, addr = next, addr != end); |
187 | |||
188 | return pages; | ||
120 | } | 189 | } |
121 | 190 | ||
122 | static void change_protection(struct vm_area_struct *vma, | 191 | static unsigned long change_protection_range(struct vm_area_struct *vma, |
123 | unsigned long addr, unsigned long end, pgprot_t newprot, | 192 | unsigned long addr, unsigned long end, pgprot_t newprot, |
124 | int dirty_accountable) | 193 | int dirty_accountable, int prot_numa) |
125 | { | 194 | { |
126 | struct mm_struct *mm = vma->vm_mm; | 195 | struct mm_struct *mm = vma->vm_mm; |
127 | pgd_t *pgd; | 196 | pgd_t *pgd; |
128 | unsigned long next; | 197 | unsigned long next; |
129 | unsigned long start = addr; | 198 | unsigned long start = addr; |
199 | unsigned long pages = 0; | ||
130 | 200 | ||
131 | BUG_ON(addr >= end); | 201 | BUG_ON(addr >= end); |
132 | pgd = pgd_offset(mm, addr); | 202 | pgd = pgd_offset(mm, addr); |
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma, | |||
135 | next = pgd_addr_end(addr, end); | 205 | next = pgd_addr_end(addr, end); |
136 | if (pgd_none_or_clear_bad(pgd)) | 206 | if (pgd_none_or_clear_bad(pgd)) |
137 | continue; | 207 | continue; |
138 | change_pud_range(vma, pgd, addr, next, newprot, | 208 | pages += change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | 209 | dirty_accountable, prot_numa); |
140 | } while (pgd++, addr = next, addr != end); | 210 | } while (pgd++, addr = next, addr != end); |
141 | flush_tlb_range(vma, start, end); | 211 | |
212 | /* Only flush the TLB if we actually modified any entries: */ | ||
213 | if (pages) | ||
214 | flush_tlb_range(vma, start, end); | ||
215 | |||
216 | return pages; | ||
217 | } | ||
218 | |||
219 | unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
220 | unsigned long end, pgprot_t newprot, | ||
221 | int dirty_accountable, int prot_numa) | ||
222 | { | ||
223 | struct mm_struct *mm = vma->vm_mm; | ||
224 | unsigned long pages; | ||
225 | |||
226 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
227 | if (is_vm_hugetlb_page(vma)) | ||
228 | pages = hugetlb_change_protection(vma, start, end, newprot); | ||
229 | else | ||
230 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | ||
231 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
232 | |||
233 | return pages; | ||
142 | } | 234 | } |
143 | 235 | ||
144 | int | 236 | int |
@@ -213,12 +305,9 @@ success: | |||
213 | dirty_accountable = 1; | 305 | dirty_accountable = 1; |
214 | } | 306 | } |
215 | 307 | ||
216 | mmu_notifier_invalidate_range_start(mm, start, end); | 308 | change_protection(vma, start, end, vma->vm_page_prot, |
217 | if (is_vm_hugetlb_page(vma)) | 309 | dirty_accountable, 0); |
218 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | 310 | |
219 | else | ||
220 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | ||
221 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
222 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 311 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
223 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 312 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
224 | perf_event_mmap(vma); | 313 | perf_event_mmap(vma); |
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
274 | error = -EINVAL; | 363 | error = -EINVAL; |
275 | if (!(vma->vm_flags & VM_GROWSDOWN)) | 364 | if (!(vma->vm_flags & VM_GROWSDOWN)) |
276 | goto out; | 365 | goto out; |
277 | } | 366 | } else { |
278 | else { | ||
279 | if (vma->vm_start > start) | 367 | if (vma->vm_start > start) |
280 | goto out; | 368 | goto out; |
281 | if (unlikely(grows & PROT_GROWSUP)) { | 369 | if (unlikely(grows & PROT_GROWSUP)) { |
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
291 | for (nstart = start ; ; ) { | 379 | for (nstart = start ; ; ) { |
292 | unsigned long newflags; | 380 | unsigned long newflags; |
293 | 381 | ||
294 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 382 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
295 | 383 | ||
296 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | 384 | newflags = vm_flags; |
385 | newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | ||
297 | 386 | ||
298 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ | 387 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ |
299 | if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { | 388 | if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { |
diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307a..e1031e1f6a61 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
104 | } | 104 | } |
105 | if (vma->anon_vma) { | 105 | if (vma->anon_vma) { |
106 | anon_vma = vma->anon_vma; | 106 | anon_vma = vma->anon_vma; |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | } | 108 | } |
109 | } | 109 | } |
110 | 110 | ||
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
182 | need_flush = true; | 182 | need_flush = true; |
183 | continue; | 183 | continue; |
184 | } else if (!err) { | 184 | } else if (!err) { |
185 | split_huge_page_pmd(vma->vm_mm, old_pmd); | 185 | split_huge_page_pmd(vma, old_addr, old_pmd); |
186 | } | 186 | } |
187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | 187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); |
188 | } | 188 | } |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index ecc2f13d557d..03d152a76acf 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
137 | return count; | 137 | return count; |
138 | } | 138 | } |
139 | 139 | ||
140 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | ||
141 | { | ||
142 | struct zone *z; | ||
143 | |||
144 | /* | ||
145 | * In free_area_init_core(), highmem zone's managed_pages is set to | ||
146 | * present_pages, and bootmem allocator doesn't allocate from highmem | ||
147 | * zones. So there's no need to recalculate managed_pages because all | ||
148 | * highmem pages will be managed by the buddy system. Here highmem | ||
149 | * zone also includes highmem movable zone. | ||
150 | */ | ||
151 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | ||
152 | if (!is_highmem(z)) | ||
153 | z->managed_pages = 0; | ||
154 | } | ||
155 | |||
140 | /** | 156 | /** |
141 | * free_all_bootmem - release free pages to the buddy allocator | 157 | * free_all_bootmem - release free pages to the buddy allocator |
142 | * | 158 | * |
@@ -144,6 +160,11 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
144 | */ | 160 | */ |
145 | unsigned long __init free_all_bootmem(void) | 161 | unsigned long __init free_all_bootmem(void) |
146 | { | 162 | { |
163 | struct pglist_data *pgdat; | ||
164 | |||
165 | for_each_online_pgdat(pgdat) | ||
166 | reset_node_lowmem_managed_pages(pgdat); | ||
167 | |||
147 | /* | 168 | /* |
148 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 169 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
149 | * because in some case like Node0 doesn't have RAM installed | 170 | * because in some case like Node0 doesn't have RAM installed |
diff --git a/mm/nommu.c b/mm/nommu.c index 45131b41bcdb..79c3cac87afa 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -66,6 +66,21 @@ int heap_stack_gap = 0; | |||
66 | 66 | ||
67 | atomic_long_t mmap_pages_allocated; | 67 | atomic_long_t mmap_pages_allocated; |
68 | 68 | ||
69 | /* | ||
70 | * The global memory commitment made in the system can be a metric | ||
71 | * that can be used to drive ballooning decisions when Linux is hosted | ||
72 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically | ||
73 | * balancing memory across competing virtual machines that are hosted. | ||
74 | * Several metrics drive this policy engine including the guest reported | ||
75 | * memory commitment. | ||
76 | */ | ||
77 | unsigned long vm_memory_committed(void) | ||
78 | { | ||
79 | return percpu_counter_read_positive(&vm_committed_as); | ||
80 | } | ||
81 | |||
82 | EXPORT_SYMBOL_GPL(vm_memory_committed); | ||
83 | |||
69 | EXPORT_SYMBOL(mem_map); | 84 | EXPORT_SYMBOL(mem_map); |
70 | EXPORT_SYMBOL(num_physpages); | 85 | EXPORT_SYMBOL(num_physpages); |
71 | 86 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 79e0f3e24831..0399f146ae49 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task; | |||
44 | int sysctl_oom_dump_tasks = 1; | 44 | int sysctl_oom_dump_tasks = 1; |
45 | static DEFINE_SPINLOCK(zone_scan_lock); | 45 | static DEFINE_SPINLOCK(zone_scan_lock); |
46 | 46 | ||
47 | /* | ||
48 | * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj | ||
49 | * @old_val: old oom_score_adj for compare | ||
50 | * @new_val: new oom_score_adj for swap | ||
51 | * | ||
52 | * Sets the oom_score_adj value for current to @new_val iff its present value is | ||
53 | * @old_val. Usually used to reinstate a previous value to prevent racing with | ||
54 | * userspacing tuning the value in the interim. | ||
55 | */ | ||
56 | void compare_swap_oom_score_adj(int old_val, int new_val) | ||
57 | { | ||
58 | struct sighand_struct *sighand = current->sighand; | ||
59 | |||
60 | spin_lock_irq(&sighand->siglock); | ||
61 | if (current->signal->oom_score_adj == old_val) | ||
62 | current->signal->oom_score_adj = new_val; | ||
63 | trace_oom_score_adj_update(current); | ||
64 | spin_unlock_irq(&sighand->siglock); | ||
65 | } | ||
66 | |||
67 | /** | ||
68 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value | ||
69 | * @new_val: new oom_score_adj value | ||
70 | * | ||
71 | * Sets the oom_score_adj value for current to @new_val with proper | ||
72 | * synchronization and returns the old value. Usually used to temporarily | ||
73 | * set a value, save the old value in the caller, and then reinstate it later. | ||
74 | */ | ||
75 | int test_set_oom_score_adj(int new_val) | ||
76 | { | ||
77 | struct sighand_struct *sighand = current->sighand; | ||
78 | int old_val; | ||
79 | |||
80 | spin_lock_irq(&sighand->siglock); | ||
81 | old_val = current->signal->oom_score_adj; | ||
82 | current->signal->oom_score_adj = new_val; | ||
83 | trace_oom_score_adj_update(current); | ||
84 | spin_unlock_irq(&sighand->siglock); | ||
85 | |||
86 | return old_val; | ||
87 | } | ||
88 | |||
89 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
90 | /** | 48 | /** |
91 | * has_intersects_mems_allowed() - check task eligiblity for kill | 49 | * has_intersects_mems_allowed() - check task eligiblity for kill |
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
193 | if (!p) | 151 | if (!p) |
194 | return 0; | 152 | return 0; |
195 | 153 | ||
196 | adj = p->signal->oom_score_adj; | 154 | adj = (long)p->signal->oom_score_adj; |
197 | if (adj == OOM_SCORE_ADJ_MIN) { | 155 | if (adj == OOM_SCORE_ADJ_MIN) { |
198 | task_unlock(p); | 156 | task_unlock(p); |
199 | return 0; | 157 | return 0; |
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
257 | * the page allocator means a mempolicy is in effect. Cpuset policy | 215 | * the page allocator means a mempolicy is in effect. Cpuset policy |
258 | * is enforced in get_page_from_freelist(). | 216 | * is enforced in get_page_from_freelist(). |
259 | */ | 217 | */ |
260 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { | 218 | if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { |
261 | *totalpages = total_swap_pages; | 219 | *totalpages = total_swap_pages; |
262 | for_each_node_mask(nid, *nodemask) | 220 | for_each_node_mask(nid, *nodemask) |
263 | *totalpages += node_spanned_pages(nid); | 221 | *totalpages += node_spanned_pages(nid); |
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
310 | if (!task->mm) | 268 | if (!task->mm) |
311 | return OOM_SCAN_CONTINUE; | 269 | return OOM_SCAN_CONTINUE; |
312 | 270 | ||
313 | if (task->flags & PF_EXITING) { | 271 | /* |
272 | * If task is allocating a lot of memory and has been marked to be | ||
273 | * killed first if it triggers an oom, then select it. | ||
274 | */ | ||
275 | if (oom_task_origin(task)) | ||
276 | return OOM_SCAN_SELECT; | ||
277 | |||
278 | if (task->flags & PF_EXITING && !force_kill) { | ||
314 | /* | 279 | /* |
315 | * If task is current and is in the process of releasing memory, | 280 | * If this task is not being ptraced on exit, then wait for it |
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | 281 | * to finish before killing some other task unnecessarily. |
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | 282 | */ |
322 | if (task == current) | 283 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) |
323 | return OOM_SCAN_SELECT; | 284 | return OOM_SCAN_ABORT; |
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | 285 | } |
334 | return OOM_SCAN_OK; | 286 | return OOM_SCAN_OK; |
335 | } | 287 | } |
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
412 | continue; | 364 | continue; |
413 | } | 365 | } |
414 | 366 | ||
415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", | 367 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", |
416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 368 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 369 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
418 | task->mm->nr_ptes, | 370 | task->mm->nr_ptes, |
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
428 | { | 380 | { |
429 | task_lock(current); | 381 | task_lock(current); |
430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 382 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
431 | "oom_score_adj=%d\n", | 383 | "oom_score_adj=%hd\n", |
432 | current->comm, gfp_mask, order, | 384 | current->comm, gfp_mask, order, |
433 | current->signal->oom_score_adj); | 385 | current->signal->oom_score_adj); |
434 | cpuset_print_task_mems_allowed(current); | 386 | cpuset_print_task_mems_allowed(current); |
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
639 | spin_unlock(&zone_scan_lock); | 591 | spin_unlock(&zone_scan_lock); |
640 | } | 592 | } |
641 | 593 | ||
642 | /* | ||
643 | * Try to acquire the oom killer lock for all system zones. Returns zero if a | ||
644 | * parallel oom killing is taking place, otherwise locks all zones and returns | ||
645 | * non-zero. | ||
646 | */ | ||
647 | static int try_set_system_oom(void) | ||
648 | { | ||
649 | struct zone *zone; | ||
650 | int ret = 1; | ||
651 | |||
652 | spin_lock(&zone_scan_lock); | ||
653 | for_each_populated_zone(zone) | ||
654 | if (zone_is_oom_locked(zone)) { | ||
655 | ret = 0; | ||
656 | goto out; | ||
657 | } | ||
658 | for_each_populated_zone(zone) | ||
659 | zone_set_flag(zone, ZONE_OOM_LOCKED); | ||
660 | out: | ||
661 | spin_unlock(&zone_scan_lock); | ||
662 | return ret; | ||
663 | } | ||
664 | |||
665 | /* | ||
666 | * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation | ||
667 | * attempts or page faults may now recall the oom killer, if necessary. | ||
668 | */ | ||
669 | static void clear_system_oom(void) | ||
670 | { | ||
671 | struct zone *zone; | ||
672 | |||
673 | spin_lock(&zone_scan_lock); | ||
674 | for_each_populated_zone(zone) | ||
675 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | ||
676 | spin_unlock(&zone_scan_lock); | ||
677 | } | ||
678 | |||
679 | /** | 594 | /** |
680 | * out_of_memory - kill the "best" process when we run out of memory | 595 | * out_of_memory - kill the "best" process when we run out of memory |
681 | * @zonelist: zonelist pointer | 596 | * @zonelist: zonelist pointer |
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
706 | return; | 621 | return; |
707 | 622 | ||
708 | /* | 623 | /* |
709 | * If current has a pending SIGKILL, then automatically select it. The | 624 | * If current has a pending SIGKILL or is exiting, then automatically |
710 | * goal is to allow it to allocate so that it may quickly exit and free | 625 | * select it. The goal is to allow it to allocate so that it may |
711 | * its memory. | 626 | * quickly exit and free its memory. |
712 | */ | 627 | */ |
713 | if (fatal_signal_pending(current)) { | 628 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { |
714 | set_thread_flag(TIF_MEMDIE); | 629 | set_thread_flag(TIF_MEMDIE); |
715 | return; | 630 | return; |
716 | } | 631 | } |
@@ -756,15 +671,16 @@ out: | |||
756 | 671 | ||
757 | /* | 672 | /* |
758 | * The pagefault handler calls here because it is out of memory, so kill a | 673 | * The pagefault handler calls here because it is out of memory, so kill a |
759 | * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel | 674 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
760 | * oom killing is already in progress so do nothing. If a task is found with | 675 | * parallel oom killing is already in progress so do nothing. |
761 | * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. | ||
762 | */ | 676 | */ |
763 | void pagefault_out_of_memory(void) | 677 | void pagefault_out_of_memory(void) |
764 | { | 678 | { |
765 | if (try_set_system_oom()) { | 679 | struct zonelist *zonelist = node_zonelist(first_online_node, |
680 | GFP_KERNEL); | ||
681 | |||
682 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | ||
766 | out_of_memory(NULL, 0, 0, NULL, false); | 683 | out_of_memory(NULL, 0, 0, NULL, false); |
767 | clear_system_oom(); | 684 | clear_zonelist_oom(zonelist, GFP_KERNEL); |
768 | } | 685 | } |
769 | schedule_timeout_killable(1); | ||
770 | } | 686 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 830893b2b3c7..0713bfbf0954 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -201,6 +201,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
201 | zone_reclaimable_pages(z) - z->dirty_balance_reserve; | 201 | zone_reclaimable_pages(z) - z->dirty_balance_reserve; |
202 | } | 202 | } |
203 | /* | 203 | /* |
204 | * Unreclaimable memory (kernel memory or anonymous memory | ||
205 | * without swap) can bring down the dirtyable pages below | ||
206 | * the zone's dirty balance reserve and the above calculation | ||
207 | * will underflow. However we still want to add in nodes | ||
208 | * which are below threshold (negative values) to get a more | ||
209 | * accurate calculation but make sure that the total never | ||
210 | * underflows. | ||
211 | */ | ||
212 | if ((long)x < 0) | ||
213 | x = 0; | ||
214 | |||
215 | /* | ||
204 | * Make sure that the number of highmem pages is never larger | 216 | * Make sure that the number of highmem pages is never larger |
205 | * than the number of the total dirtyable memory. This can only | 217 | * than the number of the total dirtyable memory. This can only |
206 | * occur in very strange VM situations but we want to make sure | 218 | * occur in very strange VM situations but we want to make sure |
@@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void) | |||
222 | { | 234 | { |
223 | unsigned long x; | 235 | unsigned long x; |
224 | 236 | ||
225 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - | 237 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); |
226 | dirty_balance_reserve; | 238 | x -= min(x, dirty_balance_reserve); |
227 | 239 | ||
228 | if (!vm_highmem_is_dirtyable) | 240 | if (!vm_highmem_is_dirtyable) |
229 | x -= highmem_dirtyable_memory(x); | 241 | x -= highmem_dirtyable_memory(x); |
@@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) | |||
290 | * highmem zone can hold its share of dirty pages, so we don't | 302 | * highmem zone can hold its share of dirty pages, so we don't |
291 | * care about vm_highmem_is_dirtyable here. | 303 | * care about vm_highmem_is_dirtyable here. |
292 | */ | 304 | */ |
293 | return zone_page_state(zone, NR_FREE_PAGES) + | 305 | unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + |
294 | zone_reclaimable_pages(zone) - | 306 | zone_reclaimable_pages(zone); |
295 | zone->dirty_balance_reserve; | 307 | |
308 | /* don't allow this to underflow */ | ||
309 | nr_pages -= min(nr_pages, zone->dirty_balance_reserve); | ||
310 | return nr_pages; | ||
296 | } | 311 | } |
297 | 312 | ||
298 | /** | 313 | /** |
@@ -1069,7 +1084,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
1069 | } | 1084 | } |
1070 | 1085 | ||
1071 | /* | 1086 | /* |
1072 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() | 1087 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited() |
1073 | * will look to see if it needs to start dirty throttling. | 1088 | * will look to see if it needs to start dirty throttling. |
1074 | * | 1089 | * |
1075 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive | 1090 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive |
@@ -1436,9 +1451,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); | |||
1436 | DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | 1451 | DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; |
1437 | 1452 | ||
1438 | /** | 1453 | /** |
1439 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 1454 | * balance_dirty_pages_ratelimited - balance dirty memory state |
1440 | * @mapping: address_space which was dirtied | 1455 | * @mapping: address_space which was dirtied |
1441 | * @nr_pages_dirtied: number of pages which the caller has just dirtied | ||
1442 | * | 1456 | * |
1443 | * Processes which are dirtying memory should call in here once for each page | 1457 | * Processes which are dirtying memory should call in here once for each page |
1444 | * which was newly dirtied. The function will periodically check the system's | 1458 | * which was newly dirtied. The function will periodically check the system's |
@@ -1449,8 +1463,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |||
1449 | * limit we decrease the ratelimiting by a lot, to prevent individual processes | 1463 | * limit we decrease the ratelimiting by a lot, to prevent individual processes |
1450 | * from overshooting the limit by (ratelimit_pages) each. | 1464 | * from overshooting the limit by (ratelimit_pages) each. |
1451 | */ | 1465 | */ |
1452 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 1466 | void balance_dirty_pages_ratelimited(struct address_space *mapping) |
1453 | unsigned long nr_pages_dirtied) | ||
1454 | { | 1467 | { |
1455 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1468 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1456 | int ratelimit; | 1469 | int ratelimit; |
@@ -1484,6 +1497,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
1484 | */ | 1497 | */ |
1485 | p = &__get_cpu_var(dirty_throttle_leaks); | 1498 | p = &__get_cpu_var(dirty_throttle_leaks); |
1486 | if (*p > 0 && current->nr_dirtied < ratelimit) { | 1499 | if (*p > 0 && current->nr_dirtied < ratelimit) { |
1500 | unsigned long nr_pages_dirtied; | ||
1487 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | 1501 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); |
1488 | *p -= nr_pages_dirtied; | 1502 | *p -= nr_pages_dirtied; |
1489 | current->nr_dirtied += nr_pages_dirtied; | 1503 | current->nr_dirtied += nr_pages_dirtied; |
@@ -1493,7 +1507,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
1493 | if (unlikely(current->nr_dirtied >= ratelimit)) | 1507 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1494 | balance_dirty_pages(mapping, current->nr_dirtied); | 1508 | balance_dirty_pages(mapping, current->nr_dirtied); |
1495 | } | 1509 | } |
1496 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | 1510 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited); |
1497 | 1511 | ||
1498 | void throttle_vm_writeout(gfp_t gfp_mask) | 1512 | void throttle_vm_writeout(gfp_t gfp_mask) |
1499 | { | 1513 | { |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7bb35ac0964a..df2022ff0c8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
90 | #ifdef CONFIG_HIGHMEM | 90 | #ifdef CONFIG_HIGHMEM |
91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
92 | #endif | 92 | #endif |
93 | #ifdef CONFIG_MOVABLE_NODE | ||
94 | [N_MEMORY] = { { [0] = 1UL } }, | ||
95 | #endif | ||
93 | [N_CPU] = { { [0] = 1UL } }, | 96 | [N_CPU] = { { [0] = 1UL } }, |
94 | #endif /* NUMA */ | 97 | #endif /* NUMA */ |
95 | }; | 98 | }; |
@@ -218,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
218 | 221 | ||
219 | int page_group_by_mobility_disabled __read_mostly; | 222 | int page_group_by_mobility_disabled __read_mostly; |
220 | 223 | ||
221 | /* | ||
222 | * NOTE: | ||
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | ||
224 | * Instead, use {un}set_pageblock_isolate. | ||
225 | */ | ||
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | 224 | void set_pageblock_migratetype(struct page *page, int migratetype) |
227 | { | 225 | { |
228 | 226 | ||
@@ -368,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
368 | int nr_pages = 1 << order; | 366 | int nr_pages = 1 << order; |
369 | int bad = 0; | 367 | int bad = 0; |
370 | 368 | ||
371 | if (unlikely(compound_order(page) != order) || | 369 | if (unlikely(compound_order(page) != order)) { |
372 | unlikely(!PageHead(page))) { | ||
373 | bad_page(page); | 370 | bad_page(page); |
374 | bad++; | 371 | bad++; |
375 | } | 372 | } |
@@ -523,7 +520,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
523 | * If a block is freed, and its buddy is also free, then this | 520 | * If a block is freed, and its buddy is also free, then this |
524 | * triggers coalescing into a block of larger size. | 521 | * triggers coalescing into a block of larger size. |
525 | * | 522 | * |
526 | * -- wli | 523 | * -- nyc |
527 | */ | 524 | */ |
528 | 525 | ||
529 | static inline void __free_one_page(struct page *page, | 526 | static inline void __free_one_page(struct page *page, |
@@ -608,6 +605,7 @@ static inline int free_pages_check(struct page *page) | |||
608 | bad_page(page); | 605 | bad_page(page); |
609 | return 1; | 606 | return 1; |
610 | } | 607 | } |
608 | reset_page_last_nid(page); | ||
611 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 609 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
612 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 610 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
613 | return 0; | 611 | return 0; |
@@ -667,11 +665,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 665 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
668 | __free_one_page(page, zone, 0, mt); | 666 | __free_one_page(page, zone, 0, mt); |
669 | trace_mm_page_pcpu_drain(page, 0, mt); | 667 | trace_mm_page_pcpu_drain(page, 0, mt); |
670 | if (is_migrate_cma(mt)) | 668 | if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { |
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | 669 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
670 | if (is_migrate_cma(mt)) | ||
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | ||
672 | } | ||
672 | } while (--to_free && --batch_free && !list_empty(list)); | 673 | } while (--to_free && --batch_free && !list_empty(list)); |
673 | } | 674 | } |
674 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | ||
675 | spin_unlock(&zone->lock); | 675 | spin_unlock(&zone->lock); |
676 | } | 676 | } |
677 | 677 | ||
@@ -730,6 +730,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
730 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
731 | } | 731 | } |
732 | 732 | ||
733 | /* | ||
734 | * Read access to zone->managed_pages is safe because it's unsigned long, | ||
735 | * but we still need to serialize writers. Currently all callers of | ||
736 | * __free_pages_bootmem() except put_page_bootmem() should only be used | ||
737 | * at boot time. So for shorter boot time, we shift the burden to | ||
738 | * put_page_bootmem() to serialize writers. | ||
739 | */ | ||
733 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 740 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
734 | { | 741 | { |
735 | unsigned int nr_pages = 1 << order; | 742 | unsigned int nr_pages = 1 << order; |
@@ -745,6 +752,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
745 | set_page_count(p, 0); | 752 | set_page_count(p, 0); |
746 | } | 753 | } |
747 | 754 | ||
755 | page_zone(page)->managed_pages += 1 << order; | ||
748 | set_page_refcounted(page); | 756 | set_page_refcounted(page); |
749 | __free_pages(page, order); | 757 | __free_pages(page, order); |
750 | } | 758 | } |
@@ -780,7 +788,7 @@ void __init init_cma_reserved_pageblock(struct page *page) | |||
780 | * large block of memory acted on by a series of small allocations. | 788 | * large block of memory acted on by a series of small allocations. |
781 | * This behavior is a critical factor in sglist merging's success. | 789 | * This behavior is a critical factor in sglist merging's success. |
782 | * | 790 | * |
783 | * -- wli | 791 | * -- nyc |
784 | */ | 792 | */ |
785 | static inline void expand(struct zone *zone, struct page *page, | 793 | static inline void expand(struct zone *zone, struct page *page, |
786 | int low, int high, struct free_area *area, | 794 | int low, int high, struct free_area *area, |
@@ -1376,14 +1384,8 @@ void split_page(struct page *page, unsigned int order) | |||
1376 | set_page_refcounted(page + i); | 1384 | set_page_refcounted(page + i); |
1377 | } | 1385 | } |
1378 | 1386 | ||
1379 | /* | 1387 | static int __isolate_free_page(struct page *page, unsigned int order) |
1380 | * Similar to the split_page family of functions except that the page | ||
1381 | * required at the given order and being isolated now to prevent races | ||
1382 | * with parallel allocators | ||
1383 | */ | ||
1384 | int capture_free_page(struct page *page, int alloc_order, int migratetype) | ||
1385 | { | 1388 | { |
1386 | unsigned int order; | ||
1387 | unsigned long watermark; | 1389 | unsigned long watermark; |
1388 | struct zone *zone; | 1390 | struct zone *zone; |
1389 | int mt; | 1391 | int mt; |
@@ -1391,27 +1393,23 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1391 | BUG_ON(!PageBuddy(page)); | 1393 | BUG_ON(!PageBuddy(page)); |
1392 | 1394 | ||
1393 | zone = page_zone(page); | 1395 | zone = page_zone(page); |
1394 | order = page_order(page); | 1396 | mt = get_pageblock_migratetype(page); |
1395 | 1397 | ||
1396 | /* Obey watermarks as if the page was being allocated */ | 1398 | if (mt != MIGRATE_ISOLATE) { |
1397 | watermark = low_wmark_pages(zone) + (1 << order); | 1399 | /* Obey watermarks as if the page was being allocated */ |
1398 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1400 | watermark = low_wmark_pages(zone) + (1 << order); |
1399 | return 0; | 1401 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
1402 | return 0; | ||
1403 | |||
1404 | __mod_zone_freepage_state(zone, -(1UL << order), mt); | ||
1405 | } | ||
1400 | 1406 | ||
1401 | /* Remove page from free list */ | 1407 | /* Remove page from free list */ |
1402 | list_del(&page->lru); | 1408 | list_del(&page->lru); |
1403 | zone->free_area[order].nr_free--; | 1409 | zone->free_area[order].nr_free--; |
1404 | rmv_page_order(page); | 1410 | rmv_page_order(page); |
1405 | 1411 | ||
1406 | mt = get_pageblock_migratetype(page); | 1412 | /* Set the pageblock if the isolated page is at least a pageblock */ |
1407 | if (unlikely(mt != MIGRATE_ISOLATE)) | ||
1408 | __mod_zone_freepage_state(zone, -(1UL << order), mt); | ||
1409 | |||
1410 | if (alloc_order != order) | ||
1411 | expand(zone, page, alloc_order, order, | ||
1412 | &zone->free_area[order], migratetype); | ||
1413 | |||
1414 | /* Set the pageblock if the captured page is at least a pageblock */ | ||
1415 | if (order >= pageblock_order - 1) { | 1413 | if (order >= pageblock_order - 1) { |
1416 | struct page *endpage = page + (1 << order) - 1; | 1414 | struct page *endpage = page + (1 << order) - 1; |
1417 | for (; page < endpage; page += pageblock_nr_pages) { | 1415 | for (; page < endpage; page += pageblock_nr_pages) { |
@@ -1440,10 +1438,9 @@ int split_free_page(struct page *page) | |||
1440 | unsigned int order; | 1438 | unsigned int order; |
1441 | int nr_pages; | 1439 | int nr_pages; |
1442 | 1440 | ||
1443 | BUG_ON(!PageBuddy(page)); | ||
1444 | order = page_order(page); | 1441 | order = page_order(page); |
1445 | 1442 | ||
1446 | nr_pages = capture_free_page(page, order, 0); | 1443 | nr_pages = __isolate_free_page(page, order); |
1447 | if (!nr_pages) | 1444 | if (!nr_pages) |
1448 | return 0; | 1445 | return 0; |
1449 | 1446 | ||
@@ -1641,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1641 | return true; | 1638 | return true; |
1642 | } | 1639 | } |
1643 | 1640 | ||
1644 | #ifdef CONFIG_MEMORY_ISOLATION | ||
1645 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1646 | { | ||
1647 | if (unlikely(zone->nr_pageblock_isolate)) | ||
1648 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | ||
1649 | return 0; | ||
1650 | } | ||
1651 | #else | ||
1652 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1653 | { | ||
1654 | return 0; | ||
1655 | } | ||
1656 | #endif | ||
1657 | |||
1658 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1659 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1660 | { | 1643 | { |
@@ -1670,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1670 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1671 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1672 | 1655 | ||
1673 | /* | ||
1674 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | ||
1675 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | ||
1676 | * sleep although it could do so. But this is more desirable for memory | ||
1677 | * hotplug than sleeping which can cause a livelock in the direct | ||
1678 | * reclaim path. | ||
1679 | */ | ||
1680 | free_pages -= nr_zone_isolate_freepages(z); | ||
1681 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1656 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1682 | free_pages); | 1657 | free_pages); |
1683 | } | 1658 | } |
@@ -1692,7 +1667,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1692 | * | 1667 | * |
1693 | * If the zonelist cache is present in the passed in zonelist, then | 1668 | * If the zonelist cache is present in the passed in zonelist, then |
1694 | * returns a pointer to the allowed node mask (either the current | 1669 | * returns a pointer to the allowed node mask (either the current |
1695 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) | 1670 | * tasks mems_allowed, or node_states[N_MEMORY].) |
1696 | * | 1671 | * |
1697 | * If the zonelist cache is not available for this zonelist, does | 1672 | * If the zonelist cache is not available for this zonelist, does |
1698 | * nothing and returns NULL. | 1673 | * nothing and returns NULL. |
@@ -1721,7 +1696,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1721 | 1696 | ||
1722 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1697 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1723 | &cpuset_current_mems_allowed : | 1698 | &cpuset_current_mems_allowed : |
1724 | &node_states[N_HIGH_MEMORY]; | 1699 | &node_states[N_MEMORY]; |
1725 | return allowednodes; | 1700 | return allowednodes; |
1726 | } | 1701 | } |
1727 | 1702 | ||
@@ -1871,7 +1846,7 @@ zonelist_scan: | |||
1871 | */ | 1846 | */ |
1872 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1847 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1873 | high_zoneidx, nodemask) { | 1848 | high_zoneidx, nodemask) { |
1874 | if (NUMA_BUILD && zlc_active && | 1849 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1875 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1850 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1876 | continue; | 1851 | continue; |
1877 | if ((alloc_flags & ALLOC_CPUSET) && | 1852 | if ((alloc_flags & ALLOC_CPUSET) && |
@@ -1917,7 +1892,8 @@ zonelist_scan: | |||
1917 | classzone_idx, alloc_flags)) | 1892 | classzone_idx, alloc_flags)) |
1918 | goto try_this_zone; | 1893 | goto try_this_zone; |
1919 | 1894 | ||
1920 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | 1895 | if (IS_ENABLED(CONFIG_NUMA) && |
1896 | !did_zlc_setup && nr_online_nodes > 1) { | ||
1921 | /* | 1897 | /* |
1922 | * we do zlc_setup if there are multiple nodes | 1898 | * we do zlc_setup if there are multiple nodes |
1923 | * and before considering the first zone allowed | 1899 | * and before considering the first zone allowed |
@@ -1936,7 +1912,7 @@ zonelist_scan: | |||
1936 | * As we may have just activated ZLC, check if the first | 1912 | * As we may have just activated ZLC, check if the first |
1937 | * eligible zone has failed zone_reclaim recently. | 1913 | * eligible zone has failed zone_reclaim recently. |
1938 | */ | 1914 | */ |
1939 | if (NUMA_BUILD && zlc_active && | 1915 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1940 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1916 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1941 | continue; | 1917 | continue; |
1942 | 1918 | ||
@@ -1962,11 +1938,11 @@ try_this_zone: | |||
1962 | if (page) | 1938 | if (page) |
1963 | break; | 1939 | break; |
1964 | this_zone_full: | 1940 | this_zone_full: |
1965 | if (NUMA_BUILD) | 1941 | if (IS_ENABLED(CONFIG_NUMA)) |
1966 | zlc_mark_zone_full(zonelist, z); | 1942 | zlc_mark_zone_full(zonelist, z); |
1967 | } | 1943 | } |
1968 | 1944 | ||
1969 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1945 | if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { |
1970 | /* Disable zlc cache for second zonelist scan */ | 1946 | /* Disable zlc cache for second zonelist scan */ |
1971 | zlc_active = 0; | 1947 | zlc_active = 0; |
1972 | goto zonelist_scan; | 1948 | goto zonelist_scan; |
@@ -2148,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2148 | bool *contended_compaction, bool *deferred_compaction, | 2124 | bool *contended_compaction, bool *deferred_compaction, |
2149 | unsigned long *did_some_progress) | 2125 | unsigned long *did_some_progress) |
2150 | { | 2126 | { |
2151 | struct page *page = NULL; | ||
2152 | |||
2153 | if (!order) | 2127 | if (!order) |
2154 | return NULL; | 2128 | return NULL; |
2155 | 2129 | ||
@@ -2161,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2161 | current->flags |= PF_MEMALLOC; | 2135 | current->flags |= PF_MEMALLOC; |
2162 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2136 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2163 | nodemask, sync_migration, | 2137 | nodemask, sync_migration, |
2164 | contended_compaction, &page); | 2138 | contended_compaction); |
2165 | current->flags &= ~PF_MEMALLOC; | 2139 | current->flags &= ~PF_MEMALLOC; |
2166 | 2140 | ||
2167 | /* If compaction captured a page, prep and use it */ | ||
2168 | if (page) { | ||
2169 | prep_new_page(page, order, gfp_mask); | ||
2170 | goto got_page; | ||
2171 | } | ||
2172 | |||
2173 | if (*did_some_progress != COMPACT_SKIPPED) { | 2141 | if (*did_some_progress != COMPACT_SKIPPED) { |
2142 | struct page *page; | ||
2143 | |||
2174 | /* Page migration frees to the PCP lists but we want merging */ | 2144 | /* Page migration frees to the PCP lists but we want merging */ |
2175 | drain_pages(get_cpu()); | 2145 | drain_pages(get_cpu()); |
2176 | put_cpu(); | 2146 | put_cpu(); |
@@ -2180,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2180 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2150 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2181 | preferred_zone, migratetype); | 2151 | preferred_zone, migratetype); |
2182 | if (page) { | 2152 | if (page) { |
2183 | got_page: | ||
2184 | preferred_zone->compact_blockskip_flush = false; | 2153 | preferred_zone->compact_blockskip_flush = false; |
2185 | preferred_zone->compact_considered = 0; | 2154 | preferred_zone->compact_considered = 0; |
2186 | preferred_zone->compact_defer_shift = 0; | 2155 | preferred_zone->compact_defer_shift = 0; |
@@ -2266,7 +2235,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2266 | return NULL; | 2235 | return NULL; |
2267 | 2236 | ||
2268 | /* After successful reclaim, reconsider all zones for allocation */ | 2237 | /* After successful reclaim, reconsider all zones for allocation */ |
2269 | if (NUMA_BUILD) | 2238 | if (IS_ENABLED(CONFIG_NUMA)) |
2270 | zlc_clear_zones_full(zonelist); | 2239 | zlc_clear_zones_full(zonelist); |
2271 | 2240 | ||
2272 | retry: | 2241 | retry: |
@@ -2412,12 +2381,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2412 | * allowed per node queues are empty and that nodes are | 2381 | * allowed per node queues are empty and that nodes are |
2413 | * over allocated. | 2382 | * over allocated. |
2414 | */ | 2383 | */ |
2415 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2384 | if (IS_ENABLED(CONFIG_NUMA) && |
2385 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
2416 | goto nopage; | 2386 | goto nopage; |
2417 | 2387 | ||
2418 | restart: | 2388 | restart: |
2419 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2389 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2420 | zone_idx(preferred_zone)); | 2390 | wake_all_kswapd(order, zonelist, high_zoneidx, |
2391 | zone_idx(preferred_zone)); | ||
2421 | 2392 | ||
2422 | /* | 2393 | /* |
2423 | * OK, we're below the kswapd watermark and have kicked background | 2394 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2494,7 +2465,7 @@ rebalance: | |||
2494 | * system then fail the allocation instead of entering direct reclaim. | 2465 | * system then fail the allocation instead of entering direct reclaim. |
2495 | */ | 2466 | */ |
2496 | if ((deferred_compaction || contended_compaction) && | 2467 | if ((deferred_compaction || contended_compaction) && |
2497 | (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) | 2468 | (gfp_mask & __GFP_NO_KSWAPD)) |
2498 | goto nopage; | 2469 | goto nopage; |
2499 | 2470 | ||
2500 | /* Try direct reclaim and then allocating */ | 2471 | /* Try direct reclaim and then allocating */ |
@@ -2595,6 +2566,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2595 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2566 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2596 | unsigned int cpuset_mems_cookie; | 2567 | unsigned int cpuset_mems_cookie; |
2597 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | 2568 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; |
2569 | struct mem_cgroup *memcg = NULL; | ||
2598 | 2570 | ||
2599 | gfp_mask &= gfp_allowed_mask; | 2571 | gfp_mask &= gfp_allowed_mask; |
2600 | 2572 | ||
@@ -2613,6 +2585,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2613 | if (unlikely(!zonelist->_zonerefs->zone)) | 2585 | if (unlikely(!zonelist->_zonerefs->zone)) |
2614 | return NULL; | 2586 | return NULL; |
2615 | 2587 | ||
2588 | /* | ||
2589 | * Will only have any effect when __GFP_KMEMCG is set. This is | ||
2590 | * verified in the (always inline) callee | ||
2591 | */ | ||
2592 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2593 | return NULL; | ||
2594 | |||
2616 | retry_cpuset: | 2595 | retry_cpuset: |
2617 | cpuset_mems_cookie = get_mems_allowed(); | 2596 | cpuset_mems_cookie = get_mems_allowed(); |
2618 | 2597 | ||
@@ -2648,6 +2627,8 @@ out: | |||
2648 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2627 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
2649 | goto retry_cpuset; | 2628 | goto retry_cpuset; |
2650 | 2629 | ||
2630 | memcg_kmem_commit_charge(page, memcg, order); | ||
2631 | |||
2651 | return page; | 2632 | return page; |
2652 | } | 2633 | } |
2653 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2634 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
@@ -2700,6 +2681,31 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2700 | 2681 | ||
2701 | EXPORT_SYMBOL(free_pages); | 2682 | EXPORT_SYMBOL(free_pages); |
2702 | 2683 | ||
2684 | /* | ||
2685 | * __free_memcg_kmem_pages and free_memcg_kmem_pages will free | ||
2686 | * pages allocated with __GFP_KMEMCG. | ||
2687 | * | ||
2688 | * Those pages are accounted to a particular memcg, embedded in the | ||
2689 | * corresponding page_cgroup. To avoid adding a hit in the allocator to search | ||
2690 | * for that information only to find out that it is NULL for users who have no | ||
2691 | * interest in that whatsoever, we provide these functions. | ||
2692 | * | ||
2693 | * The caller knows better which flags it relies on. | ||
2694 | */ | ||
2695 | void __free_memcg_kmem_pages(struct page *page, unsigned int order) | ||
2696 | { | ||
2697 | memcg_kmem_uncharge_pages(page, order); | ||
2698 | __free_pages(page, order); | ||
2699 | } | ||
2700 | |||
2701 | void free_memcg_kmem_pages(unsigned long addr, unsigned int order) | ||
2702 | { | ||
2703 | if (addr != 0) { | ||
2704 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | ||
2705 | __free_memcg_kmem_pages(virt_to_page((void *)addr), order); | ||
2706 | } | ||
2707 | } | ||
2708 | |||
2703 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | 2709 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) |
2704 | { | 2710 | { |
2705 | if (addr) { | 2711 | if (addr) { |
@@ -2818,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void) | |||
2818 | 2824 | ||
2819 | static inline void show_node(struct zone *zone) | 2825 | static inline void show_node(struct zone *zone) |
2820 | { | 2826 | { |
2821 | if (NUMA_BUILD) | 2827 | if (IS_ENABLED(CONFIG_NUMA)) |
2822 | printk("Node %d ", zone_to_nid(zone)); | 2828 | printk("Node %d ", zone_to_nid(zone)); |
2823 | } | 2829 | } |
2824 | 2830 | ||
@@ -2876,6 +2882,31 @@ out: | |||
2876 | 2882 | ||
2877 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 2883 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
2878 | 2884 | ||
2885 | static void show_migration_types(unsigned char type) | ||
2886 | { | ||
2887 | static const char types[MIGRATE_TYPES] = { | ||
2888 | [MIGRATE_UNMOVABLE] = 'U', | ||
2889 | [MIGRATE_RECLAIMABLE] = 'E', | ||
2890 | [MIGRATE_MOVABLE] = 'M', | ||
2891 | [MIGRATE_RESERVE] = 'R', | ||
2892 | #ifdef CONFIG_CMA | ||
2893 | [MIGRATE_CMA] = 'C', | ||
2894 | #endif | ||
2895 | [MIGRATE_ISOLATE] = 'I', | ||
2896 | }; | ||
2897 | char tmp[MIGRATE_TYPES + 1]; | ||
2898 | char *p = tmp; | ||
2899 | int i; | ||
2900 | |||
2901 | for (i = 0; i < MIGRATE_TYPES; i++) { | ||
2902 | if (type & (1 << i)) | ||
2903 | *p++ = types[i]; | ||
2904 | } | ||
2905 | |||
2906 | *p = '\0'; | ||
2907 | printk("(%s) ", tmp); | ||
2908 | } | ||
2909 | |||
2879 | /* | 2910 | /* |
2880 | * Show free area list (used inside shift_scroll-lock stuff) | 2911 | * Show free area list (used inside shift_scroll-lock stuff) |
2881 | * We also calculate the percentage fragmentation. We do this by counting the | 2912 | * We also calculate the percentage fragmentation. We do this by counting the |
@@ -2950,6 +2981,7 @@ void show_free_areas(unsigned int filter) | |||
2950 | " isolated(anon):%lukB" | 2981 | " isolated(anon):%lukB" |
2951 | " isolated(file):%lukB" | 2982 | " isolated(file):%lukB" |
2952 | " present:%lukB" | 2983 | " present:%lukB" |
2984 | " managed:%lukB" | ||
2953 | " mlocked:%lukB" | 2985 | " mlocked:%lukB" |
2954 | " dirty:%lukB" | 2986 | " dirty:%lukB" |
2955 | " writeback:%lukB" | 2987 | " writeback:%lukB" |
@@ -2979,6 +3011,7 @@ void show_free_areas(unsigned int filter) | |||
2979 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 3011 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
2980 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | 3012 | K(zone_page_state(zone, NR_ISOLATED_FILE)), |
2981 | K(zone->present_pages), | 3013 | K(zone->present_pages), |
3014 | K(zone->managed_pages), | ||
2982 | K(zone_page_state(zone, NR_MLOCK)), | 3015 | K(zone_page_state(zone, NR_MLOCK)), |
2983 | K(zone_page_state(zone, NR_FILE_DIRTY)), | 3016 | K(zone_page_state(zone, NR_FILE_DIRTY)), |
2984 | K(zone_page_state(zone, NR_WRITEBACK)), | 3017 | K(zone_page_state(zone, NR_WRITEBACK)), |
@@ -3004,6 +3037,7 @@ void show_free_areas(unsigned int filter) | |||
3004 | 3037 | ||
3005 | for_each_populated_zone(zone) { | 3038 | for_each_populated_zone(zone) { |
3006 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 3039 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
3040 | unsigned char types[MAX_ORDER]; | ||
3007 | 3041 | ||
3008 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3042 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
3009 | continue; | 3043 | continue; |
@@ -3012,12 +3046,24 @@ void show_free_areas(unsigned int filter) | |||
3012 | 3046 | ||
3013 | spin_lock_irqsave(&zone->lock, flags); | 3047 | spin_lock_irqsave(&zone->lock, flags); |
3014 | for (order = 0; order < MAX_ORDER; order++) { | 3048 | for (order = 0; order < MAX_ORDER; order++) { |
3015 | nr[order] = zone->free_area[order].nr_free; | 3049 | struct free_area *area = &zone->free_area[order]; |
3050 | int type; | ||
3051 | |||
3052 | nr[order] = area->nr_free; | ||
3016 | total += nr[order] << order; | 3053 | total += nr[order] << order; |
3054 | |||
3055 | types[order] = 0; | ||
3056 | for (type = 0; type < MIGRATE_TYPES; type++) { | ||
3057 | if (!list_empty(&area->free_list[type])) | ||
3058 | types[order] |= 1 << type; | ||
3059 | } | ||
3017 | } | 3060 | } |
3018 | spin_unlock_irqrestore(&zone->lock, flags); | 3061 | spin_unlock_irqrestore(&zone->lock, flags); |
3019 | for (order = 0; order < MAX_ORDER; order++) | 3062 | for (order = 0; order < MAX_ORDER; order++) { |
3020 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | 3063 | printk("%lu*%lukB ", nr[order], K(1UL) << order); |
3064 | if (nr[order]) | ||
3065 | show_migration_types(types[order]); | ||
3066 | } | ||
3021 | printk("= %lukB\n", K(total)); | 3067 | printk("= %lukB\n", K(total)); |
3022 | } | 3068 | } |
3023 | 3069 | ||
@@ -3194,7 +3240,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
3194 | return node; | 3240 | return node; |
3195 | } | 3241 | } |
3196 | 3242 | ||
3197 | for_each_node_state(n, N_HIGH_MEMORY) { | 3243 | for_each_node_state(n, N_MEMORY) { |
3198 | 3244 | ||
3199 | /* Don't want a node to appear more than once */ | 3245 | /* Don't want a node to appear more than once */ |
3200 | if (node_isset(n, *used_node_mask)) | 3246 | if (node_isset(n, *used_node_mask)) |
@@ -3336,7 +3382,7 @@ static int default_zonelist_order(void) | |||
3336 | * local memory, NODE_ORDER may be suitable. | 3382 | * local memory, NODE_ORDER may be suitable. |
3337 | */ | 3383 | */ |
3338 | average_size = total_size / | 3384 | average_size = total_size / |
3339 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 3385 | (nodes_weight(node_states[N_MEMORY]) + 1); |
3340 | for_each_online_node(nid) { | 3386 | for_each_online_node(nid) { |
3341 | low_kmem_size = 0; | 3387 | low_kmem_size = 0; |
3342 | total_size = 0; | 3388 | total_size = 0; |
@@ -3826,6 +3872,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3826 | mminit_verify_page_links(page, zone, nid, pfn); | 3872 | mminit_verify_page_links(page, zone, nid, pfn); |
3827 | init_page_count(page); | 3873 | init_page_count(page); |
3828 | reset_page_mapcount(page); | 3874 | reset_page_mapcount(page); |
3875 | reset_page_last_nid(page); | ||
3829 | SetPageReserved(page); | 3876 | SetPageReserved(page); |
3830 | /* | 3877 | /* |
3831 | * Mark the block movable so that blocks are reserved for | 3878 | * Mark the block movable so that blocks are reserved for |
@@ -4432,6 +4479,26 @@ void __init set_pageblock_order(void) | |||
4432 | 4479 | ||
4433 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4480 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4434 | 4481 | ||
4482 | static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | ||
4483 | unsigned long present_pages) | ||
4484 | { | ||
4485 | unsigned long pages = spanned_pages; | ||
4486 | |||
4487 | /* | ||
4488 | * Provide a more accurate estimation if there are holes within | ||
4489 | * the zone and SPARSEMEM is in use. If there are holes within the | ||
4490 | * zone, each populated memory region may cost us one or two extra | ||
4491 | * memmap pages due to alignment because memmap pages for each | ||
4492 | * populated regions may not naturally algined on page boundary. | ||
4493 | * So the (present_pages >> 4) heuristic is a tradeoff for that. | ||
4494 | */ | ||
4495 | if (spanned_pages > present_pages + (present_pages >> 4) && | ||
4496 | IS_ENABLED(CONFIG_SPARSEMEM)) | ||
4497 | pages = present_pages; | ||
4498 | |||
4499 | return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; | ||
4500 | } | ||
4501 | |||
4435 | /* | 4502 | /* |
4436 | * Set up the zone data structures: | 4503 | * Set up the zone data structures: |
4437 | * - mark all pages reserved | 4504 | * - mark all pages reserved |
@@ -4449,54 +4516,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4449 | int ret; | 4516 | int ret; |
4450 | 4517 | ||
4451 | pgdat_resize_init(pgdat); | 4518 | pgdat_resize_init(pgdat); |
4519 | #ifdef CONFIG_NUMA_BALANCING | ||
4520 | spin_lock_init(&pgdat->numabalancing_migrate_lock); | ||
4521 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
4522 | pgdat->numabalancing_migrate_next_window = jiffies; | ||
4523 | #endif | ||
4452 | init_waitqueue_head(&pgdat->kswapd_wait); | 4524 | init_waitqueue_head(&pgdat->kswapd_wait); |
4453 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4525 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4454 | pgdat_page_cgroup_init(pgdat); | 4526 | pgdat_page_cgroup_init(pgdat); |
4455 | 4527 | ||
4456 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4528 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4457 | struct zone *zone = pgdat->node_zones + j; | 4529 | struct zone *zone = pgdat->node_zones + j; |
4458 | unsigned long size, realsize, memmap_pages; | 4530 | unsigned long size, realsize, freesize, memmap_pages; |
4459 | 4531 | ||
4460 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4532 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4461 | realsize = size - zone_absent_pages_in_node(nid, j, | 4533 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, |
4462 | zholes_size); | 4534 | zholes_size); |
4463 | 4535 | ||
4464 | /* | 4536 | /* |
4465 | * Adjust realsize so that it accounts for how much memory | 4537 | * Adjust freesize so that it accounts for how much memory |
4466 | * is used by this zone for memmap. This affects the watermark | 4538 | * is used by this zone for memmap. This affects the watermark |
4467 | * and per-cpu initialisations | 4539 | * and per-cpu initialisations |
4468 | */ | 4540 | */ |
4469 | memmap_pages = | 4541 | memmap_pages = calc_memmap_size(size, realsize); |
4470 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 4542 | if (freesize >= memmap_pages) { |
4471 | if (realsize >= memmap_pages) { | 4543 | freesize -= memmap_pages; |
4472 | realsize -= memmap_pages; | ||
4473 | if (memmap_pages) | 4544 | if (memmap_pages) |
4474 | printk(KERN_DEBUG | 4545 | printk(KERN_DEBUG |
4475 | " %s zone: %lu pages used for memmap\n", | 4546 | " %s zone: %lu pages used for memmap\n", |
4476 | zone_names[j], memmap_pages); | 4547 | zone_names[j], memmap_pages); |
4477 | } else | 4548 | } else |
4478 | printk(KERN_WARNING | 4549 | printk(KERN_WARNING |
4479 | " %s zone: %lu pages exceeds realsize %lu\n", | 4550 | " %s zone: %lu pages exceeds freesize %lu\n", |
4480 | zone_names[j], memmap_pages, realsize); | 4551 | zone_names[j], memmap_pages, freesize); |
4481 | 4552 | ||
4482 | /* Account for reserved pages */ | 4553 | /* Account for reserved pages */ |
4483 | if (j == 0 && realsize > dma_reserve) { | 4554 | if (j == 0 && freesize > dma_reserve) { |
4484 | realsize -= dma_reserve; | 4555 | freesize -= dma_reserve; |
4485 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 4556 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
4486 | zone_names[0], dma_reserve); | 4557 | zone_names[0], dma_reserve); |
4487 | } | 4558 | } |
4488 | 4559 | ||
4489 | if (!is_highmem_idx(j)) | 4560 | if (!is_highmem_idx(j)) |
4490 | nr_kernel_pages += realsize; | 4561 | nr_kernel_pages += freesize; |
4491 | nr_all_pages += realsize; | 4562 | /* Charge for highmem memmap if there are enough kernel pages */ |
4563 | else if (nr_kernel_pages > memmap_pages * 2) | ||
4564 | nr_kernel_pages -= memmap_pages; | ||
4565 | nr_all_pages += freesize; | ||
4492 | 4566 | ||
4493 | zone->spanned_pages = size; | 4567 | zone->spanned_pages = size; |
4494 | zone->present_pages = realsize; | 4568 | zone->present_pages = freesize; |
4569 | /* | ||
4570 | * Set an approximate value for lowmem here, it will be adjusted | ||
4571 | * when the bootmem allocator frees pages into the buddy system. | ||
4572 | * And all highmem pages will be managed by the buddy system. | ||
4573 | */ | ||
4574 | zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; | ||
4495 | #ifdef CONFIG_NUMA | 4575 | #ifdef CONFIG_NUMA |
4496 | zone->node = nid; | 4576 | zone->node = nid; |
4497 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4577 | zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) |
4498 | / 100; | 4578 | / 100; |
4499 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 4579 | zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; |
4500 | #endif | 4580 | #endif |
4501 | zone->name = zone_names[j]; | 4581 | zone->name = zone_names[j]; |
4502 | spin_lock_init(&zone->lock); | 4582 | spin_lock_init(&zone->lock); |
@@ -4687,7 +4767,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
4687 | /* | 4767 | /* |
4688 | * early_calculate_totalpages() | 4768 | * early_calculate_totalpages() |
4689 | * Sum pages in active regions for movable zone. | 4769 | * Sum pages in active regions for movable zone. |
4690 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 4770 | * Populate N_MEMORY for calculating usable_nodes. |
4691 | */ | 4771 | */ |
4692 | static unsigned long __init early_calculate_totalpages(void) | 4772 | static unsigned long __init early_calculate_totalpages(void) |
4693 | { | 4773 | { |
@@ -4700,7 +4780,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4700 | 4780 | ||
4701 | totalpages += pages; | 4781 | totalpages += pages; |
4702 | if (pages) | 4782 | if (pages) |
4703 | node_set_state(nid, N_HIGH_MEMORY); | 4783 | node_set_state(nid, N_MEMORY); |
4704 | } | 4784 | } |
4705 | return totalpages; | 4785 | return totalpages; |
4706 | } | 4786 | } |
@@ -4717,9 +4797,9 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4717 | unsigned long usable_startpfn; | 4797 | unsigned long usable_startpfn; |
4718 | unsigned long kernelcore_node, kernelcore_remaining; | 4798 | unsigned long kernelcore_node, kernelcore_remaining; |
4719 | /* save the state before borrow the nodemask */ | 4799 | /* save the state before borrow the nodemask */ |
4720 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | 4800 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
4721 | unsigned long totalpages = early_calculate_totalpages(); | 4801 | unsigned long totalpages = early_calculate_totalpages(); |
4722 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4802 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
4723 | 4803 | ||
4724 | /* | 4804 | /* |
4725 | * If movablecore was specified, calculate what size of | 4805 | * If movablecore was specified, calculate what size of |
@@ -4754,7 +4834,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4754 | restart: | 4834 | restart: |
4755 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 4835 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
4756 | kernelcore_node = required_kernelcore / usable_nodes; | 4836 | kernelcore_node = required_kernelcore / usable_nodes; |
4757 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4837 | for_each_node_state(nid, N_MEMORY) { |
4758 | unsigned long start_pfn, end_pfn; | 4838 | unsigned long start_pfn, end_pfn; |
4759 | 4839 | ||
4760 | /* | 4840 | /* |
@@ -4846,23 +4926,27 @@ restart: | |||
4846 | 4926 | ||
4847 | out: | 4927 | out: |
4848 | /* restore the node_state */ | 4928 | /* restore the node_state */ |
4849 | node_states[N_HIGH_MEMORY] = saved_node_state; | 4929 | node_states[N_MEMORY] = saved_node_state; |
4850 | } | 4930 | } |
4851 | 4931 | ||
4852 | /* Any regular memory on that node ? */ | 4932 | /* Any regular or high memory on that node ? */ |
4853 | static void __init check_for_regular_memory(pg_data_t *pgdat) | 4933 | static void check_for_memory(pg_data_t *pgdat, int nid) |
4854 | { | 4934 | { |
4855 | #ifdef CONFIG_HIGHMEM | ||
4856 | enum zone_type zone_type; | 4935 | enum zone_type zone_type; |
4857 | 4936 | ||
4858 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4937 | if (N_MEMORY == N_NORMAL_MEMORY) |
4938 | return; | ||
4939 | |||
4940 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | ||
4859 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4941 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4860 | if (zone->present_pages) { | 4942 | if (zone->present_pages) { |
4861 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4943 | node_set_state(nid, N_HIGH_MEMORY); |
4944 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | ||
4945 | zone_type <= ZONE_NORMAL) | ||
4946 | node_set_state(nid, N_NORMAL_MEMORY); | ||
4862 | break; | 4947 | break; |
4863 | } | 4948 | } |
4864 | } | 4949 | } |
4865 | #endif | ||
4866 | } | 4950 | } |
4867 | 4951 | ||
4868 | /** | 4952 | /** |
@@ -4945,8 +5029,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4945 | 5029 | ||
4946 | /* Any memory on that node */ | 5030 | /* Any memory on that node */ |
4947 | if (pgdat->node_present_pages) | 5031 | if (pgdat->node_present_pages) |
4948 | node_set_state(nid, N_HIGH_MEMORY); | 5032 | node_set_state(nid, N_MEMORY); |
4949 | check_for_regular_memory(pgdat); | 5033 | check_for_memory(pgdat, nid); |
4950 | } | 5034 | } |
4951 | } | 5035 | } |
4952 | 5036 | ||
@@ -5174,10 +5258,6 @@ static void __setup_per_zone_wmarks(void) | |||
5174 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5258 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5175 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5259 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5176 | 5260 | ||
5177 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | ||
5178 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | ||
5179 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | ||
5180 | |||
5181 | setup_zone_migrate_reserve(zone); | 5261 | setup_zone_migrate_reserve(zone); |
5182 | spin_unlock_irqrestore(&zone->lock, flags); | 5262 | spin_unlock_irqrestore(&zone->lock, flags); |
5183 | } | 5263 | } |
@@ -5505,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | |||
5505 | pfn &= (PAGES_PER_SECTION-1); | 5585 | pfn &= (PAGES_PER_SECTION-1); |
5506 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5586 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5507 | #else | 5587 | #else |
5508 | pfn = pfn - zone->zone_start_pfn; | 5588 | pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); |
5509 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5589 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5510 | #endif /* CONFIG_SPARSEMEM */ | 5590 | #endif /* CONFIG_SPARSEMEM */ |
5511 | } | 5591 | } |
@@ -5575,7 +5655,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5575 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't | 5655 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't |
5576 | * expect this function should be exact. | 5656 | * expect this function should be exact. |
5577 | */ | 5657 | */ |
5578 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | 5658 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
5659 | bool skip_hwpoisoned_pages) | ||
5579 | { | 5660 | { |
5580 | unsigned long pfn, iter, found; | 5661 | unsigned long pfn, iter, found; |
5581 | int mt; | 5662 | int mt; |
@@ -5610,6 +5691,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |||
5610 | continue; | 5691 | continue; |
5611 | } | 5692 | } |
5612 | 5693 | ||
5694 | /* | ||
5695 | * The HWPoisoned page may be not in buddy system, and | ||
5696 | * page_count() is not 0. | ||
5697 | */ | ||
5698 | if (skip_hwpoisoned_pages && PageHWPoison(page)) | ||
5699 | continue; | ||
5700 | |||
5613 | if (!PageLRU(page)) | 5701 | if (!PageLRU(page)) |
5614 | found++; | 5702 | found++; |
5615 | /* | 5703 | /* |
@@ -5652,7 +5740,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5652 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5740 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5653 | return false; | 5741 | return false; |
5654 | 5742 | ||
5655 | return !has_unmovable_pages(zone, page, 0); | 5743 | return !has_unmovable_pages(zone, page, 0, true); |
5656 | } | 5744 | } |
5657 | 5745 | ||
5658 | #ifdef CONFIG_CMA | 5746 | #ifdef CONFIG_CMA |
@@ -5679,7 +5767,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5679 | unsigned int tries = 0; | 5767 | unsigned int tries = 0; |
5680 | int ret = 0; | 5768 | int ret = 0; |
5681 | 5769 | ||
5682 | migrate_prep_local(); | 5770 | migrate_prep(); |
5683 | 5771 | ||
5684 | while (pfn < end || !list_empty(&cc->migratepages)) { | 5772 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5685 | if (fatal_signal_pending(current)) { | 5773 | if (fatal_signal_pending(current)) { |
@@ -5707,61 +5795,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5707 | 5795 | ||
5708 | ret = migrate_pages(&cc->migratepages, | 5796 | ret = migrate_pages(&cc->migratepages, |
5709 | alloc_migrate_target, | 5797 | alloc_migrate_target, |
5710 | 0, false, MIGRATE_SYNC); | 5798 | 0, false, MIGRATE_SYNC, |
5799 | MR_CMA); | ||
5711 | } | 5800 | } |
5712 | 5801 | ||
5713 | putback_lru_pages(&cc->migratepages); | 5802 | putback_movable_pages(&cc->migratepages); |
5714 | return ret > 0 ? 0 : ret; | 5803 | return ret > 0 ? 0 : ret; |
5715 | } | 5804 | } |
5716 | 5805 | ||
5717 | /* | ||
5718 | * Update zone's cma pages counter used for watermark level calculation. | ||
5719 | */ | ||
5720 | static inline void __update_cma_watermarks(struct zone *zone, int count) | ||
5721 | { | ||
5722 | unsigned long flags; | ||
5723 | spin_lock_irqsave(&zone->lock, flags); | ||
5724 | zone->min_cma_pages += count; | ||
5725 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5726 | setup_per_zone_wmarks(); | ||
5727 | } | ||
5728 | |||
5729 | /* | ||
5730 | * Trigger memory pressure bump to reclaim some pages in order to be able to | ||
5731 | * allocate 'count' pages in single page units. Does similar work as | ||
5732 | *__alloc_pages_slowpath() function. | ||
5733 | */ | ||
5734 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | ||
5735 | { | ||
5736 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
5737 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | ||
5738 | int did_some_progress = 0; | ||
5739 | int order = 1; | ||
5740 | |||
5741 | /* | ||
5742 | * Increase level of watermarks to force kswapd do his job | ||
5743 | * to stabilise at new watermark level. | ||
5744 | */ | ||
5745 | __update_cma_watermarks(zone, count); | ||
5746 | |||
5747 | /* Obey watermarks as if the page was being allocated */ | ||
5748 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { | ||
5749 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | ||
5750 | |||
5751 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
5752 | NULL); | ||
5753 | if (!did_some_progress) { | ||
5754 | /* Exhausted what can be done so it's blamo time */ | ||
5755 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | ||
5756 | } | ||
5757 | } | ||
5758 | |||
5759 | /* Restore original watermark levels. */ | ||
5760 | __update_cma_watermarks(zone, -count); | ||
5761 | |||
5762 | return count; | ||
5763 | } | ||
5764 | |||
5765 | /** | 5806 | /** |
5766 | * alloc_contig_range() -- tries to allocate given range of pages | 5807 | * alloc_contig_range() -- tries to allocate given range of pages |
5767 | * @start: start PFN to allocate | 5808 | * @start: start PFN to allocate |
@@ -5785,7 +5826,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | |||
5785 | int alloc_contig_range(unsigned long start, unsigned long end, | 5826 | int alloc_contig_range(unsigned long start, unsigned long end, |
5786 | unsigned migratetype) | 5827 | unsigned migratetype) |
5787 | { | 5828 | { |
5788 | struct zone *zone = page_zone(pfn_to_page(start)); | ||
5789 | unsigned long outer_start, outer_end; | 5829 | unsigned long outer_start, outer_end; |
5790 | int ret = 0, order; | 5830 | int ret = 0, order; |
5791 | 5831 | ||
@@ -5823,7 +5863,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5823 | */ | 5863 | */ |
5824 | 5864 | ||
5825 | ret = start_isolate_page_range(pfn_max_align_down(start), | 5865 | ret = start_isolate_page_range(pfn_max_align_down(start), |
5826 | pfn_max_align_up(end), migratetype); | 5866 | pfn_max_align_up(end), migratetype, |
5867 | false); | ||
5827 | if (ret) | 5868 | if (ret) |
5828 | return ret; | 5869 | return ret; |
5829 | 5870 | ||
@@ -5862,18 +5903,13 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5862 | } | 5903 | } |
5863 | 5904 | ||
5864 | /* Make sure the range is really isolated. */ | 5905 | /* Make sure the range is really isolated. */ |
5865 | if (test_pages_isolated(outer_start, end)) { | 5906 | if (test_pages_isolated(outer_start, end, false)) { |
5866 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", | 5907 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", |
5867 | outer_start, end); | 5908 | outer_start, end); |
5868 | ret = -EBUSY; | 5909 | ret = -EBUSY; |
5869 | goto done; | 5910 | goto done; |
5870 | } | 5911 | } |
5871 | 5912 | ||
5872 | /* | ||
5873 | * Reclaim enough pages to make sure that contiguous allocation | ||
5874 | * will not starve the system. | ||
5875 | */ | ||
5876 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | ||
5877 | 5913 | ||
5878 | /* Grab isolated pages from freelists. */ | 5914 | /* Grab isolated pages from freelists. */ |
5879 | outer_end = isolate_freepages_range(&cc, outer_start, end); | 5915 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
@@ -5896,8 +5932,15 @@ done: | |||
5896 | 5932 | ||
5897 | void free_contig_range(unsigned long pfn, unsigned nr_pages) | 5933 | void free_contig_range(unsigned long pfn, unsigned nr_pages) |
5898 | { | 5934 | { |
5899 | for (; nr_pages--; ++pfn) | 5935 | unsigned int count = 0; |
5900 | __free_page(pfn_to_page(pfn)); | 5936 | |
5937 | for (; nr_pages--; pfn++) { | ||
5938 | struct page *page = pfn_to_page(pfn); | ||
5939 | |||
5940 | count += page_count(page) != 1; | ||
5941 | __free_page(page); | ||
5942 | } | ||
5943 | WARN(count != 0, "%d pages are still in use!\n", count); | ||
5901 | } | 5944 | } |
5902 | #endif | 5945 | #endif |
5903 | 5946 | ||
@@ -5931,7 +5974,6 @@ void __meminit zone_pcp_update(struct zone *zone) | |||
5931 | } | 5974 | } |
5932 | #endif | 5975 | #endif |
5933 | 5976 | ||
5934 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
5935 | void zone_pcp_reset(struct zone *zone) | 5977 | void zone_pcp_reset(struct zone *zone) |
5936 | { | 5978 | { |
5937 | unsigned long flags; | 5979 | unsigned long flags; |
@@ -5951,6 +5993,7 @@ void zone_pcp_reset(struct zone *zone) | |||
5951 | local_irq_restore(flags); | 5993 | local_irq_restore(flags); |
5952 | } | 5994 | } |
5953 | 5995 | ||
5996 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
5954 | /* | 5997 | /* |
5955 | * All pages in the range must be isolated before calling this. | 5998 | * All pages in the range must be isolated before calling this. |
5956 | */ | 5999 | */ |
@@ -5977,6 +6020,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5977 | continue; | 6020 | continue; |
5978 | } | 6021 | } |
5979 | page = pfn_to_page(pfn); | 6022 | page = pfn_to_page(pfn); |
6023 | /* | ||
6024 | * The HWPoisoned page may be not in buddy system, and | ||
6025 | * page_count() is not 0. | ||
6026 | */ | ||
6027 | if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { | ||
6028 | pfn++; | ||
6029 | SetPageReserved(page); | ||
6030 | continue; | ||
6031 | } | ||
6032 | |||
5980 | BUG_ON(page_count(page)); | 6033 | BUG_ON(page_count(page)); |
5981 | BUG_ON(!PageBuddy(page)); | 6034 | BUG_ON(!PageBuddy(page)); |
5982 | order = page_order(page); | 6035 | order = page_order(page); |
@@ -5987,8 +6040,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5987 | list_del(&page->lru); | 6040 | list_del(&page->lru); |
5988 | rmv_page_order(page); | 6041 | rmv_page_order(page); |
5989 | zone->free_area[order].nr_free--; | 6042 | zone->free_area[order].nr_free--; |
5990 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
5991 | - (1UL << order)); | ||
5992 | for (i = 0; i < (1 << order); i++) | 6043 | for (i = 0; i < (1 << order); i++) |
5993 | SetPageReserved((page+i)); | 6044 | SetPageReserved((page+i)); |
5994 | pfn += (1 << order); | 6045 | pfn += (1 << order); |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5ddad0c6daa6..6d757e3a872a 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, | |||
251 | mn->nr_pages, mn->status_change_nid); | 251 | mn->nr_pages, mn->status_change_nid); |
252 | break; | 252 | break; |
253 | case MEM_CANCEL_ONLINE: | 253 | case MEM_CANCEL_ONLINE: |
254 | offline_page_cgroup(mn->start_pfn, | ||
255 | mn->nr_pages, mn->status_change_nid); | ||
256 | break; | ||
254 | case MEM_GOING_OFFLINE: | 257 | case MEM_GOING_OFFLINE: |
255 | break; | 258 | break; |
256 | case MEM_ONLINE: | 259 | case MEM_ONLINE: |
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void) | |||
271 | if (mem_cgroup_disabled()) | 274 | if (mem_cgroup_disabled()) |
272 | return; | 275 | return; |
273 | 276 | ||
274 | for_each_node_state(nid, N_HIGH_MEMORY) { | 277 | for_each_node_state(nid, N_MEMORY) { |
275 | unsigned long start_pfn, end_pfn; | 278 | unsigned long start_pfn, end_pfn; |
276 | 279 | ||
277 | start_pfn = node_start_pfn(nid); | 280 | start_pfn = node_start_pfn(nid); |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f2f5b4818e94..383bdbb98b04 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -8,29 +8,7 @@ | |||
8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
9 | #include "internal.h" | 9 | #include "internal.h" |
10 | 10 | ||
11 | /* called while holding zone->lock */ | 11 | int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) |
12 | static void set_pageblock_isolate(struct page *page) | ||
13 | { | ||
14 | if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) | ||
15 | return; | ||
16 | |||
17 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
18 | page_zone(page)->nr_pageblock_isolate++; | ||
19 | } | ||
20 | |||
21 | /* called while holding zone->lock */ | ||
22 | static void restore_pageblock_isolate(struct page *page, int migratetype) | ||
23 | { | ||
24 | struct zone *zone = page_zone(page); | ||
25 | if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) | ||
26 | return; | ||
27 | |||
28 | BUG_ON(zone->nr_pageblock_isolate <= 0); | ||
29 | set_pageblock_migratetype(page, migratetype); | ||
30 | zone->nr_pageblock_isolate--; | ||
31 | } | ||
32 | |||
33 | int set_migratetype_isolate(struct page *page) | ||
34 | { | 12 | { |
35 | struct zone *zone; | 13 | struct zone *zone; |
36 | unsigned long flags, pfn; | 14 | unsigned long flags, pfn; |
@@ -66,7 +44,8 @@ int set_migratetype_isolate(struct page *page) | |||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | 44 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. |
67 | * We just check MOVABLE pages. | 45 | * We just check MOVABLE pages. |
68 | */ | 46 | */ |
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | 47 | if (!has_unmovable_pages(zone, page, arg.pages_found, |
48 | skip_hwpoisoned_pages)) | ||
70 | ret = 0; | 49 | ret = 0; |
71 | 50 | ||
72 | /* | 51 | /* |
@@ -79,7 +58,7 @@ out: | |||
79 | unsigned long nr_pages; | 58 | unsigned long nr_pages; |
80 | int migratetype = get_pageblock_migratetype(page); | 59 | int migratetype = get_pageblock_migratetype(page); |
81 | 60 | ||
82 | set_pageblock_isolate(page); | 61 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
83 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); | 62 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); |
84 | 63 | ||
85 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); | 64 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); |
@@ -102,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) | |||
102 | goto out; | 81 | goto out; |
103 | nr_pages = move_freepages_block(zone, page, migratetype); | 82 | nr_pages = move_freepages_block(zone, page, migratetype); |
104 | __mod_zone_freepage_state(zone, nr_pages, migratetype); | 83 | __mod_zone_freepage_state(zone, nr_pages, migratetype); |
105 | restore_pageblock_isolate(page, migratetype); | 84 | set_pageblock_migratetype(page, migratetype); |
106 | out: | 85 | out: |
107 | spin_unlock_irqrestore(&zone->lock, flags); | 86 | spin_unlock_irqrestore(&zone->lock, flags); |
108 | } | 87 | } |
@@ -134,7 +113,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
134 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. | 113 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. |
135 | */ | 114 | */ |
136 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | 115 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
137 | unsigned migratetype) | 116 | unsigned migratetype, bool skip_hwpoisoned_pages) |
138 | { | 117 | { |
139 | unsigned long pfn; | 118 | unsigned long pfn; |
140 | unsigned long undo_pfn; | 119 | unsigned long undo_pfn; |
@@ -147,7 +126,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
147 | pfn < end_pfn; | 126 | pfn < end_pfn; |
148 | pfn += pageblock_nr_pages) { | 127 | pfn += pageblock_nr_pages) { |
149 | page = __first_valid_page(pfn, pageblock_nr_pages); | 128 | page = __first_valid_page(pfn, pageblock_nr_pages); |
150 | if (page && set_migratetype_isolate(page)) { | 129 | if (page && |
130 | set_migratetype_isolate(page, skip_hwpoisoned_pages)) { | ||
151 | undo_pfn = pfn; | 131 | undo_pfn = pfn; |
152 | goto undo; | 132 | goto undo; |
153 | } | 133 | } |
@@ -190,7 +170,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
190 | * Returns 1 if all pages in the range are isolated. | 170 | * Returns 1 if all pages in the range are isolated. |
191 | */ | 171 | */ |
192 | static int | 172 | static int |
193 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | 173 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, |
174 | bool skip_hwpoisoned_pages) | ||
194 | { | 175 | { |
195 | struct page *page; | 176 | struct page *page; |
196 | 177 | ||
@@ -220,6 +201,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
220 | else if (page_count(page) == 0 && | 201 | else if (page_count(page) == 0 && |
221 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) | 202 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) |
222 | pfn += 1; | 203 | pfn += 1; |
204 | else if (skip_hwpoisoned_pages && PageHWPoison(page)) { | ||
205 | /* | ||
206 | * The HWPoisoned page may be not in buddy | ||
207 | * system, and page_count() is not 0. | ||
208 | */ | ||
209 | pfn++; | ||
210 | continue; | ||
211 | } | ||
223 | else | 212 | else |
224 | break; | 213 | break; |
225 | } | 214 | } |
@@ -228,7 +217,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
228 | return 1; | 217 | return 1; |
229 | } | 218 | } |
230 | 219 | ||
231 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | 220 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, |
221 | bool skip_hwpoisoned_pages) | ||
232 | { | 222 | { |
233 | unsigned long pfn, flags; | 223 | unsigned long pfn, flags; |
234 | struct page *page; | 224 | struct page *page; |
@@ -251,7 +241,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
251 | /* Check all pages are free or Marked as ISOLATED */ | 241 | /* Check all pages are free or Marked as ISOLATED */ |
252 | zone = page_zone(page); | 242 | zone = page_zone(page); |
253 | spin_lock_irqsave(&zone->lock, flags); | 243 | spin_lock_irqsave(&zone->lock, flags); |
254 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); | 244 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, |
245 | skip_hwpoisoned_pages); | ||
255 | spin_unlock_irqrestore(&zone->lock, flags); | 246 | spin_unlock_irqrestore(&zone->lock, flags); |
256 | return ret ? 0 : -EBUSY; | 247 | return ret ? 0 : -EBUSY; |
257 | } | 248 | } |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 6c118d012bb5..35aa294656cd 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -58,7 +58,7 @@ again: | |||
58 | if (!walk->pte_entry) | 58 | if (!walk->pte_entry) |
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
diff --git a/mm/percpu.c b/mm/percpu.c index ddc5efb9c5bb..8c8e08f3a692 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) | |||
631 | if (!chunk) | 631 | if (!chunk) |
632 | return; | 632 | return; |
633 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); | 633 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); |
634 | kfree(chunk); | 634 | pcpu_mem_free(chunk, pcpu_chunk_struct_size); |
635 | } | 635 | } |
636 | 636 | ||
637 | /* | 637 | /* |
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; | |||
1380 | 1380 | ||
1381 | static int __init percpu_alloc_setup(char *str) | 1381 | static int __init percpu_alloc_setup(char *str) |
1382 | { | 1382 | { |
1383 | if (!str) | ||
1384 | return -EINVAL; | ||
1385 | |||
1383 | if (0) | 1386 | if (0) |
1384 | /* nada */; | 1387 | /* nada */; |
1385 | #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK | 1388 | #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b7..0c8323fe6c8f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -12,8 +12,8 @@ | |||
12 | 12 | ||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | 13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS |
14 | /* | 14 | /* |
15 | * Only sets the access flags (dirty, accessed, and | 15 | * Only sets the access flags (dirty, accessed), as well as write |
16 | * writable). Furthermore, we know it always gets set to a "more | 16 | * permission. Furthermore, we know it always gets set to a "more |
17 | * permissive" setting, which allows most architectures to optimize | 17 | * permissive" setting, which allows most architectures to optimize |
18 | * this. We return whether the PTE actually changed, which in turn | 18 | * this. We return whether the PTE actually changed, which in turn |
19 | * instructs the caller to do things like update__mmu_cache. This | 19 | * instructs the caller to do things like update__mmu_cache. This |
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
27 | int changed = !pte_same(*ptep, entry); | 27 | int changed = !pte_same(*ptep, entry); |
28 | if (changed) { | 28 | if (changed) { |
29 | set_pte_at(vma->vm_mm, address, ptep, entry); | 29 | set_pte_at(vma->vm_mm, address, ptep, entry); |
30 | flush_tlb_page(vma, address); | 30 | flush_tlb_fix_spurious_fault(vma, address); |
31 | } | 31 | } |
32 | return changed; | 32 | return changed; |
33 | } | 33 | } |
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
88 | { | 88 | { |
89 | pte_t pte; | 89 | pte_t pte; |
90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | 90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); |
91 | flush_tlb_page(vma, address); | 91 | if (pte_accessible(pte)) |
92 | flush_tlb_page(vma, address); | ||
92 | return pte; | 93 | return pte; |
93 | } | 94 | } |
94 | #endif | 95 | #endif |
@@ -24,7 +24,7 @@ | |||
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
27 | * anon_vma->mutex | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * swap_lock (in swap_duplicate, swap_info_get) |
@@ -37,7 +37,7 @@ | |||
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within bdi.wb->list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
42 | * pte map lock | 42 | * pte map lock |
43 | */ | 43 | */ |
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Synchronize against page_lock_anon_vma() such that | 90 | * Synchronize against page_lock_anon_vma_read() such that |
91 | * we can safely hold the lock without the anon_vma getting | 91 | * we can safely hold the lock without the anon_vma getting |
92 | * freed. | 92 | * freed. |
93 | * | 93 | * |
94 | * Relies on the full mb implied by the atomic_dec_and_test() from | 94 | * Relies on the full mb implied by the atomic_dec_and_test() from |
95 | * put_anon_vma() against the acquire barrier implied by | 95 | * put_anon_vma() against the acquire barrier implied by |
96 | * mutex_trylock() from page_lock_anon_vma(). This orders: | 96 | * down_read_trylock() from page_lock_anon_vma_read(). This orders: |
97 | * | 97 | * |
98 | * page_lock_anon_vma() VS put_anon_vma() | 98 | * page_lock_anon_vma_read() VS put_anon_vma() |
99 | * mutex_trylock() atomic_dec_and_test() | 99 | * down_read_trylock() atomic_dec_and_test() |
100 | * LOCK MB | 100 | * LOCK MB |
101 | * atomic_read() mutex_is_locked() | 101 | * atomic_read() rwsem_is_locked() |
102 | * | 102 | * |
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | if (mutex_is_locked(&anon_vma->root->mutex)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
146 | * allocate a new one. | 146 | * allocate a new one. |
147 | * | 147 | * |
148 | * Anon-vma allocations are very subtle, because we may have | 148 | * Anon-vma allocations are very subtle, because we may have |
149 | * optimistically looked up an anon_vma in page_lock_anon_vma() | 149 | * optimistically looked up an anon_vma in page_lock_anon_vma_read() |
150 | * and that may actually touch the spinlock even in the newly | 150 | * and that may actually touch the spinlock even in the newly |
151 | * allocated vma (it depends on RCU to make sure that the | 151 | * allocated vma (it depends on RCU to make sure that the |
152 | * anon_vma isn't actually destroyed). | 152 | * anon_vma isn't actually destroyed). |
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
181 | allocated = anon_vma; | 181 | allocated = anon_vma; |
182 | } | 182 | } |
183 | 183 | ||
184 | anon_vma_lock(anon_vma); | 184 | anon_vma_lock_write(anon_vma); |
185 | /* page_table_lock to protect against threads */ | 185 | /* page_table_lock to protect against threads */ |
186 | spin_lock(&mm->page_table_lock); | 186 | spin_lock(&mm->page_table_lock); |
187 | if (likely(!vma->anon_vma)) { | 187 | if (likely(!vma->anon_vma)) { |
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
219 | struct anon_vma *new_root = anon_vma->root; | 219 | struct anon_vma *new_root = anon_vma->root; |
220 | if (new_root != root) { | 220 | if (new_root != root) { |
221 | if (WARN_ON_ONCE(root)) | 221 | if (WARN_ON_ONCE(root)) |
222 | mutex_unlock(&root->mutex); | 222 | up_write(&root->rwsem); |
223 | root = new_root; | 223 | root = new_root; |
224 | mutex_lock(&root->mutex); | 224 | down_write(&root->rwsem); |
225 | } | 225 | } |
226 | return root; | 226 | return root; |
227 | } | 227 | } |
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
229 | static inline void unlock_anon_vma_root(struct anon_vma *root) | 229 | static inline void unlock_anon_vma_root(struct anon_vma *root) |
230 | { | 230 | { |
231 | if (root) | 231 | if (root) |
232 | mutex_unlock(&root->mutex); | 232 | up_write(&root->rwsem); |
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
306 | get_anon_vma(anon_vma->root); | 306 | get_anon_vma(anon_vma->root); |
307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock(anon_vma); |
312 | 312 | ||
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
349 | /* | 349 | /* |
350 | * Iterate the list once more, it now only contains empty and unlinked | 350 | * Iterate the list once more, it now only contains empty and unlinked |
351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | 351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() |
352 | * needing to acquire the anon_vma->root->mutex. | 352 | * needing to write-acquire the anon_vma->root->rwsem. |
353 | */ | 353 | */ |
354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
355 | struct anon_vma *anon_vma = avc->anon_vma; | 355 | struct anon_vma *anon_vma = avc->anon_vma; |
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) | |||
365 | { | 365 | { |
366 | struct anon_vma *anon_vma = data; | 366 | struct anon_vma *anon_vma = data; |
367 | 367 | ||
368 | mutex_init(&anon_vma->mutex); | 368 | init_rwsem(&anon_vma->rwsem); |
369 | atomic_set(&anon_vma->refcount, 0); | 369 | atomic_set(&anon_vma->refcount, 0); |
370 | anon_vma->rb_root = RB_ROOT; | 370 | anon_vma->rb_root = RB_ROOT; |
371 | } | 371 | } |
@@ -442,7 +442,7 @@ out: | |||
442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | 442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
443 | * reference like with page_get_anon_vma() and then block on the mutex. | 443 | * reference like with page_get_anon_vma() and then block on the mutex. |
444 | */ | 444 | */ |
445 | struct anon_vma *page_lock_anon_vma(struct page *page) | 445 | struct anon_vma *page_lock_anon_vma_read(struct page *page) |
446 | { | 446 | { |
447 | struct anon_vma *anon_vma = NULL; | 447 | struct anon_vma *anon_vma = NULL; |
448 | struct anon_vma *root_anon_vma; | 448 | struct anon_vma *root_anon_vma; |
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
457 | 457 | ||
458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); |
460 | if (mutex_trylock(&root_anon_vma->mutex)) { | 460 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
461 | /* | 461 | /* |
462 | * If the page is still mapped, then this anon_vma is still | 462 | * If the page is still mapped, then this anon_vma is still |
463 | * its anon_vma, and holding the mutex ensures that it will | 463 | * its anon_vma, and holding the mutex ensures that it will |
464 | * not go away, see anon_vma_free(). | 464 | * not go away, see anon_vma_free(). |
465 | */ | 465 | */ |
466 | if (!page_mapped(page)) { | 466 | if (!page_mapped(page)) { |
467 | mutex_unlock(&root_anon_vma->mutex); | 467 | up_read(&root_anon_vma->rwsem); |
468 | anon_vma = NULL; | 468 | anon_vma = NULL; |
469 | } | 469 | } |
470 | goto out; | 470 | goto out; |
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
484 | 484 | ||
485 | /* we pinned the anon_vma, its safe to sleep */ | 485 | /* we pinned the anon_vma, its safe to sleep */ |
486 | rcu_read_unlock(); | 486 | rcu_read_unlock(); |
487 | anon_vma_lock(anon_vma); | 487 | anon_vma_lock_read(anon_vma); |
488 | 488 | ||
489 | if (atomic_dec_and_test(&anon_vma->refcount)) { | 489 | if (atomic_dec_and_test(&anon_vma->refcount)) { |
490 | /* | 490 | /* |
491 | * Oops, we held the last refcount, release the lock | 491 | * Oops, we held the last refcount, release the lock |
492 | * and bail -- can't simply use put_anon_vma() because | 492 | * and bail -- can't simply use put_anon_vma() because |
493 | * we'll deadlock on the anon_vma_lock() recursion. | 493 | * we'll deadlock on the anon_vma_lock_write() recursion. |
494 | */ | 494 | */ |
495 | anon_vma_unlock(anon_vma); | 495 | anon_vma_unlock_read(anon_vma); |
496 | __put_anon_vma(anon_vma); | 496 | __put_anon_vma(anon_vma); |
497 | anon_vma = NULL; | 497 | anon_vma = NULL; |
498 | } | 498 | } |
@@ -504,9 +504,9 @@ out: | |||
504 | return anon_vma; | 504 | return anon_vma; |
505 | } | 505 | } |
506 | 506 | ||
507 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 507 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma) |
508 | { | 508 | { |
509 | anon_vma_unlock(anon_vma); | 509 | anon_vma_unlock_read(anon_vma); |
510 | } | 510 | } |
511 | 511 | ||
512 | /* | 512 | /* |
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
562 | return address; | 562 | return address; |
563 | } | 563 | } |
564 | 564 | ||
565 | pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) | ||
566 | { | ||
567 | pgd_t *pgd; | ||
568 | pud_t *pud; | ||
569 | pmd_t *pmd = NULL; | ||
570 | |||
571 | pgd = pgd_offset(mm, address); | ||
572 | if (!pgd_present(*pgd)) | ||
573 | goto out; | ||
574 | |||
575 | pud = pud_offset(pgd, address); | ||
576 | if (!pud_present(*pud)) | ||
577 | goto out; | ||
578 | |||
579 | pmd = pmd_offset(pud, address); | ||
580 | if (!pmd_present(*pmd)) | ||
581 | pmd = NULL; | ||
582 | out: | ||
583 | return pmd; | ||
584 | } | ||
585 | |||
565 | /* | 586 | /* |
566 | * Check that @page is mapped at @address into @mm. | 587 | * Check that @page is mapped at @address into @mm. |
567 | * | 588 | * |
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
574 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | 595 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, |
575 | unsigned long address, spinlock_t **ptlp, int sync) | 596 | unsigned long address, spinlock_t **ptlp, int sync) |
576 | { | 597 | { |
577 | pgd_t *pgd; | ||
578 | pud_t *pud; | ||
579 | pmd_t *pmd; | 598 | pmd_t *pmd; |
580 | pte_t *pte; | 599 | pte_t *pte; |
581 | spinlock_t *ptl; | 600 | spinlock_t *ptl; |
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
586 | goto check; | 605 | goto check; |
587 | } | 606 | } |
588 | 607 | ||
589 | pgd = pgd_offset(mm, address); | 608 | pmd = mm_find_pmd(mm, address); |
590 | if (!pgd_present(*pgd)) | 609 | if (!pmd) |
591 | return NULL; | ||
592 | |||
593 | pud = pud_offset(pgd, address); | ||
594 | if (!pud_present(*pud)) | ||
595 | return NULL; | 610 | return NULL; |
596 | 611 | ||
597 | pmd = pmd_offset(pud, address); | ||
598 | if (!pmd_present(*pmd)) | ||
599 | return NULL; | ||
600 | if (pmd_trans_huge(*pmd)) | 612 | if (pmd_trans_huge(*pmd)) |
601 | return NULL; | 613 | return NULL; |
602 | 614 | ||
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page, | |||
732 | struct anon_vma_chain *avc; | 744 | struct anon_vma_chain *avc; |
733 | int referenced = 0; | 745 | int referenced = 0; |
734 | 746 | ||
735 | anon_vma = page_lock_anon_vma(page); | 747 | anon_vma = page_lock_anon_vma_read(page); |
736 | if (!anon_vma) | 748 | if (!anon_vma) |
737 | return referenced; | 749 | return referenced; |
738 | 750 | ||
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page, | |||
754 | break; | 766 | break; |
755 | } | 767 | } |
756 | 768 | ||
757 | page_unlock_anon_vma(anon_vma); | 769 | page_unlock_anon_vma_read(anon_vma); |
758 | return referenced; | 770 | return referenced; |
759 | } | 771 | } |
760 | 772 | ||
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page) | |||
1139 | * containing the swap entry, but page not yet written to swap. | 1151 | * containing the swap entry, but page not yet written to swap. |
1140 | * | 1152 | * |
1141 | * And we can skip it on file pages, so long as the filesystem | 1153 | * And we can skip it on file pages, so long as the filesystem |
1142 | * participates in dirty tracking; but need to catch shm and tmpfs | 1154 | * participates in dirty tracking (note that this is not only an |
1143 | * and ramfs pages which have been modified since creation by read | 1155 | * optimization but also solves problems caused by dirty flag in |
1144 | * fault. | 1156 | * storage key getting set by a write from inside kernel); but need to |
1157 | * catch shm and tmpfs and ramfs pages which have been modified since | ||
1158 | * creation by read fault. | ||
1145 | * | 1159 | * |
1146 | * Note that mapping must be decided above, before decrementing | 1160 | * Note that mapping must be decided above, before decrementing |
1147 | * mapcount (which luckily provides a barrier): once page is unmapped, | 1161 | * mapcount (which luckily provides a barrier): once page is unmapped, |
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1235 | update_hiwater_rss(mm); | 1249 | update_hiwater_rss(mm); |
1236 | 1250 | ||
1237 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 1251 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
1238 | if (PageAnon(page)) | 1252 | if (!PageHuge(page)) { |
1239 | dec_mm_counter(mm, MM_ANONPAGES); | 1253 | if (PageAnon(page)) |
1240 | else | 1254 | dec_mm_counter(mm, MM_ANONPAGES); |
1241 | dec_mm_counter(mm, MM_FILEPAGES); | 1255 | else |
1256 | dec_mm_counter(mm, MM_FILEPAGES); | ||
1257 | } | ||
1242 | set_pte_at(mm, address, pte, | 1258 | set_pte_at(mm, address, pte, |
1243 | swp_entry_to_pte(make_hwpoison_entry(page))); | 1259 | swp_entry_to_pte(make_hwpoison_entry(page))); |
1244 | } else if (PageAnon(page)) { | 1260 | } else if (PageAnon(page)) { |
1245 | swp_entry_t entry = { .val = page_private(page) }; | 1261 | swp_entry_t entry = { .val = page_private(page) }; |
1246 | 1262 | ||
@@ -1299,7 +1315,7 @@ out_mlock: | |||
1299 | /* | 1315 | /* |
1300 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | 1316 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes |
1301 | * unstable result and race. Plus, We can't wait here because | 1317 | * unstable result and race. Plus, We can't wait here because |
1302 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. | 1318 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. |
1303 | * if trylock failed, the page remain in evictable lru and later | 1319 | * if trylock failed, the page remain in evictable lru and later |
1304 | * vmscan could retry to move the page to unevictable lru if the | 1320 | * vmscan could retry to move the page to unevictable lru if the |
1305 | * page is actually mlocked. | 1321 | * page is actually mlocked. |
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1345 | struct vm_area_struct *vma, struct page *check_page) | 1361 | struct vm_area_struct *vma, struct page *check_page) |
1346 | { | 1362 | { |
1347 | struct mm_struct *mm = vma->vm_mm; | 1363 | struct mm_struct *mm = vma->vm_mm; |
1348 | pgd_t *pgd; | ||
1349 | pud_t *pud; | ||
1350 | pmd_t *pmd; | 1364 | pmd_t *pmd; |
1351 | pte_t *pte; | 1365 | pte_t *pte; |
1352 | pte_t pteval; | 1366 | pte_t pteval; |
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1366 | if (end > vma->vm_end) | 1380 | if (end > vma->vm_end) |
1367 | end = vma->vm_end; | 1381 | end = vma->vm_end; |
1368 | 1382 | ||
1369 | pgd = pgd_offset(mm, address); | 1383 | pmd = mm_find_pmd(mm, address); |
1370 | if (!pgd_present(*pgd)) | 1384 | if (!pmd) |
1371 | return ret; | ||
1372 | |||
1373 | pud = pud_offset(pgd, address); | ||
1374 | if (!pud_present(*pud)) | ||
1375 | return ret; | ||
1376 | |||
1377 | pmd = pmd_offset(pud, address); | ||
1378 | if (!pmd_present(*pmd)) | ||
1379 | return ret; | 1385 | return ret; |
1380 | 1386 | ||
1381 | mmun_start = address; | 1387 | mmun_start = address; |
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1474 | struct anon_vma_chain *avc; | 1480 | struct anon_vma_chain *avc; |
1475 | int ret = SWAP_AGAIN; | 1481 | int ret = SWAP_AGAIN; |
1476 | 1482 | ||
1477 | anon_vma = page_lock_anon_vma(page); | 1483 | anon_vma = page_lock_anon_vma_read(page); |
1478 | if (!anon_vma) | 1484 | if (!anon_vma) |
1479 | return ret; | 1485 | return ret; |
1480 | 1486 | ||
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1501 | break; | 1507 | break; |
1502 | } | 1508 | } |
1503 | 1509 | ||
1504 | page_unlock_anon_vma(anon_vma); | 1510 | page_unlock_anon_vma_read(anon_vma); |
1505 | return ret; | 1511 | return ret; |
1506 | } | 1512 | } |
1507 | 1513 | ||
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1696 | int ret = SWAP_AGAIN; | 1702 | int ret = SWAP_AGAIN; |
1697 | 1703 | ||
1698 | /* | 1704 | /* |
1699 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | 1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
1700 | * because that depends on page_mapped(); but not all its usages | 1706 | * because that depends on page_mapped(); but not all its usages |
1701 | * are holding mmap_sem. Users without mmap_sem are required to | 1707 | * are holding mmap_sem. Users without mmap_sem are required to |
1702 | * take a reference count to prevent the anon_vma disappearing | 1708 | * take a reference count to prevent the anon_vma disappearing |
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1704 | anon_vma = page_anon_vma(page); | 1710 | anon_vma = page_anon_vma(page); |
1705 | if (!anon_vma) | 1711 | if (!anon_vma) |
1706 | return ret; | 1712 | return ret; |
1707 | anon_vma_lock(anon_vma); | 1713 | anon_vma_lock_read(anon_vma); |
1708 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1709 | struct vm_area_struct *vma = avc->vma; | 1715 | struct vm_area_struct *vma = avc->vma; |
1710 | unsigned long address = vma_address(page, vma); | 1716 | unsigned long address = vma_address(page, vma); |
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1712 | if (ret != SWAP_AGAIN) | 1718 | if (ret != SWAP_AGAIN) |
1713 | break; | 1719 | break; |
1714 | } | 1720 | } |
1715 | anon_vma_unlock(anon_vma); | 1721 | anon_vma_unlock_read(anon_vma); |
1716 | return ret; | 1722 | return ret; |
1717 | } | 1723 | } |
1718 | 1724 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 89341b658bd0..5dd56f6efdbd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) | |||
889 | if (!mpol || mpol->mode == MPOL_DEFAULT) | 889 | if (!mpol || mpol->mode == MPOL_DEFAULT) |
890 | return; /* show nothing */ | 890 | return; /* show nothing */ |
891 | 891 | ||
892 | mpol_to_str(buffer, sizeof(buffer), mpol, 1); | 892 | mpol_to_str(buffer, sizeof(buffer), mpol); |
893 | 893 | ||
894 | seq_printf(seq, ",mpol=%s", buffer); | 894 | seq_printf(seq, ",mpol=%s", buffer); |
895 | } | 895 | } |
@@ -910,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
910 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, | 910 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
911 | struct shmem_inode_info *info, pgoff_t index) | 911 | struct shmem_inode_info *info, pgoff_t index) |
912 | { | 912 | { |
913 | struct mempolicy mpol, *spol; | ||
914 | struct vm_area_struct pvma; | 913 | struct vm_area_struct pvma; |
915 | 914 | struct page *page; | |
916 | spol = mpol_cond_copy(&mpol, | ||
917 | mpol_shared_policy_lookup(&info->policy, index)); | ||
918 | 915 | ||
919 | /* Create a pseudo vma that just contains the policy */ | 916 | /* Create a pseudo vma that just contains the policy */ |
920 | pvma.vm_start = 0; | 917 | pvma.vm_start = 0; |
921 | /* Bias interleave by inode number to distribute better across nodes */ | 918 | /* Bias interleave by inode number to distribute better across nodes */ |
922 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | 919 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; |
923 | pvma.vm_ops = NULL; | 920 | pvma.vm_ops = NULL; |
924 | pvma.vm_policy = spol; | 921 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
925 | return swapin_readahead(swap, gfp, &pvma, 0); | 922 | |
923 | page = swapin_readahead(swap, gfp, &pvma, 0); | ||
924 | |||
925 | /* Drop reference taken by mpol_shared_policy_lookup() */ | ||
926 | mpol_cond_put(pvma.vm_policy); | ||
927 | |||
928 | return page; | ||
926 | } | 929 | } |
927 | 930 | ||
928 | static struct page *shmem_alloc_page(gfp_t gfp, | 931 | static struct page *shmem_alloc_page(gfp_t gfp, |
929 | struct shmem_inode_info *info, pgoff_t index) | 932 | struct shmem_inode_info *info, pgoff_t index) |
930 | { | 933 | { |
931 | struct vm_area_struct pvma; | 934 | struct vm_area_struct pvma; |
935 | struct page *page; | ||
932 | 936 | ||
933 | /* Create a pseudo vma that just contains the policy */ | 937 | /* Create a pseudo vma that just contains the policy */ |
934 | pvma.vm_start = 0; | 938 | pvma.vm_start = 0; |
@@ -937,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
937 | pvma.vm_ops = NULL; | 941 | pvma.vm_ops = NULL; |
938 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | 942 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
939 | 943 | ||
940 | /* | 944 | page = alloc_page_vma(gfp, &pvma, 0); |
941 | * alloc_page_vma() will drop the shared policy reference | 945 | |
942 | */ | 946 | /* Drop reference taken by mpol_shared_policy_lookup() */ |
943 | return alloc_page_vma(gfp, &pvma, 0); | 947 | mpol_cond_put(pvma.vm_policy); |
948 | |||
949 | return page; | ||
944 | } | 950 | } |
945 | #else /* !CONFIG_NUMA */ | 951 | #else /* !CONFIG_NUMA */ |
946 | #ifdef CONFIG_TMPFS | 952 | #ifdef CONFIG_TMPFS |
@@ -1709,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1709 | return error; | 1715 | return error; |
1710 | } | 1716 | } |
1711 | 1717 | ||
1718 | /* | ||
1719 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
1720 | */ | ||
1721 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
1722 | pgoff_t index, pgoff_t end, int whence) | ||
1723 | { | ||
1724 | struct page *page; | ||
1725 | struct pagevec pvec; | ||
1726 | pgoff_t indices[PAGEVEC_SIZE]; | ||
1727 | bool done = false; | ||
1728 | int i; | ||
1729 | |||
1730 | pagevec_init(&pvec, 0); | ||
1731 | pvec.nr = 1; /* start small: we may be there already */ | ||
1732 | while (!done) { | ||
1733 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
1734 | pvec.nr, pvec.pages, indices); | ||
1735 | if (!pvec.nr) { | ||
1736 | if (whence == SEEK_DATA) | ||
1737 | index = end; | ||
1738 | break; | ||
1739 | } | ||
1740 | for (i = 0; i < pvec.nr; i++, index++) { | ||
1741 | if (index < indices[i]) { | ||
1742 | if (whence == SEEK_HOLE) { | ||
1743 | done = true; | ||
1744 | break; | ||
1745 | } | ||
1746 | index = indices[i]; | ||
1747 | } | ||
1748 | page = pvec.pages[i]; | ||
1749 | if (page && !radix_tree_exceptional_entry(page)) { | ||
1750 | if (!PageUptodate(page)) | ||
1751 | page = NULL; | ||
1752 | } | ||
1753 | if (index >= end || | ||
1754 | (page && whence == SEEK_DATA) || | ||
1755 | (!page && whence == SEEK_HOLE)) { | ||
1756 | done = true; | ||
1757 | break; | ||
1758 | } | ||
1759 | } | ||
1760 | shmem_deswap_pagevec(&pvec); | ||
1761 | pagevec_release(&pvec); | ||
1762 | pvec.nr = PAGEVEC_SIZE; | ||
1763 | cond_resched(); | ||
1764 | } | ||
1765 | return index; | ||
1766 | } | ||
1767 | |||
1768 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) | ||
1769 | { | ||
1770 | struct address_space *mapping = file->f_mapping; | ||
1771 | struct inode *inode = mapping->host; | ||
1772 | pgoff_t start, end; | ||
1773 | loff_t new_offset; | ||
1774 | |||
1775 | if (whence != SEEK_DATA && whence != SEEK_HOLE) | ||
1776 | return generic_file_llseek_size(file, offset, whence, | ||
1777 | MAX_LFS_FILESIZE, i_size_read(inode)); | ||
1778 | mutex_lock(&inode->i_mutex); | ||
1779 | /* We're holding i_mutex so we can access i_size directly */ | ||
1780 | |||
1781 | if (offset < 0) | ||
1782 | offset = -EINVAL; | ||
1783 | else if (offset >= inode->i_size) | ||
1784 | offset = -ENXIO; | ||
1785 | else { | ||
1786 | start = offset >> PAGE_CACHE_SHIFT; | ||
1787 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1788 | new_offset = shmem_seek_hole_data(mapping, start, end, whence); | ||
1789 | new_offset <<= PAGE_CACHE_SHIFT; | ||
1790 | if (new_offset > offset) { | ||
1791 | if (new_offset < inode->i_size) | ||
1792 | offset = new_offset; | ||
1793 | else if (whence == SEEK_DATA) | ||
1794 | offset = -ENXIO; | ||
1795 | else | ||
1796 | offset = inode->i_size; | ||
1797 | } | ||
1798 | } | ||
1799 | |||
1800 | if (offset >= 0 && offset != file->f_pos) { | ||
1801 | file->f_pos = offset; | ||
1802 | file->f_version = 0; | ||
1803 | } | ||
1804 | mutex_unlock(&inode->i_mutex); | ||
1805 | return offset; | ||
1806 | } | ||
1807 | |||
1712 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1808 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
1713 | loff_t len) | 1809 | loff_t len) |
1714 | { | 1810 | { |
@@ -2367,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2367 | if (!gid_valid(sbinfo->gid)) | 2463 | if (!gid_valid(sbinfo->gid)) |
2368 | goto bad_val; | 2464 | goto bad_val; |
2369 | } else if (!strcmp(this_char,"mpol")) { | 2465 | } else if (!strcmp(this_char,"mpol")) { |
2370 | if (mpol_parse_str(value, &sbinfo->mpol, 1)) | 2466 | if (mpol_parse_str(value, &sbinfo->mpol)) |
2371 | goto bad_val; | 2467 | goto bad_val; |
2372 | } else { | 2468 | } else { |
2373 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", | 2469 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", |
@@ -2580,7 +2676,7 @@ static const struct address_space_operations shmem_aops = { | |||
2580 | static const struct file_operations shmem_file_operations = { | 2676 | static const struct file_operations shmem_file_operations = { |
2581 | .mmap = shmem_mmap, | 2677 | .mmap = shmem_mmap, |
2582 | #ifdef CONFIG_TMPFS | 2678 | #ifdef CONFIG_TMPFS |
2583 | .llseek = generic_file_llseek, | 2679 | .llseek = shmem_file_llseek, |
2584 | .read = do_sync_read, | 2680 | .read = do_sync_read, |
2585 | .write = do_sync_write, | 2681 | .write = do_sync_write, |
2586 | .aio_read = shmem_file_aio_read, | 2682 | .aio_read = shmem_file_aio_read, |
@@ -87,7 +87,6 @@ | |||
87 | */ | 87 | */ |
88 | 88 | ||
89 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
90 | #include "slab.h" | ||
91 | #include <linux/mm.h> | 90 | #include <linux/mm.h> |
92 | #include <linux/poison.h> | 91 | #include <linux/poison.h> |
93 | #include <linux/swap.h> | 92 | #include <linux/swap.h> |
@@ -128,6 +127,8 @@ | |||
128 | 127 | ||
129 | #include "internal.h" | 128 | #include "internal.h" |
130 | 129 | ||
130 | #include "slab.h" | ||
131 | |||
131 | /* | 132 | /* |
132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 133 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
133 | * 0 for faster, smaller code (especially in the critical paths). | 134 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -162,23 +163,6 @@ | |||
162 | */ | 163 | */ |
163 | static bool pfmemalloc_active __read_mostly; | 164 | static bool pfmemalloc_active __read_mostly; |
164 | 165 | ||
165 | /* Legal flag mask for kmem_cache_create(). */ | ||
166 | #if DEBUG | ||
167 | # define CREATE_MASK (SLAB_RED_ZONE | \ | ||
168 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ | ||
169 | SLAB_CACHE_DMA | \ | ||
170 | SLAB_STORE_USER | \ | ||
171 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | ||
172 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | ||
173 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) | ||
174 | #else | ||
175 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | ||
176 | SLAB_CACHE_DMA | \ | ||
177 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | ||
178 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | ||
179 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) | ||
180 | #endif | ||
181 | |||
182 | /* | 166 | /* |
183 | * kmem_bufctl_t: | 167 | * kmem_bufctl_t: |
184 | * | 168 | * |
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = { | |||
564 | #undef CACHE | 548 | #undef CACHE |
565 | }; | 549 | }; |
566 | 550 | ||
567 | static struct arraycache_init initarray_cache __initdata = | ||
568 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | ||
569 | static struct arraycache_init initarray_generic = | 551 | static struct arraycache_init initarray_generic = |
570 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 552 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
571 | 553 | ||
572 | /* internal cache of cache description objs */ | 554 | /* internal cache of cache description objs */ |
573 | static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; | ||
574 | static struct kmem_cache kmem_cache_boot = { | 555 | static struct kmem_cache kmem_cache_boot = { |
575 | .nodelists = kmem_cache_nodelists, | ||
576 | .batchcount = 1, | 556 | .batchcount = 1, |
577 | .limit = BOOT_CPUCACHE_ENTRIES, | 557 | .limit = BOOT_CPUCACHE_ENTRIES, |
578 | .shared = 1, | 558 | .shared = 1, |
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q) | |||
662 | } | 642 | } |
663 | } | 643 | } |
664 | 644 | ||
645 | static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) | ||
646 | { | ||
647 | struct kmem_list3 *l3; | ||
648 | l3 = cachep->nodelists[q]; | ||
649 | if (!l3) | ||
650 | return; | ||
651 | |||
652 | slab_set_lock_classes(cachep, &on_slab_l3_key, | ||
653 | &on_slab_alc_key, q); | ||
654 | } | ||
655 | |||
656 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
657 | { | ||
658 | int node; | ||
659 | |||
660 | VM_BUG_ON(OFF_SLAB(cachep)); | ||
661 | for_each_node(node) | ||
662 | on_slab_lock_classes_node(cachep, node); | ||
663 | } | ||
664 | |||
665 | static inline void init_lock_keys(void) | 665 | static inline void init_lock_keys(void) |
666 | { | 666 | { |
667 | int node; | 667 | int node; |
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void) | |||
678 | { | 678 | { |
679 | } | 679 | } |
680 | 680 | ||
681 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
682 | { | ||
683 | } | ||
684 | |||
685 | static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) | ||
686 | { | ||
687 | } | ||
688 | |||
681 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | 689 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) |
682 | { | 690 | { |
683 | } | 691 | } |
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1406 | free_alien_cache(alien); | 1414 | free_alien_cache(alien); |
1407 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | 1415 | if (cachep->flags & SLAB_DEBUG_OBJECTS) |
1408 | slab_set_debugobj_lock_classes_node(cachep, node); | 1416 | slab_set_debugobj_lock_classes_node(cachep, node); |
1417 | else if (!OFF_SLAB(cachep) && | ||
1418 | !(cachep->flags & SLAB_DESTROY_BY_RCU)) | ||
1419 | on_slab_lock_classes_node(cachep, node); | ||
1409 | } | 1420 | } |
1410 | init_node_lock_keys(node); | 1421 | init_node_lock_keys(node); |
1411 | 1422 | ||
@@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) | |||
1577 | } | 1588 | } |
1578 | 1589 | ||
1579 | /* | 1590 | /* |
1591 | * The memory after the last cpu cache pointer is used for the | ||
1592 | * the nodelists pointer. | ||
1593 | */ | ||
1594 | static void setup_nodelists_pointer(struct kmem_cache *cachep) | ||
1595 | { | ||
1596 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | ||
1597 | } | ||
1598 | |||
1599 | /* | ||
1580 | * Initialisation. Called after the page allocator have been initialised and | 1600 | * Initialisation. Called after the page allocator have been initialised and |
1581 | * before smp_init(). | 1601 | * before smp_init(). |
1582 | */ | 1602 | */ |
1583 | void __init kmem_cache_init(void) | 1603 | void __init kmem_cache_init(void) |
1584 | { | 1604 | { |
1585 | size_t left_over; | ||
1586 | struct cache_sizes *sizes; | 1605 | struct cache_sizes *sizes; |
1587 | struct cache_names *names; | 1606 | struct cache_names *names; |
1588 | int i; | 1607 | int i; |
1589 | int order; | ||
1590 | int node; | ||
1591 | 1608 | ||
1592 | kmem_cache = &kmem_cache_boot; | 1609 | kmem_cache = &kmem_cache_boot; |
1610 | setup_nodelists_pointer(kmem_cache); | ||
1593 | 1611 | ||
1594 | if (num_possible_nodes() == 1) | 1612 | if (num_possible_nodes() == 1) |
1595 | use_alien_caches = 0; | 1613 | use_alien_caches = 0; |
1596 | 1614 | ||
1597 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1615 | for (i = 0; i < NUM_INIT_LISTS; i++) |
1598 | kmem_list3_init(&initkmem_list3[i]); | 1616 | kmem_list3_init(&initkmem_list3[i]); |
1599 | if (i < MAX_NUMNODES) | 1617 | |
1600 | kmem_cache->nodelists[i] = NULL; | ||
1601 | } | ||
1602 | set_up_list3s(kmem_cache, CACHE_CACHE); | 1618 | set_up_list3s(kmem_cache, CACHE_CACHE); |
1603 | 1619 | ||
1604 | /* | 1620 | /* |
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void) | |||
1629 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. | 1645 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. |
1630 | */ | 1646 | */ |
1631 | 1647 | ||
1632 | node = numa_mem_id(); | ||
1633 | |||
1634 | /* 1) create the kmem_cache */ | 1648 | /* 1) create the kmem_cache */ |
1635 | INIT_LIST_HEAD(&slab_caches); | ||
1636 | list_add(&kmem_cache->list, &slab_caches); | ||
1637 | kmem_cache->colour_off = cache_line_size(); | ||
1638 | kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; | ||
1639 | kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | ||
1640 | 1649 | ||
1641 | /* | 1650 | /* |
1642 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1651 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1643 | */ | 1652 | */ |
1644 | kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1653 | create_boot_cache(kmem_cache, "kmem_cache", |
1645 | nr_node_ids * sizeof(struct kmem_list3 *); | 1654 | offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1646 | kmem_cache->object_size = kmem_cache->size; | 1655 | nr_node_ids * sizeof(struct kmem_list3 *), |
1647 | kmem_cache->size = ALIGN(kmem_cache->object_size, | 1656 | SLAB_HWCACHE_ALIGN); |
1648 | cache_line_size()); | 1657 | list_add(&kmem_cache->list, &slab_caches); |
1649 | kmem_cache->reciprocal_buffer_size = | ||
1650 | reciprocal_value(kmem_cache->size); | ||
1651 | |||
1652 | for (order = 0; order < MAX_ORDER; order++) { | ||
1653 | cache_estimate(order, kmem_cache->size, | ||
1654 | cache_line_size(), 0, &left_over, &kmem_cache->num); | ||
1655 | if (kmem_cache->num) | ||
1656 | break; | ||
1657 | } | ||
1658 | BUG_ON(!kmem_cache->num); | ||
1659 | kmem_cache->gfporder = order; | ||
1660 | kmem_cache->colour = left_over / kmem_cache->colour_off; | ||
1661 | kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + | ||
1662 | sizeof(struct slab), cache_line_size()); | ||
1663 | 1658 | ||
1664 | /* 2+3) create the kmalloc caches */ | 1659 | /* 2+3) create the kmalloc caches */ |
1665 | sizes = malloc_sizes; | 1660 | sizes = malloc_sizes; |
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void) | |||
1671 | * bug. | 1666 | * bug. |
1672 | */ | 1667 | */ |
1673 | 1668 | ||
1674 | sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 1669 | sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, |
1675 | sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; | 1670 | sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); |
1676 | sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; | 1671 | |
1677 | sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; | 1672 | if (INDEX_AC != INDEX_L3) |
1678 | sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; | 1673 | sizes[INDEX_L3].cs_cachep = |
1679 | __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | 1674 | create_kmalloc_cache(names[INDEX_L3].name, |
1680 | list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); | 1675 | sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); |
1681 | |||
1682 | if (INDEX_AC != INDEX_L3) { | ||
1683 | sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
1684 | sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; | ||
1685 | sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; | ||
1686 | sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; | ||
1687 | sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; | ||
1688 | __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1689 | list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); | ||
1690 | } | ||
1691 | 1676 | ||
1692 | slab_early_init = 0; | 1677 | slab_early_init = 0; |
1693 | 1678 | ||
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void) | |||
1699 | * Note for systems short on memory removing the alignment will | 1684 | * Note for systems short on memory removing the alignment will |
1700 | * allow tighter packing of the smaller caches. | 1685 | * allow tighter packing of the smaller caches. |
1701 | */ | 1686 | */ |
1702 | if (!sizes->cs_cachep) { | 1687 | if (!sizes->cs_cachep) |
1703 | sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 1688 | sizes->cs_cachep = create_kmalloc_cache(names->name, |
1704 | sizes->cs_cachep->name = names->name; | 1689 | sizes->cs_size, ARCH_KMALLOC_FLAGS); |
1705 | sizes->cs_cachep->size = sizes->cs_size; | 1690 | |
1706 | sizes->cs_cachep->object_size = sizes->cs_size; | ||
1707 | sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; | ||
1708 | __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1709 | list_add(&sizes->cs_cachep->list, &slab_caches); | ||
1710 | } | ||
1711 | #ifdef CONFIG_ZONE_DMA | 1691 | #ifdef CONFIG_ZONE_DMA |
1712 | sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 1692 | sizes->cs_dmacachep = create_kmalloc_cache( |
1713 | sizes->cs_dmacachep->name = names->name_dma; | 1693 | names->name_dma, sizes->cs_size, |
1714 | sizes->cs_dmacachep->size = sizes->cs_size; | 1694 | SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); |
1715 | sizes->cs_dmacachep->object_size = sizes->cs_size; | ||
1716 | sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; | ||
1717 | __kmem_cache_create(sizes->cs_dmacachep, | ||
1718 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); | ||
1719 | list_add(&sizes->cs_dmacachep->list, &slab_caches); | ||
1720 | #endif | 1695 | #endif |
1721 | sizes++; | 1696 | sizes++; |
1722 | names++; | 1697 | names++; |
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void) | |||
1727 | 1702 | ||
1728 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1703 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1729 | 1704 | ||
1730 | BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); | ||
1731 | memcpy(ptr, cpu_cache_get(kmem_cache), | 1705 | memcpy(ptr, cpu_cache_get(kmem_cache), |
1732 | sizeof(struct arraycache_init)); | 1706 | sizeof(struct arraycache_init)); |
1733 | /* | 1707 | /* |
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1921 | if (page->pfmemalloc) | 1895 | if (page->pfmemalloc) |
1922 | SetPageSlabPfmemalloc(page + i); | 1896 | SetPageSlabPfmemalloc(page + i); |
1923 | } | 1897 | } |
1898 | memcg_bind_pages(cachep, cachep->gfporder); | ||
1924 | 1899 | ||
1925 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1900 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1926 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1901 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1957 | __ClearPageSlab(page); | 1932 | __ClearPageSlab(page); |
1958 | page++; | 1933 | page++; |
1959 | } | 1934 | } |
1935 | |||
1936 | memcg_release_pages(cachep, cachep->gfporder); | ||
1960 | if (current->reclaim_state) | 1937 | if (current->reclaim_state) |
1961 | current->reclaim_state->reclaimed_slab += nr_freed; | 1938 | current->reclaim_state->reclaimed_slab += nr_freed; |
1962 | free_pages((unsigned long)addr, cachep->gfporder); | 1939 | free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); |
1963 | } | 1940 | } |
1964 | 1941 | ||
1965 | static void kmem_rcu_free(struct rcu_head *head) | 1942 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2282 | 2259 | ||
2283 | if (slab_state == DOWN) { | 2260 | if (slab_state == DOWN) { |
2284 | /* | 2261 | /* |
2285 | * Note: the first kmem_cache_create must create the cache | 2262 | * Note: Creation of first cache (kmem_cache). |
2263 | * The setup_list3s is taken care | ||
2264 | * of by the caller of __kmem_cache_create | ||
2265 | */ | ||
2266 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
2267 | slab_state = PARTIAL; | ||
2268 | } else if (slab_state == PARTIAL) { | ||
2269 | /* | ||
2270 | * Note: the second kmem_cache_create must create the cache | ||
2286 | * that's used by kmalloc(24), otherwise the creation of | 2271 | * that's used by kmalloc(24), otherwise the creation of |
2287 | * further caches will BUG(). | 2272 | * further caches will BUG(). |
2288 | */ | 2273 | */ |
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2290 | 2275 | ||
2291 | /* | 2276 | /* |
2292 | * If the cache that's used by kmalloc(sizeof(kmem_list3)) is | 2277 | * If the cache that's used by kmalloc(sizeof(kmem_list3)) is |
2293 | * the first cache, then we need to set up all its list3s, | 2278 | * the second cache, then we need to set up all its list3s, |
2294 | * otherwise the creation of further caches will BUG(). | 2279 | * otherwise the creation of further caches will BUG(). |
2295 | */ | 2280 | */ |
2296 | set_up_list3s(cachep, SIZE_AC); | 2281 | set_up_list3s(cachep, SIZE_AC); |
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2299 | else | 2284 | else |
2300 | slab_state = PARTIAL_ARRAYCACHE; | 2285 | slab_state = PARTIAL_ARRAYCACHE; |
2301 | } else { | 2286 | } else { |
2287 | /* Remaining boot caches */ | ||
2302 | cachep->array[smp_processor_id()] = | 2288 | cachep->array[smp_processor_id()] = |
2303 | kmalloc(sizeof(struct arraycache_init), gfp); | 2289 | kmalloc(sizeof(struct arraycache_init), gfp); |
2304 | 2290 | ||
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2331 | 2317 | ||
2332 | /** | 2318 | /** |
2333 | * __kmem_cache_create - Create a cache. | 2319 | * __kmem_cache_create - Create a cache. |
2334 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 2320 | * @cachep: cache management descriptor |
2335 | * @size: The size of objects to be created in this cache. | ||
2336 | * @align: The required alignment for the objects. | ||
2337 | * @flags: SLAB flags | 2321 | * @flags: SLAB flags |
2338 | * @ctor: A constructor for the objects. | ||
2339 | * | 2322 | * |
2340 | * Returns a ptr to the cache on success, NULL on failure. | 2323 | * Returns a ptr to the cache on success, NULL on failure. |
2341 | * Cannot be called within a int, but can be interrupted. | 2324 | * Cannot be called within a int, but can be interrupted. |
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2378 | if (flags & SLAB_DESTROY_BY_RCU) | 2361 | if (flags & SLAB_DESTROY_BY_RCU) |
2379 | BUG_ON(flags & SLAB_POISON); | 2362 | BUG_ON(flags & SLAB_POISON); |
2380 | #endif | 2363 | #endif |
2381 | /* | ||
2382 | * Always checks flags, a caller might be expecting debug support which | ||
2383 | * isn't available. | ||
2384 | */ | ||
2385 | BUG_ON(flags & ~CREATE_MASK); | ||
2386 | 2364 | ||
2387 | /* | 2365 | /* |
2388 | * Check that size is in terms of words. This is needed to avoid | 2366 | * Check that size is in terms of words. This is needed to avoid |
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2394 | size &= ~(BYTES_PER_WORD - 1); | 2372 | size &= ~(BYTES_PER_WORD - 1); |
2395 | } | 2373 | } |
2396 | 2374 | ||
2397 | /* calculate the final buffer alignment: */ | ||
2398 | |||
2399 | /* 1) arch recommendation: can be overridden for debug */ | ||
2400 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
2401 | /* | ||
2402 | * Default alignment: as specified by the arch code. Except if | ||
2403 | * an object is really small, then squeeze multiple objects into | ||
2404 | * one cacheline. | ||
2405 | */ | ||
2406 | ralign = cache_line_size(); | ||
2407 | while (size <= ralign / 2) | ||
2408 | ralign /= 2; | ||
2409 | } else { | ||
2410 | ralign = BYTES_PER_WORD; | ||
2411 | } | ||
2412 | |||
2413 | /* | 2375 | /* |
2414 | * Redzoning and user store require word alignment or possibly larger. | 2376 | * Redzoning and user store require word alignment or possibly larger. |
2415 | * Note this will be overridden by architecture or caller mandated | 2377 | * Note this will be overridden by architecture or caller mandated |
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2426 | size &= ~(REDZONE_ALIGN - 1); | 2388 | size &= ~(REDZONE_ALIGN - 1); |
2427 | } | 2389 | } |
2428 | 2390 | ||
2429 | /* 2) arch mandated alignment */ | ||
2430 | if (ralign < ARCH_SLAB_MINALIGN) { | ||
2431 | ralign = ARCH_SLAB_MINALIGN; | ||
2432 | } | ||
2433 | /* 3) caller mandated alignment */ | 2391 | /* 3) caller mandated alignment */ |
2434 | if (ralign < cachep->align) { | 2392 | if (ralign < cachep->align) { |
2435 | ralign = cachep->align; | 2393 | ralign = cachep->align; |
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2447 | else | 2405 | else |
2448 | gfp = GFP_NOWAIT; | 2406 | gfp = GFP_NOWAIT; |
2449 | 2407 | ||
2450 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | 2408 | setup_nodelists_pointer(cachep); |
2451 | #if DEBUG | 2409 | #if DEBUG |
2452 | 2410 | ||
2453 | /* | 2411 | /* |
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2566 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | 2524 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); |
2567 | 2525 | ||
2568 | slab_set_debugobj_lock_classes(cachep); | 2526 | slab_set_debugobj_lock_classes(cachep); |
2569 | } | 2527 | } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) |
2528 | on_slab_lock_classes(cachep); | ||
2570 | 2529 | ||
2571 | return 0; | 2530 | return 0; |
2572 | } | 2531 | } |
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3530 | if (slab_should_failslab(cachep, flags)) | 3489 | if (slab_should_failslab(cachep, flags)) |
3531 | return NULL; | 3490 | return NULL; |
3532 | 3491 | ||
3492 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3493 | |||
3533 | cache_alloc_debugcheck_before(cachep, flags); | 3494 | cache_alloc_debugcheck_before(cachep, flags); |
3534 | local_irq_save(save_flags); | 3495 | local_irq_save(save_flags); |
3535 | 3496 | ||
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
3615 | if (slab_should_failslab(cachep, flags)) | 3576 | if (slab_should_failslab(cachep, flags)) |
3616 | return NULL; | 3577 | return NULL; |
3617 | 3578 | ||
3579 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3580 | |||
3618 | cache_alloc_debugcheck_before(cachep, flags); | 3581 | cache_alloc_debugcheck_before(cachep, flags); |
3619 | local_irq_save(save_flags); | 3582 | local_irq_save(save_flags); |
3620 | objp = __do_cache_alloc(cachep, flags); | 3583 | objp = __do_cache_alloc(cachep, flags); |
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc); | |||
3928 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) | 3891 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) |
3929 | { | 3892 | { |
3930 | unsigned long flags; | 3893 | unsigned long flags; |
3894 | cachep = cache_from_obj(cachep, objp); | ||
3895 | if (!cachep) | ||
3896 | return; | ||
3931 | 3897 | ||
3932 | local_irq_save(flags); | 3898 | local_irq_save(flags); |
3933 | debug_check_no_locks_freed(objp, cachep->object_size); | 3899 | debug_check_no_locks_freed(objp, cachep->object_size); |
@@ -3969,12 +3935,6 @@ void kfree(const void *objp) | |||
3969 | } | 3935 | } |
3970 | EXPORT_SYMBOL(kfree); | 3936 | EXPORT_SYMBOL(kfree); |
3971 | 3937 | ||
3972 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | ||
3973 | { | ||
3974 | return cachep->object_size; | ||
3975 | } | ||
3976 | EXPORT_SYMBOL(kmem_cache_size); | ||
3977 | |||
3978 | /* | 3938 | /* |
3979 | * This initializes kmem_list3 or resizes various caches for all nodes. | 3939 | * This initializes kmem_list3 or resizes various caches for all nodes. |
3980 | */ | 3940 | */ |
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info) | |||
4081 | } | 4041 | } |
4082 | 4042 | ||
4083 | /* Always called with the slab_mutex held */ | 4043 | /* Always called with the slab_mutex held */ |
4084 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 4044 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, |
4085 | int batchcount, int shared, gfp_t gfp) | 4045 | int batchcount, int shared, gfp_t gfp) |
4086 | { | 4046 | { |
4087 | struct ccupdate_struct *new; | 4047 | struct ccupdate_struct *new; |
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
4124 | return alloc_kmemlist(cachep, gfp); | 4084 | return alloc_kmemlist(cachep, gfp); |
4125 | } | 4085 | } |
4126 | 4086 | ||
4087 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | ||
4088 | int batchcount, int shared, gfp_t gfp) | ||
4089 | { | ||
4090 | int ret; | ||
4091 | struct kmem_cache *c = NULL; | ||
4092 | int i = 0; | ||
4093 | |||
4094 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | ||
4095 | |||
4096 | if (slab_state < FULL) | ||
4097 | return ret; | ||
4098 | |||
4099 | if ((ret < 0) || !is_root_cache(cachep)) | ||
4100 | return ret; | ||
4101 | |||
4102 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | ||
4103 | for_each_memcg_cache_index(i) { | ||
4104 | c = cache_from_memcg(cachep, i); | ||
4105 | if (c) | ||
4106 | /* return value determined by the parent cache only */ | ||
4107 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | ||
4108 | } | ||
4109 | |||
4110 | return ret; | ||
4111 | } | ||
4112 | |||
4127 | /* Called with slab_mutex held always */ | 4113 | /* Called with slab_mutex held always */ |
4128 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | 4114 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
4129 | { | 4115 | { |
4130 | int err; | 4116 | int err; |
4131 | int limit, shared; | 4117 | int limit = 0; |
4118 | int shared = 0; | ||
4119 | int batchcount = 0; | ||
4120 | |||
4121 | if (!is_root_cache(cachep)) { | ||
4122 | struct kmem_cache *root = memcg_root_cache(cachep); | ||
4123 | limit = root->limit; | ||
4124 | shared = root->shared; | ||
4125 | batchcount = root->batchcount; | ||
4126 | } | ||
4132 | 4127 | ||
4128 | if (limit && shared && batchcount) | ||
4129 | goto skip_setup; | ||
4133 | /* | 4130 | /* |
4134 | * The head array serves three purposes: | 4131 | * The head array serves three purposes: |
4135 | * - create a LIFO ordering, i.e. return objects that are cache-warm | 4132 | * - create a LIFO ordering, i.e. return objects that are cache-warm |
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4171 | if (limit > 32) | 4168 | if (limit > 32) |
4172 | limit = 32; | 4169 | limit = 32; |
4173 | #endif | 4170 | #endif |
4174 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); | 4171 | batchcount = (limit + 1) / 2; |
4172 | skip_setup: | ||
4173 | err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | ||
4175 | if (err) | 4174 | if (err) |
4176 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 4175 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
4177 | cachep->name, -err); | 4176 | cachep->name, -err); |
@@ -4276,54 +4275,8 @@ out: | |||
4276 | } | 4275 | } |
4277 | 4276 | ||
4278 | #ifdef CONFIG_SLABINFO | 4277 | #ifdef CONFIG_SLABINFO |
4279 | 4278 | void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) | |
4280 | static void print_slabinfo_header(struct seq_file *m) | ||
4281 | { | ||
4282 | /* | ||
4283 | * Output format version, so at least we can change it | ||
4284 | * without _too_ many complaints. | ||
4285 | */ | ||
4286 | #if STATS | ||
4287 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); | ||
4288 | #else | ||
4289 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
4290 | #endif | ||
4291 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | ||
4292 | "<objperslab> <pagesperslab>"); | ||
4293 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
4294 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
4295 | #if STATS | ||
4296 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " | ||
4297 | "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); | ||
4298 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); | ||
4299 | #endif | ||
4300 | seq_putc(m, '\n'); | ||
4301 | } | ||
4302 | |||
4303 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
4304 | { | ||
4305 | loff_t n = *pos; | ||
4306 | |||
4307 | mutex_lock(&slab_mutex); | ||
4308 | if (!n) | ||
4309 | print_slabinfo_header(m); | ||
4310 | |||
4311 | return seq_list_start(&slab_caches, *pos); | ||
4312 | } | ||
4313 | |||
4314 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
4315 | { | 4279 | { |
4316 | return seq_list_next(p, &slab_caches, pos); | ||
4317 | } | ||
4318 | |||
4319 | static void s_stop(struct seq_file *m, void *p) | ||
4320 | { | ||
4321 | mutex_unlock(&slab_mutex); | ||
4322 | } | ||
4323 | |||
4324 | static int s_show(struct seq_file *m, void *p) | ||
4325 | { | ||
4326 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); | ||
4327 | struct slab *slabp; | 4280 | struct slab *slabp; |
4328 | unsigned long active_objs; | 4281 | unsigned long active_objs; |
4329 | unsigned long num_objs; | 4282 | unsigned long num_objs; |
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p) | |||
4378 | if (error) | 4331 | if (error) |
4379 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 4332 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
4380 | 4333 | ||
4381 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 4334 | sinfo->active_objs = active_objs; |
4382 | name, active_objs, num_objs, cachep->size, | 4335 | sinfo->num_objs = num_objs; |
4383 | cachep->num, (1 << cachep->gfporder)); | 4336 | sinfo->active_slabs = active_slabs; |
4384 | seq_printf(m, " : tunables %4u %4u %4u", | 4337 | sinfo->num_slabs = num_slabs; |
4385 | cachep->limit, cachep->batchcount, cachep->shared); | 4338 | sinfo->shared_avail = shared_avail; |
4386 | seq_printf(m, " : slabdata %6lu %6lu %6lu", | 4339 | sinfo->limit = cachep->limit; |
4387 | active_slabs, num_slabs, shared_avail); | 4340 | sinfo->batchcount = cachep->batchcount; |
4341 | sinfo->shared = cachep->shared; | ||
4342 | sinfo->objects_per_slab = cachep->num; | ||
4343 | sinfo->cache_order = cachep->gfporder; | ||
4344 | } | ||
4345 | |||
4346 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) | ||
4347 | { | ||
4388 | #if STATS | 4348 | #if STATS |
4389 | { /* list3 stats */ | 4349 | { /* list3 stats */ |
4390 | unsigned long high = cachep->high_mark; | 4350 | unsigned long high = cachep->high_mark; |
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p) | |||
4414 | allochit, allocmiss, freehit, freemiss); | 4374 | allochit, allocmiss, freehit, freemiss); |
4415 | } | 4375 | } |
4416 | #endif | 4376 | #endif |
4417 | seq_putc(m, '\n'); | ||
4418 | return 0; | ||
4419 | } | 4377 | } |
4420 | 4378 | ||
4421 | /* | ||
4422 | * slabinfo_op - iterator that generates /proc/slabinfo | ||
4423 | * | ||
4424 | * Output layout: | ||
4425 | * cache-name | ||
4426 | * num-active-objs | ||
4427 | * total-objs | ||
4428 | * object size | ||
4429 | * num-active-slabs | ||
4430 | * total-slabs | ||
4431 | * num-pages-per-slab | ||
4432 | * + further values on SMP and with statistics enabled | ||
4433 | */ | ||
4434 | |||
4435 | static const struct seq_operations slabinfo_op = { | ||
4436 | .start = s_start, | ||
4437 | .next = s_next, | ||
4438 | .stop = s_stop, | ||
4439 | .show = s_show, | ||
4440 | }; | ||
4441 | |||
4442 | #define MAX_SLABINFO_WRITE 128 | 4379 | #define MAX_SLABINFO_WRITE 128 |
4443 | /** | 4380 | /** |
4444 | * slabinfo_write - Tuning for the slab allocator | 4381 | * slabinfo_write - Tuning for the slab allocator |
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = { | |||
4447 | * @count: data length | 4384 | * @count: data length |
4448 | * @ppos: unused | 4385 | * @ppos: unused |
4449 | */ | 4386 | */ |
4450 | static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 4387 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
4451 | size_t count, loff_t *ppos) | 4388 | size_t count, loff_t *ppos) |
4452 | { | 4389 | { |
4453 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; | 4390 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4490 | return res; | 4427 | return res; |
4491 | } | 4428 | } |
4492 | 4429 | ||
4493 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
4494 | { | ||
4495 | return seq_open(file, &slabinfo_op); | ||
4496 | } | ||
4497 | |||
4498 | static const struct file_operations proc_slabinfo_operations = { | ||
4499 | .open = slabinfo_open, | ||
4500 | .read = seq_read, | ||
4501 | .write = slabinfo_write, | ||
4502 | .llseek = seq_lseek, | ||
4503 | .release = seq_release, | ||
4504 | }; | ||
4505 | |||
4506 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4430 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4507 | 4431 | ||
4508 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4432 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4631 | return 0; | 4555 | return 0; |
4632 | } | 4556 | } |
4633 | 4557 | ||
4558 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
4559 | { | ||
4560 | return seq_list_next(p, &slab_caches, pos); | ||
4561 | } | ||
4562 | |||
4563 | static void s_stop(struct seq_file *m, void *p) | ||
4564 | { | ||
4565 | mutex_unlock(&slab_mutex); | ||
4566 | } | ||
4567 | |||
4634 | static const struct seq_operations slabstats_op = { | 4568 | static const struct seq_operations slabstats_op = { |
4635 | .start = leaks_start, | 4569 | .start = leaks_start, |
4636 | .next = s_next, | 4570 | .next = s_next, |
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = { | |||
4665 | 4599 | ||
4666 | static int __init slab_proc_init(void) | 4600 | static int __init slab_proc_init(void) |
4667 | { | 4601 | { |
4668 | proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); | ||
4669 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4602 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4670 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); | 4603 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); |
4671 | #endif | 4604 | #endif |
@@ -32,19 +32,201 @@ extern struct list_head slab_caches; | |||
32 | /* The slab cache that manages slab cache information */ | 32 | /* The slab cache that manages slab cache information */ |
33 | extern struct kmem_cache *kmem_cache; | 33 | extern struct kmem_cache *kmem_cache; |
34 | 34 | ||
35 | unsigned long calculate_alignment(unsigned long flags, | ||
36 | unsigned long align, unsigned long size); | ||
37 | |||
35 | /* Functions provided by the slab allocators */ | 38 | /* Functions provided by the slab allocators */ |
36 | extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); | 39 | extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); |
37 | 40 | ||
41 | extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, | ||
42 | unsigned long flags); | ||
43 | extern void create_boot_cache(struct kmem_cache *, const char *name, | ||
44 | size_t size, unsigned long flags); | ||
45 | |||
46 | struct mem_cgroup; | ||
38 | #ifdef CONFIG_SLUB | 47 | #ifdef CONFIG_SLUB |
39 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 48 | struct kmem_cache * |
40 | size_t align, unsigned long flags, void (*ctor)(void *)); | 49 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
50 | size_t align, unsigned long flags, void (*ctor)(void *)); | ||
41 | #else | 51 | #else |
42 | static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 52 | static inline struct kmem_cache * |
43 | size_t align, unsigned long flags, void (*ctor)(void *)) | 53 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
54 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
44 | { return NULL; } | 55 | { return NULL; } |
45 | #endif | 56 | #endif |
46 | 57 | ||
47 | 58 | ||
59 | /* Legal flag mask for kmem_cache_create(), for various configurations */ | ||
60 | #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ | ||
61 | SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) | ||
62 | |||
63 | #if defined(CONFIG_DEBUG_SLAB) | ||
64 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | ||
65 | #elif defined(CONFIG_SLUB_DEBUG) | ||
66 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
67 | SLAB_TRACE | SLAB_DEBUG_FREE) | ||
68 | #else | ||
69 | #define SLAB_DEBUG_FLAGS (0) | ||
70 | #endif | ||
71 | |||
72 | #if defined(CONFIG_SLAB) | ||
73 | #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ | ||
74 | SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) | ||
75 | #elif defined(CONFIG_SLUB) | ||
76 | #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ | ||
77 | SLAB_TEMPORARY | SLAB_NOTRACK) | ||
78 | #else | ||
79 | #define SLAB_CACHE_FLAGS (0) | ||
80 | #endif | ||
81 | |||
82 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | ||
83 | |||
48 | int __kmem_cache_shutdown(struct kmem_cache *); | 84 | int __kmem_cache_shutdown(struct kmem_cache *); |
49 | 85 | ||
86 | struct seq_file; | ||
87 | struct file; | ||
88 | |||
89 | struct slabinfo { | ||
90 | unsigned long active_objs; | ||
91 | unsigned long num_objs; | ||
92 | unsigned long active_slabs; | ||
93 | unsigned long num_slabs; | ||
94 | unsigned long shared_avail; | ||
95 | unsigned int limit; | ||
96 | unsigned int batchcount; | ||
97 | unsigned int shared; | ||
98 | unsigned int objects_per_slab; | ||
99 | unsigned int cache_order; | ||
100 | }; | ||
101 | |||
102 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); | ||
103 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); | ||
104 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | ||
105 | size_t count, loff_t *ppos); | ||
106 | |||
107 | #ifdef CONFIG_MEMCG_KMEM | ||
108 | static inline bool is_root_cache(struct kmem_cache *s) | ||
109 | { | ||
110 | return !s->memcg_params || s->memcg_params->is_root_cache; | ||
111 | } | ||
112 | |||
113 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
114 | struct mem_cgroup *memcg) | ||
115 | { | ||
116 | return (is_root_cache(cachep) && !memcg) || | ||
117 | (cachep->memcg_params->memcg == memcg); | ||
118 | } | ||
119 | |||
120 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
121 | { | ||
122 | if (!is_root_cache(s)) | ||
123 | atomic_add(1 << order, &s->memcg_params->nr_pages); | ||
124 | } | ||
125 | |||
126 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
127 | { | ||
128 | if (is_root_cache(s)) | ||
129 | return; | ||
130 | |||
131 | if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) | ||
132 | mem_cgroup_destroy_cache(s); | ||
133 | } | ||
134 | |||
135 | static inline bool slab_equal_or_root(struct kmem_cache *s, | ||
136 | struct kmem_cache *p) | ||
137 | { | ||
138 | return (p == s) || | ||
139 | (s->memcg_params && (p == s->memcg_params->root_cache)); | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * We use suffixes to the name in memcg because we can't have caches | ||
144 | * created in the system with the same name. But when we print them | ||
145 | * locally, better refer to them with the base name | ||
146 | */ | ||
147 | static inline const char *cache_name(struct kmem_cache *s) | ||
148 | { | ||
149 | if (!is_root_cache(s)) | ||
150 | return s->memcg_params->root_cache->name; | ||
151 | return s->name; | ||
152 | } | ||
153 | |||
154 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | ||
155 | { | ||
156 | return s->memcg_params->memcg_caches[idx]; | ||
157 | } | ||
158 | |||
159 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | ||
160 | { | ||
161 | if (is_root_cache(s)) | ||
162 | return s; | ||
163 | return s->memcg_params->root_cache; | ||
164 | } | ||
165 | #else | ||
166 | static inline bool is_root_cache(struct kmem_cache *s) | ||
167 | { | ||
168 | return true; | ||
169 | } | ||
170 | |||
171 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
172 | struct mem_cgroup *memcg) | ||
173 | { | ||
174 | return true; | ||
175 | } | ||
176 | |||
177 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
178 | { | ||
179 | } | ||
180 | |||
181 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
182 | { | ||
183 | } | ||
184 | |||
185 | static inline bool slab_equal_or_root(struct kmem_cache *s, | ||
186 | struct kmem_cache *p) | ||
187 | { | ||
188 | return true; | ||
189 | } | ||
190 | |||
191 | static inline const char *cache_name(struct kmem_cache *s) | ||
192 | { | ||
193 | return s->name; | ||
194 | } | ||
195 | |||
196 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | ||
197 | { | ||
198 | return NULL; | ||
199 | } | ||
200 | |||
201 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | ||
202 | { | ||
203 | return s; | ||
204 | } | ||
205 | #endif | ||
206 | |||
207 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | ||
208 | { | ||
209 | struct kmem_cache *cachep; | ||
210 | struct page *page; | ||
211 | |||
212 | /* | ||
213 | * When kmemcg is not being used, both assignments should return the | ||
214 | * same value. but we don't want to pay the assignment price in that | ||
215 | * case. If it is not compiled in, the compiler should be smart enough | ||
216 | * to not do even the assignment. In that case, slab_equal_or_root | ||
217 | * will also be a constant. | ||
218 | */ | ||
219 | if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) | ||
220 | return s; | ||
221 | |||
222 | page = virt_to_head_page(x); | ||
223 | cachep = page->slab_cache; | ||
224 | if (slab_equal_or_root(cachep, s)) | ||
225 | return cachep; | ||
226 | |||
227 | pr_err("%s: Wrong slab cache. %s but object is from %s\n", | ||
228 | __FUNCTION__, cachep->name, s->name); | ||
229 | WARN_ON_ONCE(1); | ||
230 | return s; | ||
231 | } | ||
50 | #endif | 232 | #endif |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 069a24e64403..3f3cd97d3fdf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -13,9 +13,12 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
16 | #include <linux/seq_file.h> | ||
17 | #include <linux/proc_fs.h> | ||
16 | #include <asm/cacheflush.h> | 18 | #include <asm/cacheflush.h> |
17 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
18 | #include <asm/page.h> | 20 | #include <asm/page.h> |
21 | #include <linux/memcontrol.h> | ||
19 | 22 | ||
20 | #include "slab.h" | 23 | #include "slab.h" |
21 | 24 | ||
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); | |||
25 | struct kmem_cache *kmem_cache; | 28 | struct kmem_cache *kmem_cache; |
26 | 29 | ||
27 | #ifdef CONFIG_DEBUG_VM | 30 | #ifdef CONFIG_DEBUG_VM |
28 | static int kmem_cache_sanity_check(const char *name, size_t size) | 31 | static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, |
32 | size_t size) | ||
29 | { | 33 | { |
30 | struct kmem_cache *s = NULL; | 34 | struct kmem_cache *s = NULL; |
31 | 35 | ||
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) | |||
51 | continue; | 55 | continue; |
52 | } | 56 | } |
53 | 57 | ||
54 | if (!strcmp(s->name, name)) { | 58 | /* |
59 | * For simplicity, we won't check this in the list of memcg | ||
60 | * caches. We have control over memcg naming, and if there | ||
61 | * aren't duplicates in the global list, there won't be any | ||
62 | * duplicates in the memcg lists as well. | ||
63 | */ | ||
64 | if (!memcg && !strcmp(s->name, name)) { | ||
55 | pr_err("%s (%s): Cache name already exists.\n", | 65 | pr_err("%s (%s): Cache name already exists.\n", |
56 | __func__, name); | 66 | __func__, name); |
57 | dump_stack(); | 67 | dump_stack(); |
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size) | |||
64 | return 0; | 74 | return 0; |
65 | } | 75 | } |
66 | #else | 76 | #else |
67 | static inline int kmem_cache_sanity_check(const char *name, size_t size) | 77 | static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, |
78 | const char *name, size_t size) | ||
68 | { | 79 | { |
69 | return 0; | 80 | return 0; |
70 | } | 81 | } |
71 | #endif | 82 | #endif |
72 | 83 | ||
84 | #ifdef CONFIG_MEMCG_KMEM | ||
85 | int memcg_update_all_caches(int num_memcgs) | ||
86 | { | ||
87 | struct kmem_cache *s; | ||
88 | int ret = 0; | ||
89 | mutex_lock(&slab_mutex); | ||
90 | |||
91 | list_for_each_entry(s, &slab_caches, list) { | ||
92 | if (!is_root_cache(s)) | ||
93 | continue; | ||
94 | |||
95 | ret = memcg_update_cache_size(s, num_memcgs); | ||
96 | /* | ||
97 | * See comment in memcontrol.c, memcg_update_cache_size: | ||
98 | * Instead of freeing the memory, we'll just leave the caches | ||
99 | * up to this point in an updated state. | ||
100 | */ | ||
101 | if (ret) | ||
102 | goto out; | ||
103 | } | ||
104 | |||
105 | memcg_update_array_size(num_memcgs); | ||
106 | out: | ||
107 | mutex_unlock(&slab_mutex); | ||
108 | return ret; | ||
109 | } | ||
110 | #endif | ||
111 | |||
112 | /* | ||
113 | * Figure out what the alignment of the objects will be given a set of | ||
114 | * flags, a user specified alignment and the size of the objects. | ||
115 | */ | ||
116 | unsigned long calculate_alignment(unsigned long flags, | ||
117 | unsigned long align, unsigned long size) | ||
118 | { | ||
119 | /* | ||
120 | * If the user wants hardware cache aligned objects then follow that | ||
121 | * suggestion if the object is sufficiently large. | ||
122 | * | ||
123 | * The hardware cache alignment cannot override the specified | ||
124 | * alignment though. If that is greater then use it. | ||
125 | */ | ||
126 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
127 | unsigned long ralign = cache_line_size(); | ||
128 | while (size <= ralign / 2) | ||
129 | ralign /= 2; | ||
130 | align = max(align, ralign); | ||
131 | } | ||
132 | |||
133 | if (align < ARCH_SLAB_MINALIGN) | ||
134 | align = ARCH_SLAB_MINALIGN; | ||
135 | |||
136 | return ALIGN(align, sizeof(void *)); | ||
137 | } | ||
138 | |||
139 | |||
73 | /* | 140 | /* |
74 | * kmem_cache_create - Create a cache. | 141 | * kmem_cache_create - Create a cache. |
75 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 142 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
95 | * as davem. | 162 | * as davem. |
96 | */ | 163 | */ |
97 | 164 | ||
98 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, | 165 | struct kmem_cache * |
99 | unsigned long flags, void (*ctor)(void *)) | 166 | kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, |
167 | size_t align, unsigned long flags, void (*ctor)(void *), | ||
168 | struct kmem_cache *parent_cache) | ||
100 | { | 169 | { |
101 | struct kmem_cache *s = NULL; | 170 | struct kmem_cache *s = NULL; |
102 | int err = 0; | 171 | int err = 0; |
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
104 | get_online_cpus(); | 173 | get_online_cpus(); |
105 | mutex_lock(&slab_mutex); | 174 | mutex_lock(&slab_mutex); |
106 | 175 | ||
107 | if (!kmem_cache_sanity_check(name, size) == 0) | 176 | if (!kmem_cache_sanity_check(memcg, name, size) == 0) |
108 | goto out_locked; | 177 | goto out_locked; |
109 | 178 | ||
179 | /* | ||
180 | * Some allocators will constraint the set of valid flags to a subset | ||
181 | * of all flags. We expect them to define CACHE_CREATE_MASK in this | ||
182 | * case, and we'll just provide them with a sanitized version of the | ||
183 | * passed flags. | ||
184 | */ | ||
185 | flags &= CACHE_CREATE_MASK; | ||
110 | 186 | ||
111 | s = __kmem_cache_alias(name, size, align, flags, ctor); | 187 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); |
112 | if (s) | 188 | if (s) |
113 | goto out_locked; | 189 | goto out_locked; |
114 | 190 | ||
115 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | 191 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); |
116 | if (s) { | 192 | if (s) { |
117 | s->object_size = s->size = size; | 193 | s->object_size = s->size = size; |
118 | s->align = align; | 194 | s->align = calculate_alignment(flags, align, size); |
119 | s->ctor = ctor; | 195 | s->ctor = ctor; |
196 | |||
197 | if (memcg_register_cache(memcg, s, parent_cache)) { | ||
198 | kmem_cache_free(kmem_cache, s); | ||
199 | err = -ENOMEM; | ||
200 | goto out_locked; | ||
201 | } | ||
202 | |||
120 | s->name = kstrdup(name, GFP_KERNEL); | 203 | s->name = kstrdup(name, GFP_KERNEL); |
121 | if (!s->name) { | 204 | if (!s->name) { |
122 | kmem_cache_free(kmem_cache, s); | 205 | kmem_cache_free(kmem_cache, s); |
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
126 | 209 | ||
127 | err = __kmem_cache_create(s, flags); | 210 | err = __kmem_cache_create(s, flags); |
128 | if (!err) { | 211 | if (!err) { |
129 | |||
130 | s->refcount = 1; | 212 | s->refcount = 1; |
131 | list_add(&s->list, &slab_caches); | 213 | list_add(&s->list, &slab_caches); |
132 | 214 | memcg_cache_list_add(memcg, s); | |
133 | } else { | 215 | } else { |
134 | kfree(s->name); | 216 | kfree(s->name); |
135 | kmem_cache_free(kmem_cache, s); | 217 | kmem_cache_free(kmem_cache, s); |
@@ -157,10 +239,20 @@ out_locked: | |||
157 | 239 | ||
158 | return s; | 240 | return s; |
159 | } | 241 | } |
242 | |||
243 | struct kmem_cache * | ||
244 | kmem_cache_create(const char *name, size_t size, size_t align, | ||
245 | unsigned long flags, void (*ctor)(void *)) | ||
246 | { | ||
247 | return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); | ||
248 | } | ||
160 | EXPORT_SYMBOL(kmem_cache_create); | 249 | EXPORT_SYMBOL(kmem_cache_create); |
161 | 250 | ||
162 | void kmem_cache_destroy(struct kmem_cache *s) | 251 | void kmem_cache_destroy(struct kmem_cache *s) |
163 | { | 252 | { |
253 | /* Destroy all the children caches if we aren't a memcg cache */ | ||
254 | kmem_cache_destroy_memcg_children(s); | ||
255 | |||
164 | get_online_cpus(); | 256 | get_online_cpus(); |
165 | mutex_lock(&slab_mutex); | 257 | mutex_lock(&slab_mutex); |
166 | s->refcount--; | 258 | s->refcount--; |
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
172 | if (s->flags & SLAB_DESTROY_BY_RCU) | 264 | if (s->flags & SLAB_DESTROY_BY_RCU) |
173 | rcu_barrier(); | 265 | rcu_barrier(); |
174 | 266 | ||
267 | memcg_release_cache(s); | ||
175 | kfree(s->name); | 268 | kfree(s->name); |
176 | kmem_cache_free(kmem_cache, s); | 269 | kmem_cache_free(kmem_cache, s); |
177 | } else { | 270 | } else { |
@@ -192,3 +285,182 @@ int slab_is_available(void) | |||
192 | { | 285 | { |
193 | return slab_state >= UP; | 286 | return slab_state >= UP; |
194 | } | 287 | } |
288 | |||
289 | #ifndef CONFIG_SLOB | ||
290 | /* Create a cache during boot when no slab services are available yet */ | ||
291 | void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, | ||
292 | unsigned long flags) | ||
293 | { | ||
294 | int err; | ||
295 | |||
296 | s->name = name; | ||
297 | s->size = s->object_size = size; | ||
298 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); | ||
299 | err = __kmem_cache_create(s, flags); | ||
300 | |||
301 | if (err) | ||
302 | panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n", | ||
303 | name, size, err); | ||
304 | |||
305 | s->refcount = -1; /* Exempt from merging for now */ | ||
306 | } | ||
307 | |||
308 | struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, | ||
309 | unsigned long flags) | ||
310 | { | ||
311 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
312 | |||
313 | if (!s) | ||
314 | panic("Out of memory when creating slab %s\n", name); | ||
315 | |||
316 | create_boot_cache(s, name, size, flags); | ||
317 | list_add(&s->list, &slab_caches); | ||
318 | s->refcount = 1; | ||
319 | return s; | ||
320 | } | ||
321 | |||
322 | #endif /* !CONFIG_SLOB */ | ||
323 | |||
324 | |||
325 | #ifdef CONFIG_SLABINFO | ||
326 | void print_slabinfo_header(struct seq_file *m) | ||
327 | { | ||
328 | /* | ||
329 | * Output format version, so at least we can change it | ||
330 | * without _too_ many complaints. | ||
331 | */ | ||
332 | #ifdef CONFIG_DEBUG_SLAB | ||
333 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); | ||
334 | #else | ||
335 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
336 | #endif | ||
337 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | ||
338 | "<objperslab> <pagesperslab>"); | ||
339 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
340 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
341 | #ifdef CONFIG_DEBUG_SLAB | ||
342 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " | ||
343 | "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); | ||
344 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); | ||
345 | #endif | ||
346 | seq_putc(m, '\n'); | ||
347 | } | ||
348 | |||
349 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
350 | { | ||
351 | loff_t n = *pos; | ||
352 | |||
353 | mutex_lock(&slab_mutex); | ||
354 | if (!n) | ||
355 | print_slabinfo_header(m); | ||
356 | |||
357 | return seq_list_start(&slab_caches, *pos); | ||
358 | } | ||
359 | |||
360 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
361 | { | ||
362 | return seq_list_next(p, &slab_caches, pos); | ||
363 | } | ||
364 | |||
365 | static void s_stop(struct seq_file *m, void *p) | ||
366 | { | ||
367 | mutex_unlock(&slab_mutex); | ||
368 | } | ||
369 | |||
370 | static void | ||
371 | memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | ||
372 | { | ||
373 | struct kmem_cache *c; | ||
374 | struct slabinfo sinfo; | ||
375 | int i; | ||
376 | |||
377 | if (!is_root_cache(s)) | ||
378 | return; | ||
379 | |||
380 | for_each_memcg_cache_index(i) { | ||
381 | c = cache_from_memcg(s, i); | ||
382 | if (!c) | ||
383 | continue; | ||
384 | |||
385 | memset(&sinfo, 0, sizeof(sinfo)); | ||
386 | get_slabinfo(c, &sinfo); | ||
387 | |||
388 | info->active_slabs += sinfo.active_slabs; | ||
389 | info->num_slabs += sinfo.num_slabs; | ||
390 | info->shared_avail += sinfo.shared_avail; | ||
391 | info->active_objs += sinfo.active_objs; | ||
392 | info->num_objs += sinfo.num_objs; | ||
393 | } | ||
394 | } | ||
395 | |||
396 | int cache_show(struct kmem_cache *s, struct seq_file *m) | ||
397 | { | ||
398 | struct slabinfo sinfo; | ||
399 | |||
400 | memset(&sinfo, 0, sizeof(sinfo)); | ||
401 | get_slabinfo(s, &sinfo); | ||
402 | |||
403 | memcg_accumulate_slabinfo(s, &sinfo); | ||
404 | |||
405 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | ||
406 | cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, | ||
407 | sinfo.objects_per_slab, (1 << sinfo.cache_order)); | ||
408 | |||
409 | seq_printf(m, " : tunables %4u %4u %4u", | ||
410 | sinfo.limit, sinfo.batchcount, sinfo.shared); | ||
411 | seq_printf(m, " : slabdata %6lu %6lu %6lu", | ||
412 | sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); | ||
413 | slabinfo_show_stats(m, s); | ||
414 | seq_putc(m, '\n'); | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | static int s_show(struct seq_file *m, void *p) | ||
419 | { | ||
420 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | ||
421 | |||
422 | if (!is_root_cache(s)) | ||
423 | return 0; | ||
424 | return cache_show(s, m); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * slabinfo_op - iterator that generates /proc/slabinfo | ||
429 | * | ||
430 | * Output layout: | ||
431 | * cache-name | ||
432 | * num-active-objs | ||
433 | * total-objs | ||
434 | * object size | ||
435 | * num-active-slabs | ||
436 | * total-slabs | ||
437 | * num-pages-per-slab | ||
438 | * + further values on SMP and with statistics enabled | ||
439 | */ | ||
440 | static const struct seq_operations slabinfo_op = { | ||
441 | .start = s_start, | ||
442 | .next = s_next, | ||
443 | .stop = s_stop, | ||
444 | .show = s_show, | ||
445 | }; | ||
446 | |||
447 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
448 | { | ||
449 | return seq_open(file, &slabinfo_op); | ||
450 | } | ||
451 | |||
452 | static const struct file_operations proc_slabinfo_operations = { | ||
453 | .open = slabinfo_open, | ||
454 | .read = seq_read, | ||
455 | .write = slabinfo_write, | ||
456 | .llseek = seq_lseek, | ||
457 | .release = seq_release, | ||
458 | }; | ||
459 | |||
460 | static int __init slab_proc_init(void) | ||
461 | { | ||
462 | proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); | ||
463 | return 0; | ||
464 | } | ||
465 | module_init(slab_proc_init); | ||
466 | #endif /* CONFIG_SLABINFO */ | ||
@@ -28,9 +28,8 @@ | |||
28 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. | 28 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. |
29 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls | 29 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls |
30 | * alloc_pages() directly, allocating compound pages so the page order | 30 | * alloc_pages() directly, allocating compound pages so the page order |
31 | * does not have to be separately tracked, and also stores the exact | 31 | * does not have to be separately tracked. |
32 | * allocation size in page->private so that it can be used to accurately | 32 | * These objects are detected in kfree() because PageSlab() |
33 | * provide ksize(). These objects are detected in kfree() because slob_page() | ||
34 | * is false for them. | 33 | * is false for them. |
35 | * | 34 | * |
36 | * SLAB is emulated on top of SLOB by simply calling constructors and | 35 | * SLAB is emulated on top of SLOB by simply calling constructors and |
@@ -59,7 +58,6 @@ | |||
59 | 58 | ||
60 | #include <linux/kernel.h> | 59 | #include <linux/kernel.h> |
61 | #include <linux/slab.h> | 60 | #include <linux/slab.h> |
62 | #include "slab.h" | ||
63 | 61 | ||
64 | #include <linux/mm.h> | 62 | #include <linux/mm.h> |
65 | #include <linux/swap.h> /* struct reclaim_state */ | 63 | #include <linux/swap.h> /* struct reclaim_state */ |
@@ -74,6 +72,7 @@ | |||
74 | 72 | ||
75 | #include <linux/atomic.h> | 73 | #include <linux/atomic.h> |
76 | 74 | ||
75 | #include "slab.h" | ||
77 | /* | 76 | /* |
78 | * slob_block has a field 'units', which indicates size of block if +ve, | 77 | * slob_block has a field 'units', which indicates size of block if +ve, |
79 | * or offset of next block if -ve (in SLOB_UNITs). | 78 | * or offset of next block if -ve (in SLOB_UNITs). |
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp) | |||
124 | 123 | ||
125 | #define SLOB_UNIT sizeof(slob_t) | 124 | #define SLOB_UNIT sizeof(slob_t) |
126 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) | 125 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) |
127 | #define SLOB_ALIGN L1_CACHE_BYTES | ||
128 | 126 | ||
129 | /* | 127 | /* |
130 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | 128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which |
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) | |||
455 | if (likely(order)) | 453 | if (likely(order)) |
456 | gfp |= __GFP_COMP; | 454 | gfp |= __GFP_COMP; |
457 | ret = slob_new_pages(gfp, order, node); | 455 | ret = slob_new_pages(gfp, order, node); |
458 | if (ret) { | ||
459 | struct page *page; | ||
460 | page = virt_to_page(ret); | ||
461 | page->private = size; | ||
462 | } | ||
463 | 456 | ||
464 | trace_kmalloc_node(caller, ret, | 457 | trace_kmalloc_node(caller, ret, |
465 | size, PAGE_SIZE << order, gfp, node); | 458 | size, PAGE_SIZE << order, gfp, node); |
@@ -506,7 +499,7 @@ void kfree(const void *block) | |||
506 | unsigned int *m = (unsigned int *)(block - align); | 499 | unsigned int *m = (unsigned int *)(block - align); |
507 | slob_free(m, *m + align); | 500 | slob_free(m, *m + align); |
508 | } else | 501 | } else |
509 | put_page(sp); | 502 | __free_pages(sp, compound_order(sp)); |
510 | } | 503 | } |
511 | EXPORT_SYMBOL(kfree); | 504 | EXPORT_SYMBOL(kfree); |
512 | 505 | ||
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree); | |||
514 | size_t ksize(const void *block) | 507 | size_t ksize(const void *block) |
515 | { | 508 | { |
516 | struct page *sp; | 509 | struct page *sp; |
510 | int align; | ||
511 | unsigned int *m; | ||
517 | 512 | ||
518 | BUG_ON(!block); | 513 | BUG_ON(!block); |
519 | if (unlikely(block == ZERO_SIZE_PTR)) | 514 | if (unlikely(block == ZERO_SIZE_PTR)) |
520 | return 0; | 515 | return 0; |
521 | 516 | ||
522 | sp = virt_to_page(block); | 517 | sp = virt_to_page(block); |
523 | if (PageSlab(sp)) { | 518 | if (unlikely(!PageSlab(sp))) |
524 | int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 519 | return PAGE_SIZE << compound_order(sp); |
525 | unsigned int *m = (unsigned int *)(block - align); | 520 | |
526 | return SLOB_UNITS(*m) * SLOB_UNIT; | 521 | align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
527 | } else | 522 | m = (unsigned int *)(block - align); |
528 | return sp->private; | 523 | return SLOB_UNITS(*m) * SLOB_UNIT; |
529 | } | 524 | } |
530 | EXPORT_SYMBOL(ksize); | 525 | EXPORT_SYMBOL(ksize); |
531 | 526 | ||
532 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) | 527 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) |
533 | { | 528 | { |
534 | size_t align = c->size; | ||
535 | |||
536 | if (flags & SLAB_DESTROY_BY_RCU) { | 529 | if (flags & SLAB_DESTROY_BY_RCU) { |
537 | /* leave room for rcu footer at the end of object */ | 530 | /* leave room for rcu footer at the end of object */ |
538 | c->size += sizeof(struct slob_rcu); | 531 | c->size += sizeof(struct slob_rcu); |
539 | } | 532 | } |
540 | c->flags = flags; | 533 | c->flags = flags; |
541 | /* ignore alignment unless it's forced */ | ||
542 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
543 | if (c->align < ARCH_SLAB_MINALIGN) | ||
544 | c->align = ARCH_SLAB_MINALIGN; | ||
545 | if (c->align < align) | ||
546 | c->align = align; | ||
547 | |||
548 | return 0; | 534 | return 0; |
549 | } | 535 | } |
550 | 536 | ||
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
558 | 544 | ||
559 | if (c->size < PAGE_SIZE) { | 545 | if (c->size < PAGE_SIZE) { |
560 | b = slob_alloc(c->size, flags, c->align, node); | 546 | b = slob_alloc(c->size, flags, c->align, node); |
561 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | 547 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, |
562 | SLOB_UNITS(c->size) * SLOB_UNIT, | 548 | SLOB_UNITS(c->size) * SLOB_UNIT, |
563 | flags, node); | 549 | flags, node); |
564 | } else { | 550 | } else { |
565 | b = slob_new_pages(flags, get_order(c->size), node); | 551 | b = slob_new_pages(flags, get_order(c->size), node); |
566 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | 552 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, |
567 | PAGE_SIZE << get_order(c->size), | 553 | PAGE_SIZE << get_order(c->size), |
568 | flags, node); | 554 | flags, node); |
569 | } | 555 | } |
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) | |||
608 | } | 594 | } |
609 | EXPORT_SYMBOL(kmem_cache_free); | 595 | EXPORT_SYMBOL(kmem_cache_free); |
610 | 596 | ||
611 | unsigned int kmem_cache_size(struct kmem_cache *c) | ||
612 | { | ||
613 | return c->size; | ||
614 | } | ||
615 | EXPORT_SYMBOL(kmem_cache_size); | ||
616 | |||
617 | int __kmem_cache_shutdown(struct kmem_cache *c) | 597 | int __kmem_cache_shutdown(struct kmem_cache *c) |
618 | { | 598 | { |
619 | /* No way to check for remaining objects */ | 599 | /* No way to check for remaining objects */ |
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/fault-inject.h> | 31 | #include <linux/fault-inject.h> |
32 | #include <linux/stacktrace.h> | 32 | #include <linux/stacktrace.h> |
33 | #include <linux/prefetch.h> | 33 | #include <linux/prefetch.h> |
34 | #include <linux/memcontrol.h> | ||
34 | 35 | ||
35 | #include <trace/events/kmem.h> | 36 | #include <trace/events/kmem.h> |
36 | 37 | ||
@@ -112,9 +113,6 @@ | |||
112 | * the fast path and disables lockless freelists. | 113 | * the fast path and disables lockless freelists. |
113 | */ | 114 | */ |
114 | 115 | ||
115 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
116 | SLAB_TRACE | SLAB_DEBUG_FREE) | ||
117 | |||
118 | static inline int kmem_cache_debug(struct kmem_cache *s) | 116 | static inline int kmem_cache_debug(struct kmem_cache *s) |
119 | { | 117 | { |
120 | #ifdef CONFIG_SLUB_DEBUG | 118 | #ifdef CONFIG_SLUB_DEBUG |
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
179 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 177 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ |
180 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | 178 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ |
181 | 179 | ||
182 | static int kmem_size = sizeof(struct kmem_cache); | ||
183 | |||
184 | #ifdef CONFIG_SMP | 180 | #ifdef CONFIG_SMP |
185 | static struct notifier_block slab_notifier; | 181 | static struct notifier_block slab_notifier; |
186 | #endif | 182 | #endif |
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; | |||
205 | static int sysfs_slab_add(struct kmem_cache *); | 201 | static int sysfs_slab_add(struct kmem_cache *); |
206 | static int sysfs_slab_alias(struct kmem_cache *, const char *); | 202 | static int sysfs_slab_alias(struct kmem_cache *, const char *); |
207 | static void sysfs_slab_remove(struct kmem_cache *); | 203 | static void sysfs_slab_remove(struct kmem_cache *); |
208 | 204 | static void memcg_propagate_slab_attrs(struct kmem_cache *s); | |
209 | #else | 205 | #else |
210 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } | 206 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } |
211 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) | 207 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) |
212 | { return 0; } | 208 | { return 0; } |
213 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } | 209 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } |
214 | 210 | ||
211 | static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } | ||
215 | #endif | 212 | #endif |
216 | 213 | ||
217 | static inline void stat(const struct kmem_cache *s, enum stat_item si) | 214 | static inline void stat(const struct kmem_cache *s, enum stat_item si) |
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing( | |||
1092 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1089 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1093 | goto out; | 1090 | goto out; |
1094 | 1091 | ||
1095 | if (unlikely(s != page->slab)) { | 1092 | if (unlikely(s != page->slab_cache)) { |
1096 | if (!PageSlab(page)) { | 1093 | if (!PageSlab(page)) { |
1097 | slab_err(s, page, "Attempt to free object(0x%p) " | 1094 | slab_err(s, page, "Attempt to free object(0x%p) " |
1098 | "outside of slab", object); | 1095 | "outside of slab", object); |
1099 | } else if (!page->slab) { | 1096 | } else if (!page->slab_cache) { |
1100 | printk(KERN_ERR | 1097 | printk(KERN_ERR |
1101 | "SLUB <none>: no slab for object 0x%p.\n", | 1098 | "SLUB <none>: no slab for object 0x%p.\n", |
1102 | object); | 1099 | object); |
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1348 | void *start; | 1345 | void *start; |
1349 | void *last; | 1346 | void *last; |
1350 | void *p; | 1347 | void *p; |
1348 | int order; | ||
1351 | 1349 | ||
1352 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1350 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1353 | 1351 | ||
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1356 | if (!page) | 1354 | if (!page) |
1357 | goto out; | 1355 | goto out; |
1358 | 1356 | ||
1357 | order = compound_order(page); | ||
1359 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1358 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1360 | page->slab = s; | 1359 | memcg_bind_pages(s, order); |
1360 | page->slab_cache = s; | ||
1361 | __SetPageSlab(page); | 1361 | __SetPageSlab(page); |
1362 | if (page->pfmemalloc) | 1362 | if (page->pfmemalloc) |
1363 | SetPageSlabPfmemalloc(page); | 1363 | SetPageSlabPfmemalloc(page); |
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1365 | start = page_address(page); | 1365 | start = page_address(page); |
1366 | 1366 | ||
1367 | if (unlikely(s->flags & SLAB_POISON)) | 1367 | if (unlikely(s->flags & SLAB_POISON)) |
1368 | memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); | 1368 | memset(start, POISON_INUSE, PAGE_SIZE << order); |
1369 | 1369 | ||
1370 | last = start; | 1370 | last = start; |
1371 | for_each_object(p, s, start, page->objects) { | 1371 | for_each_object(p, s, start, page->objects) { |
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1406 | 1406 | ||
1407 | __ClearPageSlabPfmemalloc(page); | 1407 | __ClearPageSlabPfmemalloc(page); |
1408 | __ClearPageSlab(page); | 1408 | __ClearPageSlab(page); |
1409 | |||
1410 | memcg_release_pages(s, order); | ||
1409 | reset_page_mapcount(page); | 1411 | reset_page_mapcount(page); |
1410 | if (current->reclaim_state) | 1412 | if (current->reclaim_state) |
1411 | current->reclaim_state->reclaimed_slab += pages; | 1413 | current->reclaim_state->reclaimed_slab += pages; |
1412 | __free_pages(page, order); | 1414 | __free_memcg_kmem_pages(page, order); |
1413 | } | 1415 | } |
1414 | 1416 | ||
1415 | #define need_reserve_slab_rcu \ | 1417 | #define need_reserve_slab_rcu \ |
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h) | |||
1424 | else | 1426 | else |
1425 | page = container_of((struct list_head *)h, struct page, lru); | 1427 | page = container_of((struct list_head *)h, struct page, lru); |
1426 | 1428 | ||
1427 | __free_slab(page->slab, page); | 1429 | __free_slab(page->slab_cache, page); |
1428 | } | 1430 | } |
1429 | 1431 | ||
1430 | static void free_slab(struct kmem_cache *s, struct page *page) | 1432 | static void free_slab(struct kmem_cache *s, struct page *page) |
@@ -1872,12 +1874,14 @@ redo: | |||
1872 | /* | 1874 | /* |
1873 | * Unfreeze all the cpu partial slabs. | 1875 | * Unfreeze all the cpu partial slabs. |
1874 | * | 1876 | * |
1875 | * This function must be called with interrupt disabled. | 1877 | * This function must be called with interrupts disabled |
1878 | * for the cpu using c (or some other guarantee must be there | ||
1879 | * to guarantee no concurrent accesses). | ||
1876 | */ | 1880 | */ |
1877 | static void unfreeze_partials(struct kmem_cache *s) | 1881 | static void unfreeze_partials(struct kmem_cache *s, |
1882 | struct kmem_cache_cpu *c) | ||
1878 | { | 1883 | { |
1879 | struct kmem_cache_node *n = NULL, *n2 = NULL; | 1884 | struct kmem_cache_node *n = NULL, *n2 = NULL; |
1880 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); | ||
1881 | struct page *page, *discard_page = NULL; | 1885 | struct page *page, *discard_page = NULL; |
1882 | 1886 | ||
1883 | while ((page = c->partial)) { | 1887 | while ((page = c->partial)) { |
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
1963 | * set to the per node partial list. | 1967 | * set to the per node partial list. |
1964 | */ | 1968 | */ |
1965 | local_irq_save(flags); | 1969 | local_irq_save(flags); |
1966 | unfreeze_partials(s); | 1970 | unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); |
1967 | local_irq_restore(flags); | 1971 | local_irq_restore(flags); |
1968 | oldpage = NULL; | 1972 | oldpage = NULL; |
1969 | pobjects = 0; | 1973 | pobjects = 0; |
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | |||
2006 | if (c->page) | 2010 | if (c->page) |
2007 | flush_slab(s, c); | 2011 | flush_slab(s, c); |
2008 | 2012 | ||
2009 | unfreeze_partials(s); | 2013 | unfreeze_partials(s, c); |
2010 | } | 2014 | } |
2011 | } | 2015 | } |
2012 | 2016 | ||
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, | |||
2325 | if (slab_pre_alloc_hook(s, gfpflags)) | 2329 | if (slab_pre_alloc_hook(s, gfpflags)) |
2326 | return NULL; | 2330 | return NULL; |
2327 | 2331 | ||
2332 | s = memcg_kmem_get_cache(s, gfpflags); | ||
2328 | redo: | 2333 | redo: |
2329 | 2334 | ||
2330 | /* | 2335 | /* |
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2459 | void *prior; | 2464 | void *prior; |
2460 | void **object = (void *)x; | 2465 | void **object = (void *)x; |
2461 | int was_frozen; | 2466 | int was_frozen; |
2462 | int inuse; | ||
2463 | struct page new; | 2467 | struct page new; |
2464 | unsigned long counters; | 2468 | unsigned long counters; |
2465 | struct kmem_cache_node *n = NULL; | 2469 | struct kmem_cache_node *n = NULL; |
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2472 | return; | 2476 | return; |
2473 | 2477 | ||
2474 | do { | 2478 | do { |
2479 | if (unlikely(n)) { | ||
2480 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2481 | n = NULL; | ||
2482 | } | ||
2475 | prior = page->freelist; | 2483 | prior = page->freelist; |
2476 | counters = page->counters; | 2484 | counters = page->counters; |
2477 | set_freepointer(s, object, prior); | 2485 | set_freepointer(s, object, prior); |
2478 | new.counters = counters; | 2486 | new.counters = counters; |
2479 | was_frozen = new.frozen; | 2487 | was_frozen = new.frozen; |
2480 | new.inuse--; | 2488 | new.inuse--; |
2481 | if ((!new.inuse || !prior) && !was_frozen && !n) { | 2489 | if ((!new.inuse || !prior) && !was_frozen) { |
2482 | 2490 | ||
2483 | if (!kmem_cache_debug(s) && !prior) | 2491 | if (!kmem_cache_debug(s) && !prior) |
2484 | 2492 | ||
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2503 | 2511 | ||
2504 | } | 2512 | } |
2505 | } | 2513 | } |
2506 | inuse = new.inuse; | ||
2507 | 2514 | ||
2508 | } while (!cmpxchg_double_slab(s, page, | 2515 | } while (!cmpxchg_double_slab(s, page, |
2509 | prior, counters, | 2516 | prior, counters, |
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2529 | return; | 2536 | return; |
2530 | } | 2537 | } |
2531 | 2538 | ||
2539 | if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) | ||
2540 | goto slab_empty; | ||
2541 | |||
2532 | /* | 2542 | /* |
2533 | * was_frozen may have been set after we acquired the list_lock in | 2543 | * Objects left in the slab. If it was not on the partial list before |
2534 | * an earlier loop. So we need to check it here again. | 2544 | * then add it. |
2535 | */ | 2545 | */ |
2536 | if (was_frozen) | 2546 | if (kmem_cache_debug(s) && unlikely(!prior)) { |
2537 | stat(s, FREE_FROZEN); | 2547 | remove_full(s, page); |
2538 | else { | 2548 | add_partial(n, page, DEACTIVATE_TO_TAIL); |
2539 | if (unlikely(!inuse && n->nr_partial > s->min_partial)) | 2549 | stat(s, FREE_ADD_PARTIAL); |
2540 | goto slab_empty; | ||
2541 | |||
2542 | /* | ||
2543 | * Objects left in the slab. If it was not on the partial list before | ||
2544 | * then add it. | ||
2545 | */ | ||
2546 | if (unlikely(!prior)) { | ||
2547 | remove_full(s, page); | ||
2548 | add_partial(n, page, DEACTIVATE_TO_TAIL); | ||
2549 | stat(s, FREE_ADD_PARTIAL); | ||
2550 | } | ||
2551 | } | 2550 | } |
2552 | spin_unlock_irqrestore(&n->list_lock, flags); | 2551 | spin_unlock_irqrestore(&n->list_lock, flags); |
2553 | return; | 2552 | return; |
@@ -2619,19 +2618,10 @@ redo: | |||
2619 | 2618 | ||
2620 | void kmem_cache_free(struct kmem_cache *s, void *x) | 2619 | void kmem_cache_free(struct kmem_cache *s, void *x) |
2621 | { | 2620 | { |
2622 | struct page *page; | 2621 | s = cache_from_obj(s, x); |
2623 | 2622 | if (!s) | |
2624 | page = virt_to_head_page(x); | ||
2625 | |||
2626 | if (kmem_cache_debug(s) && page->slab != s) { | ||
2627 | pr_err("kmem_cache_free: Wrong slab cache. %s but object" | ||
2628 | " is from %s\n", page->slab->name, s->name); | ||
2629 | WARN_ON_ONCE(1); | ||
2630 | return; | 2623 | return; |
2631 | } | 2624 | slab_free(s, virt_to_head_page(x), x, _RET_IP_); |
2632 | |||
2633 | slab_free(s, page, x, _RET_IP_); | ||
2634 | |||
2635 | trace_kmem_cache_free(_RET_IP_, x); | 2625 | trace_kmem_cache_free(_RET_IP_, x); |
2636 | } | 2626 | } |
2637 | EXPORT_SYMBOL(kmem_cache_free); | 2627 | EXPORT_SYMBOL(kmem_cache_free); |
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved) | |||
2769 | return -ENOSYS; | 2759 | return -ENOSYS; |
2770 | } | 2760 | } |
2771 | 2761 | ||
2772 | /* | ||
2773 | * Figure out what the alignment of the objects will be. | ||
2774 | */ | ||
2775 | static unsigned long calculate_alignment(unsigned long flags, | ||
2776 | unsigned long align, unsigned long size) | ||
2777 | { | ||
2778 | /* | ||
2779 | * If the user wants hardware cache aligned objects then follow that | ||
2780 | * suggestion if the object is sufficiently large. | ||
2781 | * | ||
2782 | * The hardware cache alignment cannot override the specified | ||
2783 | * alignment though. If that is greater then use it. | ||
2784 | */ | ||
2785 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
2786 | unsigned long ralign = cache_line_size(); | ||
2787 | while (size <= ralign / 2) | ||
2788 | ralign /= 2; | ||
2789 | align = max(align, ralign); | ||
2790 | } | ||
2791 | |||
2792 | if (align < ARCH_SLAB_MINALIGN) | ||
2793 | align = ARCH_SLAB_MINALIGN; | ||
2794 | |||
2795 | return ALIGN(align, sizeof(void *)); | ||
2796 | } | ||
2797 | |||
2798 | static void | 2762 | static void |
2799 | init_kmem_cache_node(struct kmem_cache_node *n) | 2763 | init_kmem_cache_node(struct kmem_cache_node *n) |
2800 | { | 2764 | { |
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2928 | { | 2892 | { |
2929 | unsigned long flags = s->flags; | 2893 | unsigned long flags = s->flags; |
2930 | unsigned long size = s->object_size; | 2894 | unsigned long size = s->object_size; |
2931 | unsigned long align = s->align; | ||
2932 | int order; | 2895 | int order; |
2933 | 2896 | ||
2934 | /* | 2897 | /* |
@@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3000 | #endif | 2963 | #endif |
3001 | 2964 | ||
3002 | /* | 2965 | /* |
3003 | * Determine the alignment based on various parameters that the | ||
3004 | * user specified and the dynamic determination of cache line size | ||
3005 | * on bootup. | ||
3006 | */ | ||
3007 | align = calculate_alignment(flags, align, s->object_size); | ||
3008 | s->align = align; | ||
3009 | |||
3010 | /* | ||
3011 | * SLUB stores one object immediately after another beginning from | 2966 | * SLUB stores one object immediately after another beginning from |
3012 | * offset 0. In order to align the objects we have to simply size | 2967 | * offset 0. In order to align the objects we have to simply size |
3013 | * each object to conform to the alignment. | 2968 | * each object to conform to the alignment. |
3014 | */ | 2969 | */ |
3015 | size = ALIGN(size, align); | 2970 | size = ALIGN(size, s->align); |
3016 | s->size = size; | 2971 | s->size = size; |
3017 | if (forced_order >= 0) | 2972 | if (forced_order >= 0) |
3018 | order = forced_order; | 2973 | order = forced_order; |
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3041 | s->max = s->oo; | 2996 | s->max = s->oo; |
3042 | 2997 | ||
3043 | return !!oo_objects(s->oo); | 2998 | return !!oo_objects(s->oo); |
3044 | |||
3045 | } | 2999 | } |
3046 | 3000 | ||
3047 | static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | 3001 | static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) |
@@ -3127,15 +3081,6 @@ error: | |||
3127 | return -EINVAL; | 3081 | return -EINVAL; |
3128 | } | 3082 | } |
3129 | 3083 | ||
3130 | /* | ||
3131 | * Determine the size of a slab object | ||
3132 | */ | ||
3133 | unsigned int kmem_cache_size(struct kmem_cache *s) | ||
3134 | { | ||
3135 | return s->object_size; | ||
3136 | } | ||
3137 | EXPORT_SYMBOL(kmem_cache_size); | ||
3138 | |||
3139 | static void list_slab_objects(struct kmem_cache *s, struct page *page, | 3084 | static void list_slab_objects(struct kmem_cache *s, struct page *page, |
3140 | const char *text) | 3085 | const char *text) |
3141 | { | 3086 | { |
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) | |||
3208 | { | 3153 | { |
3209 | int rc = kmem_cache_close(s); | 3154 | int rc = kmem_cache_close(s); |
3210 | 3155 | ||
3211 | if (!rc) | 3156 | if (!rc) { |
3157 | /* | ||
3158 | * We do the same lock strategy around sysfs_slab_add, see | ||
3159 | * __kmem_cache_create. Because this is pretty much the last | ||
3160 | * operation we do and the lock will be released shortly after | ||
3161 | * that in slab_common.c, we could just move sysfs_slab_remove | ||
3162 | * to a later point in common code. We should do that when we | ||
3163 | * have a common sysfs framework for all allocators. | ||
3164 | */ | ||
3165 | mutex_unlock(&slab_mutex); | ||
3212 | sysfs_slab_remove(s); | 3166 | sysfs_slab_remove(s); |
3167 | mutex_lock(&slab_mutex); | ||
3168 | } | ||
3213 | 3169 | ||
3214 | return rc; | 3170 | return rc; |
3215 | } | 3171 | } |
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str) | |||
3261 | 3217 | ||
3262 | __setup("slub_nomerge", setup_slub_nomerge); | 3218 | __setup("slub_nomerge", setup_slub_nomerge); |
3263 | 3219 | ||
3264 | static struct kmem_cache *__init create_kmalloc_cache(const char *name, | ||
3265 | int size, unsigned int flags) | ||
3266 | { | ||
3267 | struct kmem_cache *s; | ||
3268 | |||
3269 | s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
3270 | |||
3271 | s->name = name; | ||
3272 | s->size = s->object_size = size; | ||
3273 | s->align = ARCH_KMALLOC_MINALIGN; | ||
3274 | |||
3275 | /* | ||
3276 | * This function is called with IRQs disabled during early-boot on | ||
3277 | * single CPU so there's no need to take slab_mutex here. | ||
3278 | */ | ||
3279 | if (kmem_cache_open(s, flags)) | ||
3280 | goto panic; | ||
3281 | |||
3282 | list_add(&s->list, &slab_caches); | ||
3283 | return s; | ||
3284 | |||
3285 | panic: | ||
3286 | panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); | ||
3287 | return NULL; | ||
3288 | } | ||
3289 | |||
3290 | /* | 3220 | /* |
3291 | * Conversion table for small slabs sizes / 8 to the index in the | 3221 | * Conversion table for small slabs sizes / 8 to the index in the |
3292 | * kmalloc array. This is necessary for slabs < 192 since we have non power | 3222 | * kmalloc array. This is necessary for slabs < 192 since we have non power |
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3372 | struct page *page; | 3302 | struct page *page; |
3373 | void *ptr = NULL; | 3303 | void *ptr = NULL; |
3374 | 3304 | ||
3375 | flags |= __GFP_COMP | __GFP_NOTRACK; | 3305 | flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; |
3376 | page = alloc_pages_node(node, flags, get_order(size)); | 3306 | page = alloc_pages_node(node, flags, get_order(size)); |
3377 | if (page) | 3307 | if (page) |
3378 | ptr = page_address(page); | 3308 | ptr = page_address(page); |
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object) | |||
3424 | return PAGE_SIZE << compound_order(page); | 3354 | return PAGE_SIZE << compound_order(page); |
3425 | } | 3355 | } |
3426 | 3356 | ||
3427 | return slab_ksize(page->slab); | 3357 | return slab_ksize(page->slab_cache); |
3428 | } | 3358 | } |
3429 | EXPORT_SYMBOL(ksize); | 3359 | EXPORT_SYMBOL(ksize); |
3430 | 3360 | ||
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x) | |||
3449 | } | 3379 | } |
3450 | 3380 | ||
3451 | slab_lock(page); | 3381 | slab_lock(page); |
3452 | if (on_freelist(page->slab, page, object)) { | 3382 | if (on_freelist(page->slab_cache, page, object)) { |
3453 | object_err(page->slab, page, object, "Object is on free-list"); | 3383 | object_err(page->slab_cache, page, object, "Object is on free-list"); |
3454 | rv = false; | 3384 | rv = false; |
3455 | } else { | 3385 | } else { |
3456 | rv = true; | 3386 | rv = true; |
@@ -3478,10 +3408,10 @@ void kfree(const void *x) | |||
3478 | if (unlikely(!PageSlab(page))) { | 3408 | if (unlikely(!PageSlab(page))) { |
3479 | BUG_ON(!PageCompound(page)); | 3409 | BUG_ON(!PageCompound(page)); |
3480 | kmemleak_free(x); | 3410 | kmemleak_free(x); |
3481 | __free_pages(page, compound_order(page)); | 3411 | __free_memcg_kmem_pages(page, compound_order(page)); |
3482 | return; | 3412 | return; |
3483 | } | 3413 | } |
3484 | slab_free(page->slab, page, object, _RET_IP_); | 3414 | slab_free(page->slab_cache, page, object, _RET_IP_); |
3485 | } | 3415 | } |
3486 | EXPORT_SYMBOL(kfree); | 3416 | EXPORT_SYMBOL(kfree); |
3487 | 3417 | ||
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3573 | struct memory_notify *marg = arg; | 3503 | struct memory_notify *marg = arg; |
3574 | int offline_node; | 3504 | int offline_node; |
3575 | 3505 | ||
3576 | offline_node = marg->status_change_nid; | 3506 | offline_node = marg->status_change_nid_normal; |
3577 | 3507 | ||
3578 | /* | 3508 | /* |
3579 | * If the node still has available memory. we need kmem_cache_node | 3509 | * If the node still has available memory. we need kmem_cache_node |
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3606 | struct kmem_cache_node *n; | 3536 | struct kmem_cache_node *n; |
3607 | struct kmem_cache *s; | 3537 | struct kmem_cache *s; |
3608 | struct memory_notify *marg = arg; | 3538 | struct memory_notify *marg = arg; |
3609 | int nid = marg->status_change_nid; | 3539 | int nid = marg->status_change_nid_normal; |
3610 | int ret = 0; | 3540 | int ret = 0; |
3611 | 3541 | ||
3612 | /* | 3542 | /* |
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self, | |||
3676 | 3606 | ||
3677 | /* | 3607 | /* |
3678 | * Used for early kmem_cache structures that were allocated using | 3608 | * Used for early kmem_cache structures that were allocated using |
3679 | * the page allocator | 3609 | * the page allocator. Allocate them properly then fix up the pointers |
3610 | * that may be pointing to the wrong kmem_cache structure. | ||
3680 | */ | 3611 | */ |
3681 | 3612 | ||
3682 | static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) | 3613 | static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) |
3683 | { | 3614 | { |
3684 | int node; | 3615 | int node; |
3616 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
3685 | 3617 | ||
3686 | list_add(&s->list, &slab_caches); | 3618 | memcpy(s, static_cache, kmem_cache->object_size); |
3687 | s->refcount = -1; | ||
3688 | 3619 | ||
3689 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3620 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3690 | struct kmem_cache_node *n = get_node(s, node); | 3621 | struct kmem_cache_node *n = get_node(s, node); |
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) | |||
3692 | 3623 | ||
3693 | if (n) { | 3624 | if (n) { |
3694 | list_for_each_entry(p, &n->partial, lru) | 3625 | list_for_each_entry(p, &n->partial, lru) |
3695 | p->slab = s; | 3626 | p->slab_cache = s; |
3696 | 3627 | ||
3697 | #ifdef CONFIG_SLUB_DEBUG | 3628 | #ifdef CONFIG_SLUB_DEBUG |
3698 | list_for_each_entry(p, &n->full, lru) | 3629 | list_for_each_entry(p, &n->full, lru) |
3699 | p->slab = s; | 3630 | p->slab_cache = s; |
3700 | #endif | 3631 | #endif |
3701 | } | 3632 | } |
3702 | } | 3633 | } |
3634 | list_add(&s->list, &slab_caches); | ||
3635 | return s; | ||
3703 | } | 3636 | } |
3704 | 3637 | ||
3705 | void __init kmem_cache_init(void) | 3638 | void __init kmem_cache_init(void) |
3706 | { | 3639 | { |
3640 | static __initdata struct kmem_cache boot_kmem_cache, | ||
3641 | boot_kmem_cache_node; | ||
3707 | int i; | 3642 | int i; |
3708 | int caches = 0; | 3643 | int caches = 2; |
3709 | struct kmem_cache *temp_kmem_cache; | ||
3710 | int order; | ||
3711 | struct kmem_cache *temp_kmem_cache_node; | ||
3712 | unsigned long kmalloc_size; | ||
3713 | 3644 | ||
3714 | if (debug_guardpage_minorder()) | 3645 | if (debug_guardpage_minorder()) |
3715 | slub_max_order = 0; | 3646 | slub_max_order = 0; |
3716 | 3647 | ||
3717 | kmem_size = offsetof(struct kmem_cache, node) + | 3648 | kmem_cache_node = &boot_kmem_cache_node; |
3718 | nr_node_ids * sizeof(struct kmem_cache_node *); | 3649 | kmem_cache = &boot_kmem_cache; |
3719 | |||
3720 | /* Allocate two kmem_caches from the page allocator */ | ||
3721 | kmalloc_size = ALIGN(kmem_size, cache_line_size()); | ||
3722 | order = get_order(2 * kmalloc_size); | ||
3723 | kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); | ||
3724 | |||
3725 | /* | ||
3726 | * Must first have the slab cache available for the allocations of the | ||
3727 | * struct kmem_cache_node's. There is special bootstrap code in | ||
3728 | * kmem_cache_open for slab_state == DOWN. | ||
3729 | */ | ||
3730 | kmem_cache_node = (void *)kmem_cache + kmalloc_size; | ||
3731 | 3650 | ||
3732 | kmem_cache_node->name = "kmem_cache_node"; | 3651 | create_boot_cache(kmem_cache_node, "kmem_cache_node", |
3733 | kmem_cache_node->size = kmem_cache_node->object_size = | 3652 | sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); |
3734 | sizeof(struct kmem_cache_node); | ||
3735 | kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | ||
3736 | 3653 | ||
3737 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); | 3654 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); |
3738 | 3655 | ||
3739 | /* Able to allocate the per node structures */ | 3656 | /* Able to allocate the per node structures */ |
3740 | slab_state = PARTIAL; | 3657 | slab_state = PARTIAL; |
3741 | 3658 | ||
3742 | temp_kmem_cache = kmem_cache; | 3659 | create_boot_cache(kmem_cache, "kmem_cache", |
3743 | kmem_cache->name = "kmem_cache"; | 3660 | offsetof(struct kmem_cache, node) + |
3744 | kmem_cache->size = kmem_cache->object_size = kmem_size; | 3661 | nr_node_ids * sizeof(struct kmem_cache_node *), |
3745 | kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 3662 | SLAB_HWCACHE_ALIGN); |
3746 | 3663 | ||
3747 | kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | 3664 | kmem_cache = bootstrap(&boot_kmem_cache); |
3748 | memcpy(kmem_cache, temp_kmem_cache, kmem_size); | ||
3749 | 3665 | ||
3750 | /* | 3666 | /* |
3751 | * Allocate kmem_cache_node properly from the kmem_cache slab. | 3667 | * Allocate kmem_cache_node properly from the kmem_cache slab. |
3752 | * kmem_cache_node is separately allocated so no need to | 3668 | * kmem_cache_node is separately allocated so no need to |
3753 | * update any list pointers. | 3669 | * update any list pointers. |
3754 | */ | 3670 | */ |
3755 | temp_kmem_cache_node = kmem_cache_node; | 3671 | kmem_cache_node = bootstrap(&boot_kmem_cache_node); |
3756 | |||
3757 | kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | ||
3758 | memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); | ||
3759 | |||
3760 | kmem_cache_bootstrap_fixup(kmem_cache_node); | ||
3761 | |||
3762 | caches++; | ||
3763 | kmem_cache_bootstrap_fixup(kmem_cache); | ||
3764 | caches++; | ||
3765 | /* Free temporary boot structure */ | ||
3766 | free_pages((unsigned long)temp_kmem_cache, order); | ||
3767 | 3672 | ||
3768 | /* Now we can use the kmem_cache to allocate kmalloc slabs */ | 3673 | /* Now we can use the kmem_cache to allocate kmalloc slabs */ |
3769 | 3674 | ||
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3891 | return 0; | 3796 | return 0; |
3892 | } | 3797 | } |
3893 | 3798 | ||
3894 | static struct kmem_cache *find_mergeable(size_t size, | 3799 | static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, |
3895 | size_t align, unsigned long flags, const char *name, | 3800 | size_t align, unsigned long flags, const char *name, |
3896 | void (*ctor)(void *)) | 3801 | void (*ctor)(void *)) |
3897 | { | 3802 | { |
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3927 | if (s->size - size >= sizeof(void *)) | 3832 | if (s->size - size >= sizeof(void *)) |
3928 | continue; | 3833 | continue; |
3929 | 3834 | ||
3835 | if (!cache_match_memcg(s, memcg)) | ||
3836 | continue; | ||
3837 | |||
3930 | return s; | 3838 | return s; |
3931 | } | 3839 | } |
3932 | return NULL; | 3840 | return NULL; |
3933 | } | 3841 | } |
3934 | 3842 | ||
3935 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 3843 | struct kmem_cache * |
3936 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3844 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
3845 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
3937 | { | 3846 | { |
3938 | struct kmem_cache *s; | 3847 | struct kmem_cache *s; |
3939 | 3848 | ||
3940 | s = find_mergeable(size, align, flags, name, ctor); | 3849 | s = find_mergeable(memcg, size, align, flags, name, ctor); |
3941 | if (s) { | 3850 | if (s) { |
3942 | s->refcount++; | 3851 | s->refcount++; |
3943 | /* | 3852 | /* |
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) | |||
3964 | if (err) | 3873 | if (err) |
3965 | return err; | 3874 | return err; |
3966 | 3875 | ||
3876 | /* Mutex is not taken during early boot */ | ||
3877 | if (slab_state <= UP) | ||
3878 | return 0; | ||
3879 | |||
3880 | memcg_propagate_slab_attrs(s); | ||
3967 | mutex_unlock(&slab_mutex); | 3881 | mutex_unlock(&slab_mutex); |
3968 | err = sysfs_slab_add(s); | 3882 | err = sysfs_slab_add(s); |
3969 | mutex_lock(&slab_mutex); | 3883 | mutex_lock(&slab_mutex); |
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
5197 | return -EIO; | 5111 | return -EIO; |
5198 | 5112 | ||
5199 | err = attribute->store(s, buf, len); | 5113 | err = attribute->store(s, buf, len); |
5114 | #ifdef CONFIG_MEMCG_KMEM | ||
5115 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { | ||
5116 | int i; | ||
5117 | |||
5118 | mutex_lock(&slab_mutex); | ||
5119 | if (s->max_attr_size < len) | ||
5120 | s->max_attr_size = len; | ||
5200 | 5121 | ||
5122 | /* | ||
5123 | * This is a best effort propagation, so this function's return | ||
5124 | * value will be determined by the parent cache only. This is | ||
5125 | * basically because not all attributes will have a well | ||
5126 | * defined semantics for rollbacks - most of the actions will | ||
5127 | * have permanent effects. | ||
5128 | * | ||
5129 | * Returning the error value of any of the children that fail | ||
5130 | * is not 100 % defined, in the sense that users seeing the | ||
5131 | * error code won't be able to know anything about the state of | ||
5132 | * the cache. | ||
5133 | * | ||
5134 | * Only returning the error code for the parent cache at least | ||
5135 | * has well defined semantics. The cache being written to | ||
5136 | * directly either failed or succeeded, in which case we loop | ||
5137 | * through the descendants with best-effort propagation. | ||
5138 | */ | ||
5139 | for_each_memcg_cache_index(i) { | ||
5140 | struct kmem_cache *c = cache_from_memcg(s, i); | ||
5141 | if (c) | ||
5142 | attribute->store(c, buf, len); | ||
5143 | } | ||
5144 | mutex_unlock(&slab_mutex); | ||
5145 | } | ||
5146 | #endif | ||
5201 | return err; | 5147 | return err; |
5202 | } | 5148 | } |
5203 | 5149 | ||
5150 | static void memcg_propagate_slab_attrs(struct kmem_cache *s) | ||
5151 | { | ||
5152 | #ifdef CONFIG_MEMCG_KMEM | ||
5153 | int i; | ||
5154 | char *buffer = NULL; | ||
5155 | |||
5156 | if (!is_root_cache(s)) | ||
5157 | return; | ||
5158 | |||
5159 | /* | ||
5160 | * This mean this cache had no attribute written. Therefore, no point | ||
5161 | * in copying default values around | ||
5162 | */ | ||
5163 | if (!s->max_attr_size) | ||
5164 | return; | ||
5165 | |||
5166 | for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { | ||
5167 | char mbuf[64]; | ||
5168 | char *buf; | ||
5169 | struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); | ||
5170 | |||
5171 | if (!attr || !attr->store || !attr->show) | ||
5172 | continue; | ||
5173 | |||
5174 | /* | ||
5175 | * It is really bad that we have to allocate here, so we will | ||
5176 | * do it only as a fallback. If we actually allocate, though, | ||
5177 | * we can just use the allocated buffer until the end. | ||
5178 | * | ||
5179 | * Most of the slub attributes will tend to be very small in | ||
5180 | * size, but sysfs allows buffers up to a page, so they can | ||
5181 | * theoretically happen. | ||
5182 | */ | ||
5183 | if (buffer) | ||
5184 | buf = buffer; | ||
5185 | else if (s->max_attr_size < ARRAY_SIZE(mbuf)) | ||
5186 | buf = mbuf; | ||
5187 | else { | ||
5188 | buffer = (char *) get_zeroed_page(GFP_KERNEL); | ||
5189 | if (WARN_ON(!buffer)) | ||
5190 | continue; | ||
5191 | buf = buffer; | ||
5192 | } | ||
5193 | |||
5194 | attr->show(s->memcg_params->root_cache, buf); | ||
5195 | attr->store(s, buf, strlen(buf)); | ||
5196 | } | ||
5197 | |||
5198 | if (buffer) | ||
5199 | free_page((unsigned long)buffer); | ||
5200 | #endif | ||
5201 | } | ||
5202 | |||
5204 | static const struct sysfs_ops slab_sysfs_ops = { | 5203 | static const struct sysfs_ops slab_sysfs_ops = { |
5205 | .show = slab_attr_show, | 5204 | .show = slab_attr_show, |
5206 | .store = slab_attr_store, | 5205 | .store = slab_attr_store, |
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s) | |||
5257 | if (p != name + 1) | 5256 | if (p != name + 1) |
5258 | *p++ = '-'; | 5257 | *p++ = '-'; |
5259 | p += sprintf(p, "%07d", s->size); | 5258 | p += sprintf(p, "%07d", s->size); |
5259 | |||
5260 | #ifdef CONFIG_MEMCG_KMEM | ||
5261 | if (!is_root_cache(s)) | ||
5262 | p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); | ||
5263 | #endif | ||
5264 | |||
5260 | BUG_ON(p > name + ID_STR_LENGTH - 1); | 5265 | BUG_ON(p > name + ID_STR_LENGTH - 1); |
5261 | return name; | 5266 | return name; |
5262 | } | 5267 | } |
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5265 | { | 5270 | { |
5266 | int err; | 5271 | int err; |
5267 | const char *name; | 5272 | const char *name; |
5268 | int unmergeable; | 5273 | int unmergeable = slab_unmergeable(s); |
5269 | |||
5270 | if (slab_state < FULL) | ||
5271 | /* Defer until later */ | ||
5272 | return 0; | ||
5273 | 5274 | ||
5274 | unmergeable = slab_unmergeable(s); | ||
5275 | if (unmergeable) { | 5275 | if (unmergeable) { |
5276 | /* | 5276 | /* |
5277 | * Slabcache can never be merged so we can use the name proper. | 5277 | * Slabcache can never be merged so we can use the name proper. |
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init); | |||
5405 | * The /proc/slabinfo ABI | 5405 | * The /proc/slabinfo ABI |
5406 | */ | 5406 | */ |
5407 | #ifdef CONFIG_SLABINFO | 5407 | #ifdef CONFIG_SLABINFO |
5408 | static void print_slabinfo_header(struct seq_file *m) | 5408 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) |
5409 | { | ||
5410 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
5411 | seq_puts(m, "# name <active_objs> <num_objs> <object_size> " | ||
5412 | "<objperslab> <pagesperslab>"); | ||
5413 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
5414 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
5415 | seq_putc(m, '\n'); | ||
5416 | } | ||
5417 | |||
5418 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
5419 | { | ||
5420 | loff_t n = *pos; | ||
5421 | |||
5422 | mutex_lock(&slab_mutex); | ||
5423 | if (!n) | ||
5424 | print_slabinfo_header(m); | ||
5425 | |||
5426 | return seq_list_start(&slab_caches, *pos); | ||
5427 | } | ||
5428 | |||
5429 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
5430 | { | ||
5431 | return seq_list_next(p, &slab_caches, pos); | ||
5432 | } | ||
5433 | |||
5434 | static void s_stop(struct seq_file *m, void *p) | ||
5435 | { | ||
5436 | mutex_unlock(&slab_mutex); | ||
5437 | } | ||
5438 | |||
5439 | static int s_show(struct seq_file *m, void *p) | ||
5440 | { | 5409 | { |
5441 | unsigned long nr_partials = 0; | 5410 | unsigned long nr_partials = 0; |
5442 | unsigned long nr_slabs = 0; | 5411 | unsigned long nr_slabs = 0; |
5443 | unsigned long nr_inuse = 0; | ||
5444 | unsigned long nr_objs = 0; | 5412 | unsigned long nr_objs = 0; |
5445 | unsigned long nr_free = 0; | 5413 | unsigned long nr_free = 0; |
5446 | struct kmem_cache *s; | ||
5447 | int node; | 5414 | int node; |
5448 | 5415 | ||
5449 | s = list_entry(p, struct kmem_cache, list); | ||
5450 | |||
5451 | for_each_online_node(node) { | 5416 | for_each_online_node(node) { |
5452 | struct kmem_cache_node *n = get_node(s, node); | 5417 | struct kmem_cache_node *n = get_node(s, node); |
5453 | 5418 | ||
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p) | |||
5460 | nr_free += count_partial(n, count_free); | 5425 | nr_free += count_partial(n, count_free); |
5461 | } | 5426 | } |
5462 | 5427 | ||
5463 | nr_inuse = nr_objs - nr_free; | 5428 | sinfo->active_objs = nr_objs - nr_free; |
5464 | 5429 | sinfo->num_objs = nr_objs; | |
5465 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, | 5430 | sinfo->active_slabs = nr_slabs; |
5466 | nr_objs, s->size, oo_objects(s->oo), | 5431 | sinfo->num_slabs = nr_slabs; |
5467 | (1 << oo_order(s->oo))); | 5432 | sinfo->objects_per_slab = oo_objects(s->oo); |
5468 | seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); | 5433 | sinfo->cache_order = oo_order(s->oo); |
5469 | seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, | ||
5470 | 0UL); | ||
5471 | seq_putc(m, '\n'); | ||
5472 | return 0; | ||
5473 | } | 5434 | } |
5474 | 5435 | ||
5475 | static const struct seq_operations slabinfo_op = { | 5436 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) |
5476 | .start = s_start, | ||
5477 | .next = s_next, | ||
5478 | .stop = s_stop, | ||
5479 | .show = s_show, | ||
5480 | }; | ||
5481 | |||
5482 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
5483 | { | 5437 | { |
5484 | return seq_open(file, &slabinfo_op); | ||
5485 | } | 5438 | } |
5486 | 5439 | ||
5487 | static const struct file_operations proc_slabinfo_operations = { | 5440 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
5488 | .open = slabinfo_open, | 5441 | size_t count, loff_t *ppos) |
5489 | .read = seq_read, | ||
5490 | .llseek = seq_lseek, | ||
5491 | .release = seq_release, | ||
5492 | }; | ||
5493 | |||
5494 | static int __init slab_proc_init(void) | ||
5495 | { | 5442 | { |
5496 | proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); | 5443 | return -EIO; |
5497 | return 0; | ||
5498 | } | 5444 | } |
5499 | module_init(slab_proc_init); | ||
5500 | #endif /* CONFIG_SLABINFO */ | 5445 | #endif /* CONFIG_SLABINFO */ |
diff --git a/mm/sparse.c b/mm/sparse.c index fac95f2888f2..6b5fb762e2ca 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
617 | { | 617 | { |
618 | return; /* XXX: Not implemented yet */ | 618 | return; /* XXX: Not implemented yet */ |
619 | } | 619 | } |
620 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) | 620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) |
621 | { | 621 | { |
622 | } | 622 | } |
623 | #else | 623 | #else |
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | |||
638 | got_map_page: | 638 | got_map_page: |
639 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); | 639 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); |
640 | got_map_ptr: | 640 | got_map_ptr: |
641 | memset(ret, 0, memmap_size); | ||
642 | 641 | ||
643 | return ret; | 642 | return ret; |
644 | } | 643 | } |
@@ -658,10 +657,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
658 | get_order(sizeof(struct page) * nr_pages)); | 657 | get_order(sizeof(struct page) * nr_pages)); |
659 | } | 658 | } |
660 | 659 | ||
661 | static void free_map_bootmem(struct page *page, unsigned long nr_pages) | 660 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) |
662 | { | 661 | { |
663 | unsigned long maps_section_nr, removing_section_nr, i; | 662 | unsigned long maps_section_nr, removing_section_nr, i; |
664 | unsigned long magic; | 663 | unsigned long magic; |
664 | struct page *page = virt_to_page(memmap); | ||
665 | 665 | ||
666 | for (i = 0; i < nr_pages; i++, page++) { | 666 | for (i = 0; i < nr_pages; i++, page++) { |
667 | magic = (unsigned long) page->lru.next; | 667 | magic = (unsigned long) page->lru.next; |
@@ -710,13 +710,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
710 | */ | 710 | */ |
711 | 711 | ||
712 | if (memmap) { | 712 | if (memmap) { |
713 | struct page *memmap_page; | ||
714 | memmap_page = virt_to_page(memmap); | ||
715 | |||
716 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) | 713 | nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) |
717 | >> PAGE_SHIFT; | 714 | >> PAGE_SHIFT; |
718 | 715 | ||
719 | free_map_bootmem(memmap_page, nr_pages); | 716 | free_map_bootmem(memmap, nr_pages); |
720 | } | 717 | } |
721 | } | 718 | } |
722 | 719 | ||
@@ -760,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
760 | goto out; | 757 | goto out; |
761 | } | 758 | } |
762 | 759 | ||
760 | memset(memmap, 0, sizeof(struct page) * nr_pages); | ||
761 | |||
763 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 762 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
764 | 763 | ||
765 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); | 764 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); |
@@ -773,6 +772,27 @@ out: | |||
773 | return ret; | 772 | return ret; |
774 | } | 773 | } |
775 | 774 | ||
775 | #ifdef CONFIG_MEMORY_FAILURE | ||
776 | static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | ||
777 | { | ||
778 | int i; | ||
779 | |||
780 | if (!memmap) | ||
781 | return; | ||
782 | |||
783 | for (i = 0; i < PAGES_PER_SECTION; i++) { | ||
784 | if (PageHWPoison(&memmap[i])) { | ||
785 | atomic_long_sub(1, &mce_bad_pages); | ||
786 | ClearPageHWPoison(&memmap[i]); | ||
787 | } | ||
788 | } | ||
789 | } | ||
790 | #else | ||
791 | static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | ||
792 | { | ||
793 | } | ||
794 | #endif | ||
795 | |||
776 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 796 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
777 | { | 797 | { |
778 | struct page *memmap = NULL; | 798 | struct page *memmap = NULL; |
@@ -786,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
786 | ms->pageblock_flags = NULL; | 806 | ms->pageblock_flags = NULL; |
787 | } | 807 | } |
788 | 808 | ||
809 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | ||
789 | free_section_usemap(memmap, usemap); | 810 | free_section_usemap(memmap, usemap); |
790 | } | 811 | } |
791 | #endif | 812 | #endif |
diff --git a/mm/swapfile.c b/mm/swapfile.c index f91a25547ffe..e97a0e5aea91 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1443 | return generic_swapfile_activate(sis, swap_file, span); | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1444 | } | 1444 | } |
1445 | 1445 | ||
1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void _enable_swap_info(struct swap_info_struct *p, int prio, |
1447 | unsigned char *swap_map, | 1447 | unsigned char *swap_map, |
1448 | unsigned long *frontswap_map) | 1448 | unsigned long *frontswap_map) |
1449 | { | 1449 | { |
1450 | int i, prev; | 1450 | int i, prev; |
1451 | 1451 | ||
1452 | spin_lock(&swap_lock); | ||
1453 | if (prio >= 0) | 1452 | if (prio >= 0) |
1454 | p->prio = prio; | 1453 | p->prio = prio; |
1455 | else | 1454 | else |
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1472 | swap_list.head = swap_list.next = p->type; | 1471 | swap_list.head = swap_list.next = p->type; |
1473 | else | 1472 | else |
1474 | swap_info[prev]->next = p->type; | 1473 | swap_info[prev]->next = p->type; |
1474 | } | ||
1475 | |||
1476 | static void enable_swap_info(struct swap_info_struct *p, int prio, | ||
1477 | unsigned char *swap_map, | ||
1478 | unsigned long *frontswap_map) | ||
1479 | { | ||
1480 | spin_lock(&swap_lock); | ||
1481 | _enable_swap_info(p, prio, swap_map, frontswap_map); | ||
1475 | frontswap_init(p->type); | 1482 | frontswap_init(p->type); |
1476 | spin_unlock(&swap_lock); | 1483 | spin_unlock(&swap_lock); |
1477 | } | 1484 | } |
1478 | 1485 | ||
1486 | static void reinsert_swap_info(struct swap_info_struct *p) | ||
1487 | { | ||
1488 | spin_lock(&swap_lock); | ||
1489 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | ||
1490 | spin_unlock(&swap_lock); | ||
1491 | } | ||
1492 | |||
1479 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1493 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1480 | { | 1494 | { |
1481 | struct swap_info_struct *p = NULL; | 1495 | struct swap_info_struct *p = NULL; |
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1484 | struct address_space *mapping; | 1498 | struct address_space *mapping; |
1485 | struct inode *inode; | 1499 | struct inode *inode; |
1486 | struct filename *pathname; | 1500 | struct filename *pathname; |
1487 | int oom_score_adj; | ||
1488 | int i, type, prev; | 1501 | int i, type, prev; |
1489 | int err; | 1502 | int err; |
1490 | 1503 | ||
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1543 | p->flags &= ~SWP_WRITEOK; | 1556 | p->flags &= ~SWP_WRITEOK; |
1544 | spin_unlock(&swap_lock); | 1557 | spin_unlock(&swap_lock); |
1545 | 1558 | ||
1546 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1559 | set_current_oom_origin(); |
1547 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1560 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
1548 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1561 | clear_current_oom_origin(); |
1549 | 1562 | ||
1550 | if (err) { | 1563 | if (err) { |
1551 | /* | ||
1552 | * reading p->prio and p->swap_map outside the lock is | ||
1553 | * safe here because only sys_swapon and sys_swapoff | ||
1554 | * change them, and there can be no other sys_swapon or | ||
1555 | * sys_swapoff for this swap_info_struct at this point. | ||
1556 | */ | ||
1557 | /* re-insert swap space back into swap_list */ | 1564 | /* re-insert swap space back into swap_list */ |
1558 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | 1565 | reinsert_swap_info(p); |
1559 | goto out_dput; | 1566 | goto out_dput; |
1560 | } | 1567 | } |
1561 | 1568 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index d51ce92d6e83..c75b736e54b7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -577,29 +577,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize) | |||
577 | EXPORT_SYMBOL(truncate_setsize); | 577 | EXPORT_SYMBOL(truncate_setsize); |
578 | 578 | ||
579 | /** | 579 | /** |
580 | * vmtruncate - unmap mappings "freed" by truncate() syscall | ||
581 | * @inode: inode of the file used | ||
582 | * @newsize: file offset to start truncating | ||
583 | * | ||
584 | * This function is deprecated and truncate_setsize or truncate_pagecache | ||
585 | * should be used instead, together with filesystem specific block truncation. | ||
586 | */ | ||
587 | int vmtruncate(struct inode *inode, loff_t newsize) | ||
588 | { | ||
589 | int error; | ||
590 | |||
591 | error = inode_newsize_ok(inode, newsize); | ||
592 | if (error) | ||
593 | return error; | ||
594 | |||
595 | truncate_setsize(inode, newsize); | ||
596 | if (inode->i_op->truncate) | ||
597 | inode->i_op->truncate(inode); | ||
598 | return 0; | ||
599 | } | ||
600 | EXPORT_SYMBOL(vmtruncate); | ||
601 | |||
602 | /** | ||
603 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched | 580 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched |
604 | * @inode: inode | 581 | * @inode: inode |
605 | * @lstart: offset of beginning of hole | 582 | * @lstart: offset of beginning of hole |
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc); | |||
152 | * | 152 | * |
153 | * The contents of the object pointed to are preserved up to the | 153 | * The contents of the object pointed to are preserved up to the |
154 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | 154 | * lesser of the new and old sizes. If @p is %NULL, krealloc() |
155 | * behaves exactly like kmalloc(). If @size is 0 and @p is not a | 155 | * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a |
156 | * %NULL pointer, the object pointed to is freed. | 156 | * %NULL pointer, the object pointed to is freed. |
157 | */ | 157 | */ |
158 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 158 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78e08300db21..5123a169ab7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p) | |||
2550 | 2550 | ||
2551 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) | 2551 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) |
2552 | { | 2552 | { |
2553 | if (NUMA_BUILD) { | 2553 | if (IS_ENABLED(CONFIG_NUMA)) { |
2554 | unsigned int nr, *counters = m->private; | 2554 | unsigned int nr, *counters = m->private; |
2555 | 2555 | ||
2556 | if (!counters) | 2556 | if (!counters) |
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file) | |||
2615 | unsigned int *ptr = NULL; | 2615 | unsigned int *ptr = NULL; |
2616 | int ret; | 2616 | int ret; |
2617 | 2617 | ||
2618 | if (NUMA_BUILD) { | 2618 | if (IS_ENABLED(CONFIG_NUMA)) { |
2619 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | 2619 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); |
2620 | if (ptr == NULL) | 2620 | if (ptr == NULL) |
2621 | return -ENOMEM; | 2621 | return -ENOMEM; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 48550c66f1f2..196709f5ee58 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) | |||
1177 | } | 1177 | } |
1178 | 1178 | ||
1179 | /* | 1179 | /* |
1180 | * Are there way too many processes in the direct reclaim path already? | 1180 | * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and |
1181 | * then get resheduled. When there are massive number of tasks doing page | ||
1182 | * allocation, such sleeping direct reclaimers may keep piling up on each CPU, | ||
1183 | * the LRU list will go small and be scanned faster than necessary, leading to | ||
1184 | * unnecessary swapping, thrashing and OOM. | ||
1181 | */ | 1185 | */ |
1182 | static int too_many_isolated(struct zone *zone, int file, | 1186 | static int too_many_isolated(struct zone *zone, int file, |
1183 | struct scan_control *sc) | 1187 | struct scan_control *sc) |
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1198 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); | 1202 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); |
1199 | } | 1203 | } |
1200 | 1204 | ||
1205 | /* | ||
1206 | * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they | ||
1207 | * won't get blocked by normal direct-reclaimers, forming a circular | ||
1208 | * deadlock. | ||
1209 | */ | ||
1210 | if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) | ||
1211 | inactive >>= 3; | ||
1212 | |||
1201 | return isolated > inactive; | 1213 | return isolated > inactive; |
1202 | } | 1214 | } |
1203 | 1215 | ||
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1679 | 1691 | ||
1680 | if (global_reclaim(sc)) { | 1692 | if (global_reclaim(sc)) { |
1681 | free = zone_page_state(zone, NR_FREE_PAGES); | 1693 | free = zone_page_state(zone, NR_FREE_PAGES); |
1682 | /* If we have very few page cache pages, | ||
1683 | force-scan anon pages. */ | ||
1684 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1694 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1695 | /* | ||
1696 | * If we have very few page cache pages, force-scan | ||
1697 | * anon pages. | ||
1698 | */ | ||
1685 | fraction[0] = 1; | 1699 | fraction[0] = 1; |
1686 | fraction[1] = 0; | 1700 | fraction[1] = 0; |
1687 | denominator = 1; | 1701 | denominator = 1; |
1688 | goto out; | 1702 | goto out; |
1703 | } else if (!inactive_file_is_low_global(zone)) { | ||
1704 | /* | ||
1705 | * There is enough inactive page cache, do not | ||
1706 | * reclaim anything from the working set right now. | ||
1707 | */ | ||
1708 | fraction[0] = 0; | ||
1709 | fraction[1] = 1; | ||
1710 | denominator = 1; | ||
1711 | goto out; | ||
1689 | } | 1712 | } |
1690 | } | 1713 | } |
1691 | 1714 | ||
@@ -1752,7 +1775,7 @@ out: | |||
1752 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | 1775 | /* Use reclaim/compaction for costly allocs or under memory pressure */ |
1753 | static bool in_reclaim_compaction(struct scan_control *sc) | 1776 | static bool in_reclaim_compaction(struct scan_control *sc) |
1754 | { | 1777 | { |
1755 | if (COMPACTION_BUILD && sc->order && | 1778 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && |
1756 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || | 1779 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || |
1757 | sc->priority < DEF_PRIORITY - 2)) | 1780 | sc->priority < DEF_PRIORITY - 2)) |
1758 | return true; | 1781 | return true; |
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2005 | if (zone->all_unreclaimable && | 2028 | if (zone->all_unreclaimable && |
2006 | sc->priority != DEF_PRIORITY) | 2029 | sc->priority != DEF_PRIORITY) |
2007 | continue; /* Let kswapd poll it */ | 2030 | continue; /* Let kswapd poll it */ |
2008 | if (COMPACTION_BUILD) { | 2031 | if (IS_ENABLED(CONFIG_COMPACTION)) { |
2009 | /* | 2032 | /* |
2010 | * If we already have plenty of memory free for | 2033 | * If we already have plenty of memory free for |
2011 | * compaction in this zone, don't free any more. | 2034 | * compaction in this zone, don't free any more. |
@@ -2207,9 +2230,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2207 | * Throttle direct reclaimers if backing storage is backed by the network | 2230 | * Throttle direct reclaimers if backing storage is backed by the network |
2208 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously | 2231 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously |
2209 | * depleted. kswapd will continue to make progress and wake the processes | 2232 | * depleted. kswapd will continue to make progress and wake the processes |
2210 | * when the low watermark is reached | 2233 | * when the low watermark is reached. |
2234 | * | ||
2235 | * Returns true if a fatal signal was delivered during throttling. If this | ||
2236 | * happens, the page allocator should not consider triggering the OOM killer. | ||
2211 | */ | 2237 | */ |
2212 | static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | 2238 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, |
2213 | nodemask_t *nodemask) | 2239 | nodemask_t *nodemask) |
2214 | { | 2240 | { |
2215 | struct zone *zone; | 2241 | struct zone *zone; |
@@ -2224,13 +2250,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2224 | * processes to block on log_wait_commit(). | 2250 | * processes to block on log_wait_commit(). |
2225 | */ | 2251 | */ |
2226 | if (current->flags & PF_KTHREAD) | 2252 | if (current->flags & PF_KTHREAD) |
2227 | return; | 2253 | goto out; |
2254 | |||
2255 | /* | ||
2256 | * If a fatal signal is pending, this process should not throttle. | ||
2257 | * It should return quickly so it can exit and free its memory | ||
2258 | */ | ||
2259 | if (fatal_signal_pending(current)) | ||
2260 | goto out; | ||
2228 | 2261 | ||
2229 | /* Check if the pfmemalloc reserves are ok */ | 2262 | /* Check if the pfmemalloc reserves are ok */ |
2230 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | 2263 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); |
2231 | pgdat = zone->zone_pgdat; | 2264 | pgdat = zone->zone_pgdat; |
2232 | if (pfmemalloc_watermark_ok(pgdat)) | 2265 | if (pfmemalloc_watermark_ok(pgdat)) |
2233 | return; | 2266 | goto out; |
2234 | 2267 | ||
2235 | /* Account for the throttling */ | 2268 | /* Account for the throttling */ |
2236 | count_vm_event(PGSCAN_DIRECT_THROTTLE); | 2269 | count_vm_event(PGSCAN_DIRECT_THROTTLE); |
@@ -2246,12 +2279,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2246 | if (!(gfp_mask & __GFP_FS)) { | 2279 | if (!(gfp_mask & __GFP_FS)) { |
2247 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | 2280 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, |
2248 | pfmemalloc_watermark_ok(pgdat), HZ); | 2281 | pfmemalloc_watermark_ok(pgdat), HZ); |
2249 | return; | 2282 | |
2283 | goto check_pending; | ||
2250 | } | 2284 | } |
2251 | 2285 | ||
2252 | /* Throttle until kswapd wakes the process */ | 2286 | /* Throttle until kswapd wakes the process */ |
2253 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | 2287 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, |
2254 | pfmemalloc_watermark_ok(pgdat)); | 2288 | pfmemalloc_watermark_ok(pgdat)); |
2289 | |||
2290 | check_pending: | ||
2291 | if (fatal_signal_pending(current)) | ||
2292 | return true; | ||
2293 | |||
2294 | out: | ||
2295 | return false; | ||
2255 | } | 2296 | } |
2256 | 2297 | ||
2257 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2298 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
@@ -2273,13 +2314,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2273 | .gfp_mask = sc.gfp_mask, | 2314 | .gfp_mask = sc.gfp_mask, |
2274 | }; | 2315 | }; |
2275 | 2316 | ||
2276 | throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | ||
2277 | |||
2278 | /* | 2317 | /* |
2279 | * Do not enter reclaim if fatal signal is pending. 1 is returned so | 2318 | * Do not enter reclaim if fatal signal was delivered while throttled. |
2280 | * that the page allocator does not consider triggering OOM | 2319 | * 1 is returned so that the page allocator does not OOM kill at this |
2320 | * point. | ||
2281 | */ | 2321 | */ |
2282 | if (fatal_signal_pending(current)) | 2322 | if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) |
2283 | return 1; | 2323 | return 1; |
2284 | 2324 | ||
2285 | trace_mm_vmscan_direct_reclaim_begin(order, | 2325 | trace_mm_vmscan_direct_reclaim_begin(order, |
@@ -2397,13 +2437,31 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) | |||
2397 | } while (memcg); | 2437 | } while (memcg); |
2398 | } | 2438 | } |
2399 | 2439 | ||
2440 | static bool zone_balanced(struct zone *zone, int order, | ||
2441 | unsigned long balance_gap, int classzone_idx) | ||
2442 | { | ||
2443 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + | ||
2444 | balance_gap, classzone_idx, 0)) | ||
2445 | return false; | ||
2446 | |||
2447 | if (IS_ENABLED(CONFIG_COMPACTION) && order && | ||
2448 | !compaction_suitable(zone, order)) | ||
2449 | return false; | ||
2450 | |||
2451 | return true; | ||
2452 | } | ||
2453 | |||
2400 | /* | 2454 | /* |
2401 | * pgdat_balanced is used when checking if a node is balanced for high-order | 2455 | * pgdat_balanced() is used when checking if a node is balanced. |
2402 | * allocations. Only zones that meet watermarks and are in a zone allowed | 2456 | * |
2403 | * by the callers classzone_idx are added to balanced_pages. The total of | 2457 | * For order-0, all zones must be balanced! |
2404 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | 2458 | * |
2405 | * for the node to be considered balanced. Forcing all zones to be balanced | 2459 | * For high-order allocations only zones that meet watermarks and are in a |
2406 | * for high orders can cause excessive reclaim when there are imbalanced zones. | 2460 | * zone allowed by the callers classzone_idx are added to balanced_pages. The |
2461 | * total of balanced pages must be at least 25% of the zones allowed by | ||
2462 | * classzone_idx for the node to be considered balanced. Forcing all zones to | ||
2463 | * be balanced for high orders can cause excessive reclaim when there are | ||
2464 | * imbalanced zones. | ||
2407 | * The choice of 25% is due to | 2465 | * The choice of 25% is due to |
2408 | * o a 16M DMA zone that is balanced will not balance a zone on any | 2466 | * o a 16M DMA zone that is balanced will not balance a zone on any |
2409 | * reasonable sized machine | 2467 | * reasonable sized machine |
@@ -2413,17 +2471,43 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) | |||
2413 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | 2471 | * Similarly, on x86-64 the Normal zone would need to be at least 1G |
2414 | * to balance a node on its own. These seemed like reasonable ratios. | 2472 | * to balance a node on its own. These seemed like reasonable ratios. |
2415 | */ | 2473 | */ |
2416 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | 2474 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) |
2417 | int classzone_idx) | ||
2418 | { | 2475 | { |
2419 | unsigned long present_pages = 0; | 2476 | unsigned long present_pages = 0; |
2477 | unsigned long balanced_pages = 0; | ||
2420 | int i; | 2478 | int i; |
2421 | 2479 | ||
2422 | for (i = 0; i <= classzone_idx; i++) | 2480 | /* Check the watermark levels */ |
2423 | present_pages += pgdat->node_zones[i].present_pages; | 2481 | for (i = 0; i <= classzone_idx; i++) { |
2482 | struct zone *zone = pgdat->node_zones + i; | ||
2483 | |||
2484 | if (!populated_zone(zone)) | ||
2485 | continue; | ||
2486 | |||
2487 | present_pages += zone->present_pages; | ||
2424 | 2488 | ||
2425 | /* A special case here: if zone has no page, we think it's balanced */ | 2489 | /* |
2426 | return balanced_pages >= (present_pages >> 2); | 2490 | * A special case here: |
2491 | * | ||
2492 | * balance_pgdat() skips over all_unreclaimable after | ||
2493 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
2494 | * they must be considered balanced here as well! | ||
2495 | */ | ||
2496 | if (zone->all_unreclaimable) { | ||
2497 | balanced_pages += zone->present_pages; | ||
2498 | continue; | ||
2499 | } | ||
2500 | |||
2501 | if (zone_balanced(zone, order, 0, i)) | ||
2502 | balanced_pages += zone->present_pages; | ||
2503 | else if (!order) | ||
2504 | return false; | ||
2505 | } | ||
2506 | |||
2507 | if (order) | ||
2508 | return balanced_pages >= (present_pages >> 2); | ||
2509 | else | ||
2510 | return true; | ||
2427 | } | 2511 | } |
2428 | 2512 | ||
2429 | /* | 2513 | /* |
@@ -2435,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2435 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | 2519 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, |
2436 | int classzone_idx) | 2520 | int classzone_idx) |
2437 | { | 2521 | { |
2438 | int i; | ||
2439 | unsigned long balanced = 0; | ||
2440 | bool all_zones_ok = true; | ||
2441 | |||
2442 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2522 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2443 | if (remaining) | 2523 | if (remaining) |
2444 | return false; | 2524 | return false; |
@@ -2457,40 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2457 | return false; | 2537 | return false; |
2458 | } | 2538 | } |
2459 | 2539 | ||
2460 | /* Check the watermark levels */ | 2540 | return pgdat_balanced(pgdat, order, classzone_idx); |
2461 | for (i = 0; i <= classzone_idx; i++) { | ||
2462 | struct zone *zone = pgdat->node_zones + i; | ||
2463 | |||
2464 | if (!populated_zone(zone)) | ||
2465 | continue; | ||
2466 | |||
2467 | /* | ||
2468 | * balance_pgdat() skips over all_unreclaimable after | ||
2469 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
2470 | * they must be considered balanced here as well if kswapd | ||
2471 | * is to sleep | ||
2472 | */ | ||
2473 | if (zone->all_unreclaimable) { | ||
2474 | balanced += zone->present_pages; | ||
2475 | continue; | ||
2476 | } | ||
2477 | |||
2478 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), | ||
2479 | i, 0)) | ||
2480 | all_zones_ok = false; | ||
2481 | else | ||
2482 | balanced += zone->present_pages; | ||
2483 | } | ||
2484 | |||
2485 | /* | ||
2486 | * For high-order requests, the balanced zones must contain at least | ||
2487 | * 25% of the nodes pages for kswapd to sleep. For order-0, all zones | ||
2488 | * must be balanced | ||
2489 | */ | ||
2490 | if (order) | ||
2491 | return pgdat_balanced(pgdat, balanced, classzone_idx); | ||
2492 | else | ||
2493 | return all_zones_ok; | ||
2494 | } | 2541 | } |
2495 | 2542 | ||
2496 | /* | 2543 | /* |
@@ -2517,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2517 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2564 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2518 | int *classzone_idx) | 2565 | int *classzone_idx) |
2519 | { | 2566 | { |
2520 | int all_zones_ok; | 2567 | struct zone *unbalanced_zone; |
2521 | unsigned long balanced; | ||
2522 | int i; | 2568 | int i; |
2523 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2569 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2524 | unsigned long total_scanned; | 2570 | unsigned long total_scanned; |
@@ -2551,8 +2597,7 @@ loop_again: | |||
2551 | unsigned long lru_pages = 0; | 2597 | unsigned long lru_pages = 0; |
2552 | int has_under_min_watermark_zone = 0; | 2598 | int has_under_min_watermark_zone = 0; |
2553 | 2599 | ||
2554 | all_zones_ok = 1; | 2600 | unbalanced_zone = NULL; |
2555 | balanced = 0; | ||
2556 | 2601 | ||
2557 | /* | 2602 | /* |
2558 | * Scan in the highmem->dma direction for the highest | 2603 | * Scan in the highmem->dma direction for the highest |
@@ -2585,8 +2630,7 @@ loop_again: | |||
2585 | break; | 2630 | break; |
2586 | } | 2631 | } |
2587 | 2632 | ||
2588 | if (!zone_watermark_ok_safe(zone, order, | 2633 | if (!zone_balanced(zone, order, 0, 0)) { |
2589 | high_wmark_pages(zone), 0, 0)) { | ||
2590 | end_zone = i; | 2634 | end_zone = i; |
2591 | break; | 2635 | break; |
2592 | } else { | 2636 | } else { |
@@ -2656,15 +2700,14 @@ loop_again: | |||
2656 | * Do not reclaim more than needed for compaction. | 2700 | * Do not reclaim more than needed for compaction. |
2657 | */ | 2701 | */ |
2658 | testorder = order; | 2702 | testorder = order; |
2659 | if (COMPACTION_BUILD && order && | 2703 | if (IS_ENABLED(CONFIG_COMPACTION) && order && |
2660 | compaction_suitable(zone, order) != | 2704 | compaction_suitable(zone, order) != |
2661 | COMPACT_SKIPPED) | 2705 | COMPACT_SKIPPED) |
2662 | testorder = 0; | 2706 | testorder = 0; |
2663 | 2707 | ||
2664 | if ((buffer_heads_over_limit && is_highmem_idx(i)) || | 2708 | if ((buffer_heads_over_limit && is_highmem_idx(i)) || |
2665 | !zone_watermark_ok_safe(zone, testorder, | 2709 | !zone_balanced(zone, testorder, |
2666 | high_wmark_pages(zone) + balance_gap, | 2710 | balance_gap, end_zone)) { |
2667 | end_zone, 0)) { | ||
2668 | shrink_zone(zone, &sc); | 2711 | shrink_zone(zone, &sc); |
2669 | 2712 | ||
2670 | reclaim_state->reclaimed_slab = 0; | 2713 | reclaim_state->reclaimed_slab = 0; |
@@ -2691,9 +2734,8 @@ loop_again: | |||
2691 | continue; | 2734 | continue; |
2692 | } | 2735 | } |
2693 | 2736 | ||
2694 | if (!zone_watermark_ok_safe(zone, testorder, | 2737 | if (!zone_balanced(zone, testorder, 0, end_zone)) { |
2695 | high_wmark_pages(zone), end_zone, 0)) { | 2738 | unbalanced_zone = zone; |
2696 | all_zones_ok = 0; | ||
2697 | /* | 2739 | /* |
2698 | * We are still under min water mark. This | 2740 | * We are still under min water mark. This |
2699 | * means that we have a GFP_ATOMIC allocation | 2741 | * means that we have a GFP_ATOMIC allocation |
@@ -2711,8 +2753,6 @@ loop_again: | |||
2711 | * speculatively avoid congestion waits | 2753 | * speculatively avoid congestion waits |
2712 | */ | 2754 | */ |
2713 | zone_clear_flag(zone, ZONE_CONGESTED); | 2755 | zone_clear_flag(zone, ZONE_CONGESTED); |
2714 | if (i <= *classzone_idx) | ||
2715 | balanced += zone->present_pages; | ||
2716 | } | 2756 | } |
2717 | 2757 | ||
2718 | } | 2758 | } |
@@ -2726,7 +2766,7 @@ loop_again: | |||
2726 | pfmemalloc_watermark_ok(pgdat)) | 2766 | pfmemalloc_watermark_ok(pgdat)) |
2727 | wake_up(&pgdat->pfmemalloc_wait); | 2767 | wake_up(&pgdat->pfmemalloc_wait); |
2728 | 2768 | ||
2729 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2769 | if (pgdat_balanced(pgdat, order, *classzone_idx)) |
2730 | break; /* kswapd: all done */ | 2770 | break; /* kswapd: all done */ |
2731 | /* | 2771 | /* |
2732 | * OK, kswapd is getting into trouble. Take a nap, then take | 2772 | * OK, kswapd is getting into trouble. Take a nap, then take |
@@ -2735,8 +2775,8 @@ loop_again: | |||
2735 | if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { | 2775 | if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { |
2736 | if (has_under_min_watermark_zone) | 2776 | if (has_under_min_watermark_zone) |
2737 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | 2777 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); |
2738 | else | 2778 | else if (unbalanced_zone) |
2739 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2779 | wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); |
2740 | } | 2780 | } |
2741 | 2781 | ||
2742 | /* | 2782 | /* |
@@ -2750,12 +2790,7 @@ loop_again: | |||
2750 | } while (--sc.priority >= 0); | 2790 | } while (--sc.priority >= 0); |
2751 | out: | 2791 | out: |
2752 | 2792 | ||
2753 | /* | 2793 | if (!pgdat_balanced(pgdat, order, *classzone_idx)) { |
2754 | * order-0: All zones must meet high watermark for a balanced node | ||
2755 | * high-order: Balanced zones must make up at least 25% of the node | ||
2756 | * for the node to be balanced | ||
2757 | */ | ||
2758 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | ||
2759 | cond_resched(); | 2794 | cond_resched(); |
2760 | 2795 | ||
2761 | try_to_freeze(); | 2796 | try_to_freeze(); |
@@ -2797,29 +2832,10 @@ out: | |||
2797 | if (!populated_zone(zone)) | 2832 | if (!populated_zone(zone)) |
2798 | continue; | 2833 | continue; |
2799 | 2834 | ||
2800 | if (zone->all_unreclaimable && | ||
2801 | sc.priority != DEF_PRIORITY) | ||
2802 | continue; | ||
2803 | |||
2804 | /* Would compaction fail due to lack of free memory? */ | ||
2805 | if (COMPACTION_BUILD && | ||
2806 | compaction_suitable(zone, order) == COMPACT_SKIPPED) | ||
2807 | goto loop_again; | ||
2808 | |||
2809 | /* Confirm the zone is balanced for order-0 */ | ||
2810 | if (!zone_watermark_ok(zone, 0, | ||
2811 | high_wmark_pages(zone), 0, 0)) { | ||
2812 | order = sc.order = 0; | ||
2813 | goto loop_again; | ||
2814 | } | ||
2815 | |||
2816 | /* Check if the memory needs to be defragmented. */ | 2835 | /* Check if the memory needs to be defragmented. */ |
2817 | if (zone_watermark_ok(zone, order, | 2836 | if (zone_watermark_ok(zone, order, |
2818 | low_wmark_pages(zone), *classzone_idx, 0)) | 2837 | low_wmark_pages(zone), *classzone_idx, 0)) |
2819 | zones_need_compaction = 0; | 2838 | zones_need_compaction = 0; |
2820 | |||
2821 | /* If balanced, clear the congested flag */ | ||
2822 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2823 | } | 2839 | } |
2824 | 2840 | ||
2825 | if (zones_need_compaction) | 2841 | if (zones_need_compaction) |
@@ -2944,7 +2960,7 @@ static int kswapd(void *p) | |||
2944 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2960 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2945 | balanced_classzone_idx = classzone_idx; | 2961 | balanced_classzone_idx = classzone_idx; |
2946 | for ( ; ; ) { | 2962 | for ( ; ; ) { |
2947 | int ret; | 2963 | bool ret; |
2948 | 2964 | ||
2949 | /* | 2965 | /* |
2950 | * If the last balance_pgdat was unsuccessful it's unlikely a | 2966 | * If the last balance_pgdat was unsuccessful it's unlikely a |
@@ -3106,13 +3122,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3106 | not required for correctness. So if the last cpu in a node goes | 3122 | not required for correctness. So if the last cpu in a node goes |
3107 | away, we get changed to run anywhere: as the first one comes back, | 3123 | away, we get changed to run anywhere: as the first one comes back, |
3108 | restore their cpu bindings. */ | 3124 | restore their cpu bindings. */ |
3109 | static int __devinit cpu_callback(struct notifier_block *nfb, | 3125 | static int cpu_callback(struct notifier_block *nfb, unsigned long action, |
3110 | unsigned long action, void *hcpu) | 3126 | void *hcpu) |
3111 | { | 3127 | { |
3112 | int nid; | 3128 | int nid; |
3113 | 3129 | ||
3114 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 3130 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
3115 | for_each_node_state(nid, N_HIGH_MEMORY) { | 3131 | for_each_node_state(nid, N_MEMORY) { |
3116 | pg_data_t *pgdat = NODE_DATA(nid); | 3132 | pg_data_t *pgdat = NODE_DATA(nid); |
3117 | const struct cpumask *mask; | 3133 | const struct cpumask *mask; |
3118 | 3134 | ||
@@ -3168,7 +3184,7 @@ static int __init kswapd_init(void) | |||
3168 | int nid; | 3184 | int nid; |
3169 | 3185 | ||
3170 | swap_setup(); | 3186 | swap_setup(); |
3171 | for_each_node_state(nid, N_HIGH_MEMORY) | 3187 | for_each_node_state(nid, N_MEMORY) |
3172 | kswapd_run(nid); | 3188 | kswapd_run(nid); |
3173 | hotcpu_notifier(cpu_callback, 0); | 3189 | hotcpu_notifier(cpu_callback, 0); |
3174 | return 0; | 3190 | return 0; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index c7370579111b..9800306c8195 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = { | |||
774 | 774 | ||
775 | "pgrotated", | 775 | "pgrotated", |
776 | 776 | ||
777 | #ifdef CONFIG_NUMA_BALANCING | ||
778 | "numa_pte_updates", | ||
779 | "numa_hint_faults", | ||
780 | "numa_hint_faults_local", | ||
781 | "numa_pages_migrated", | ||
782 | #endif | ||
783 | #ifdef CONFIG_MIGRATION | ||
784 | "pgmigrate_success", | ||
785 | "pgmigrate_fail", | ||
786 | #endif | ||
777 | #ifdef CONFIG_COMPACTION | 787 | #ifdef CONFIG_COMPACTION |
778 | "compact_blocks_moved", | 788 | "compact_migrate_scanned", |
779 | "compact_pages_moved", | 789 | "compact_free_scanned", |
780 | "compact_pagemigrate_failed", | 790 | "compact_isolated", |
781 | "compact_stall", | 791 | "compact_stall", |
782 | "compact_fail", | 792 | "compact_fail", |
783 | "compact_success", | 793 | "compact_success", |
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = { | |||
801 | "thp_collapse_alloc", | 811 | "thp_collapse_alloc", |
802 | "thp_collapse_alloc_failed", | 812 | "thp_collapse_alloc_failed", |
803 | "thp_split", | 813 | "thp_split", |
814 | "thp_zero_page_alloc", | ||
815 | "thp_zero_page_alloc_failed", | ||
804 | #endif | 816 | #endif |
805 | 817 | ||
806 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 818 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
930 | pg_data_t *pgdat = (pg_data_t *)arg; | 942 | pg_data_t *pgdat = (pg_data_t *)arg; |
931 | 943 | ||
932 | /* check memoryless node */ | 944 | /* check memoryless node */ |
933 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 945 | if (!node_state(pgdat->node_id, N_MEMORY)) |
934 | return 0; | 946 | return 0; |
935 | 947 | ||
936 | seq_printf(m, "Page block order: %d\n", pageblock_order); | 948 | seq_printf(m, "Page block order: %d\n", pageblock_order); |
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
992 | "\n high %lu" | 1004 | "\n high %lu" |
993 | "\n scanned %lu" | 1005 | "\n scanned %lu" |
994 | "\n spanned %lu" | 1006 | "\n spanned %lu" |
995 | "\n present %lu", | 1007 | "\n present %lu" |
1008 | "\n managed %lu", | ||
996 | zone_page_state(zone, NR_FREE_PAGES), | 1009 | zone_page_state(zone, NR_FREE_PAGES), |
997 | min_wmark_pages(zone), | 1010 | min_wmark_pages(zone), |
998 | low_wmark_pages(zone), | 1011 | low_wmark_pages(zone), |
999 | high_wmark_pages(zone), | 1012 | high_wmark_pages(zone), |
1000 | zone->pages_scanned, | 1013 | zone->pages_scanned, |
1001 | zone->spanned_pages, | 1014 | zone->spanned_pages, |
1002 | zone->present_pages); | 1015 | zone->present_pages, |
1016 | zone->managed_pages); | ||
1003 | 1017 | ||
1004 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 1018 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
1005 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 1019 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg) | |||
1292 | pg_data_t *pgdat = (pg_data_t *)arg; | 1306 | pg_data_t *pgdat = (pg_data_t *)arg; |
1293 | 1307 | ||
1294 | /* check memoryless node */ | 1308 | /* check memoryless node */ |
1295 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 1309 | if (!node_state(pgdat->node_id, N_MEMORY)) |
1296 | return 0; | 1310 | return 0; |
1297 | 1311 | ||
1298 | walk_zones_in_node(m, pgdat, unusable_show_print); | 1312 | walk_zones_in_node(m, pgdat, unusable_show_print); |