Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 34
-rw-r--r-- | mm/Makefile | 3
-rw-r--r-- | mm/balloon_compaction.c | 302
-rw-r--r-- | mm/bootmem.c | 79
-rw-r--r-- | mm/compaction.c | 150
-rw-r--r-- | mm/dmapool.c | 55
-rw-r--r-- | mm/highmem.c | 29
-rw-r--r-- | mm/huge_memory.c | 641
-rw-r--r-- | mm/hugetlb.c | 63
-rw-r--r-- | mm/hugetlb_cgroup.c | 42
-rw-r--r-- | mm/internal.h | 12
-rw-r--r-- | mm/kmemleak.c | 3
-rw-r--r-- | mm/ksm.c | 37
-rw-r--r-- | mm/memcontrol.c | 1469
-rw-r--r-- | mm/memory-failure.c | 35
-rw-r--r-- | mm/memory.c | 238
-rw-r--r-- | mm/memory_hotplug.c | 430
-rw-r--r-- | mm/mempolicy.c | 318
-rw-r--r-- | mm/migrate.c | 438
-rw-r--r-- | mm/mmap.c | 569
-rw-r--r-- | mm/mprotect.c | 151
-rw-r--r-- | mm/mremap.c | 4
-rw-r--r-- | mm/nobootmem.c | 22
-rw-r--r-- | mm/nommu.c | 15
-rw-r--r-- | mm/oom_kill.c | 138
-rw-r--r-- | mm/page-writeback.c | 11
-rw-r--r-- | mm/page_alloc.c | 343
-rw-r--r-- | mm/page_cgroup.c | 5
-rw-r--r-- | mm/page_isolation.c | 27
-rw-r--r-- | mm/pagewalk.c | 2
-rw-r--r-- | mm/percpu.c | 5
-rw-r--r-- | mm/pgtable-generic.c | 9
-rw-r--r-- | mm/rmap.c | 134
-rw-r--r-- | mm/shmem.c | 92
-rw-r--r-- | mm/slab.c | 383
-rw-r--r-- | mm/slab.h | 190
-rw-r--r-- | mm/slab_common.c | 292
-rw-r--r-- | mm/slob.c | 48
-rw-r--r-- | mm/slub.c | 451
-rw-r--r-- | mm/sparse.c | 25
-rw-r--r-- | mm/swapfile.c | 31
-rw-r--r-- | mm/util.c | 2
-rw-r--r-- | mm/vmalloc.c | 4
-rw-r--r-- | mm/vmscan.c | 56
-rw-r--r-- | mm/vmstat.c | 28
45 files changed, 5664 insertions(+), 1751 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,25 @@ config NO_BOOTMEM | |||
143 | config MEMORY_ISOLATION | 143 | config MEMORY_ISOLATION |
144 | boolean | 144 | boolean |
145 | 145 | ||
146 | config MOVABLE_NODE | ||
147 | boolean "Enable to assign a node which has only movable memory" | ||
148 | depends on HAVE_MEMBLOCK | ||
149 | depends on NO_BOOTMEM | ||
150 | depends on X86_64 | ||
151 | depends on NUMA | ||
152 | default n | ||
153 | help | ||
154 | Allow a node to have only movable memory. Pages used by the kernel, | ||
155 | such as direct mapping pages cannot be migrated. So the corresponding | ||
156 | memory device cannot be hotplugged. This option allows users to | ||
157 | online all the memory of a node as movable memory so that the whole | ||
158 | node can be hotplugged. Users who don't use the memory hotplug | ||
159 | feature are fine with this option on since they don't online memory | ||
160 | as movable. | ||
161 | |||
162 | Say Y here if you want to hotplug a whole node. | ||
163 | Say N here if you want kernel to use memory on all nodes evenly. | ||
164 | |||
146 | # eventually, we can have this option just 'select SPARSEMEM' | 165 | # eventually, we can have this option just 'select SPARSEMEM' |
147 | config MEMORY_HOTPLUG | 166 | config MEMORY_HOTPLUG |
148 | bool "Allow for memory hot-add" | 167 | bool "Allow for memory hot-add" |
@@ -188,6 +207,21 @@ config SPLIT_PTLOCK_CPUS | |||
188 | default "4" | 207 | default "4" |
189 | 208 | ||
190 | # | 209 | # |
210 | # support for memory balloon compaction | ||
211 | config BALLOON_COMPACTION | ||
212 | bool "Allow for balloon memory compaction/migration" | ||
213 | def_bool y | ||
214 | depends on COMPACTION && VIRTIO_BALLOON | ||
215 | help | ||
216 | Memory fragmentation introduced by ballooning might reduce | ||
217 | significantly the number of 2MB contiguous memory blocks that can be | ||
218 | used within a guest, thus imposing performance penalties associated | ||
219 | with the reduced number of transparent huge pages that could be used | ||
220 | by the guest workload. Allowing the compaction & migration for memory | ||
221 | pages enlisted as being part of memory balloon devices avoids the | ||
222 | scenario aforementioned and helps improving memory defragmentation. | ||
223 | |||
224 | # | ||
191 | # support for memory compaction | 225 | # support for memory compaction |
192 | config COMPACTION | 226 | config COMPACTION |
193 | bool "Allow for memory compaction" | 227 | bool "Allow for memory compaction" |
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o interval_tree.o $(mmu-y) | 19 | compaction.o balloon_compaction.o \ |
20 | interval_tree.o $(mmu-y) | ||
20 | 21 | ||
21 | obj-y += init-mm.o | 22 | obj-y += init-mm.o |
22 | 23 | ||
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@ | |||
1 | /* | ||
2 | * mm/balloon_compaction.c | ||
3 | * | ||
4 | * Common interface for making balloon pages movable by compaction. | ||
5 | * | ||
6 | * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com> | ||
7 | */ | ||
8 | #include <linux/mm.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/export.h> | ||
11 | #include <linux/balloon_compaction.h> | ||
12 | |||
13 | /* | ||
14 | * balloon_devinfo_alloc - allocates a balloon device information descriptor. | ||
15 | * @balloon_dev_descriptor: pointer to reference the balloon device which | ||
16 | * this struct balloon_dev_info will be servicing. | ||
17 | * | ||
18 | * Driver must call it to properly allocate and initialize an instance of | ||
19 | * struct balloon_dev_info which will be used to reference a balloon device | ||
20 | * as well as to keep track of the balloon device page list. | ||
21 | */ | ||
22 | struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) | ||
23 | { | ||
24 | struct balloon_dev_info *b_dev_info; | ||
25 | b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); | ||
26 | if (!b_dev_info) | ||
27 | return ERR_PTR(-ENOMEM); | ||
28 | |||
29 | b_dev_info->balloon_device = balloon_dev_descriptor; | ||
30 | b_dev_info->mapping = NULL; | ||
31 | b_dev_info->isolated_pages = 0; | ||
32 | spin_lock_init(&b_dev_info->pages_lock); | ||
33 | INIT_LIST_HEAD(&b_dev_info->pages); | ||
34 | |||
35 | return b_dev_info; | ||
36 | } | ||
37 | EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); | ||
38 | |||
39 | /* | ||
40 | * balloon_page_enqueue - allocates a new page and inserts it into the balloon | ||
41 | * page list. | ||
42 | * @b_dev_info: balloon device decriptor where we will insert a new page to | ||
43 | * | ||
44 | * Driver must call it to properly allocate a new enlisted balloon page | ||
45 | * before definetively removing it from the guest system. | ||
46 | * This function returns the page address for the recently enqueued page or | ||
47 | * NULL in the case we fail to allocate a new page this turn. | ||
48 | */ | ||
49 | struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) | ||
50 | { | ||
51 | unsigned long flags; | ||
52 | struct page *page = alloc_page(balloon_mapping_gfp_mask() | | ||
53 | __GFP_NOMEMALLOC | __GFP_NORETRY); | ||
54 | if (!page) | ||
55 | return NULL; | ||
56 | |||
57 | /* | ||
58 | * Block others from accessing the 'page' when we get around to | ||
59 | * establishing additional references. We should be the only one | ||
60 | * holding a reference to the 'page' at this point. | ||
61 | */ | ||
62 | BUG_ON(!trylock_page(page)); | ||
63 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
64 | balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); | ||
65 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
66 | unlock_page(page); | ||
67 | return page; | ||
68 | } | ||
69 | EXPORT_SYMBOL_GPL(balloon_page_enqueue); | ||
70 | |||
71 | /* | ||
72 | * balloon_page_dequeue - removes a page from balloon's page list and returns | ||
73 | * the its address to allow the driver release the page. | ||
74 | * @b_dev_info: balloon device decriptor where we will grab a page from. | ||
75 | * | ||
76 | * Driver must call it to properly de-allocate a previous enlisted balloon page | ||
77 | * before definetively releasing it back to the guest system. | ||
78 | * This function returns the page address for the recently dequeued page or | ||
79 | * NULL in the case we find balloon's page list temporarily empty due to | ||
80 | * compaction isolated pages. | ||
81 | */ | ||
82 | struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | ||
83 | { | ||
84 | struct page *page, *tmp; | ||
85 | unsigned long flags; | ||
86 | bool dequeued_page; | ||
87 | |||
88 | dequeued_page = false; | ||
89 | list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { | ||
90 | /* | ||
91 | * Block others from accessing the 'page' while we get around | ||
92 | * establishing additional references and preparing the 'page' | ||
93 | * to be released by the balloon driver. | ||
94 | */ | ||
95 | if (trylock_page(page)) { | ||
96 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
97 | /* | ||
98 | * Raise the page refcount here to prevent any wrong | ||
99 | * attempt to isolate this page, in case of coliding | ||
100 | * with balloon_page_isolate() just after we release | ||
101 | * the page lock. | ||
102 | * | ||
103 | * balloon_page_free() will take care of dropping | ||
104 | * this extra refcount later. | ||
105 | */ | ||
106 | get_page(page); | ||
107 | balloon_page_delete(page); | ||
108 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
109 | unlock_page(page); | ||
110 | dequeued_page = true; | ||
111 | break; | ||
112 | } | ||
113 | } | ||
114 | |||
115 | if (!dequeued_page) { | ||
116 | /* | ||
117 | * If we are unable to dequeue a balloon page because the page | ||
118 | * list is empty and there is no isolated pages, then something | ||
119 | * went out of track and some balloon pages are lost. | ||
120 | * BUG() here, otherwise the balloon driver may get stuck into | ||
121 | * an infinite loop while attempting to release all its pages. | ||
122 | */ | ||
123 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
124 | if (unlikely(list_empty(&b_dev_info->pages) && | ||
125 | !b_dev_info->isolated_pages)) | ||
126 | BUG(); | ||
127 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
128 | page = NULL; | ||
129 | } | ||
130 | return page; | ||
131 | } | ||
132 | EXPORT_SYMBOL_GPL(balloon_page_dequeue); | ||
133 | |||
134 | #ifdef CONFIG_BALLOON_COMPACTION | ||
135 | /* | ||
136 | * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. | ||
137 | * @b_dev_info: holds the balloon device information descriptor. | ||
138 | * @a_ops: balloon_mapping address_space_operations descriptor. | ||
139 | * | ||
140 | * Driver must call it to properly allocate and initialize an instance of | ||
141 | * struct address_space which will be used as the special page->mapping for | ||
142 | * balloon device enlisted page instances. | ||
143 | */ | ||
144 | struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, | ||
145 | const struct address_space_operations *a_ops) | ||
146 | { | ||
147 | struct address_space *mapping; | ||
148 | |||
149 | mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); | ||
150 | if (!mapping) | ||
151 | return ERR_PTR(-ENOMEM); | ||
152 | |||
153 | /* | ||
154 | * Give a clean 'zeroed' status to all elements of this special | ||
155 | * balloon page->mapping struct address_space instance. | ||
156 | */ | ||
157 | address_space_init_once(mapping); | ||
158 | |||
159 | /* | ||
160 | * Set mapping->flags appropriately, to allow balloon pages | ||
161 | * ->mapping identification. | ||
162 | */ | ||
163 | mapping_set_balloon(mapping); | ||
164 | mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); | ||
165 | |||
166 | /* balloon's page->mapping->a_ops callback descriptor */ | ||
167 | mapping->a_ops = a_ops; | ||
168 | |||
169 | /* | ||
170 | * Establish a pointer reference back to the balloon device descriptor | ||
171 | * this particular page->mapping will be servicing. | ||
172 | * This is used by compaction / migration procedures to identify and | ||
173 | * access the balloon device pageset while isolating / migrating pages. | ||
174 | * | ||
175 | * As some balloon drivers can register multiple balloon devices | ||
176 | * for a single guest, this also helps compaction / migration to | ||
177 | * properly deal with multiple balloon pagesets, when required. | ||
178 | */ | ||
179 | mapping->private_data = b_dev_info; | ||
180 | b_dev_info->mapping = mapping; | ||
181 | |||
182 | return mapping; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(balloon_mapping_alloc); | ||
185 | |||
186 | static inline void __isolate_balloon_page(struct page *page) | ||
187 | { | ||
188 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | ||
189 | unsigned long flags; | ||
190 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
191 | list_del(&page->lru); | ||
192 | b_dev_info->isolated_pages++; | ||
193 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
194 | } | ||
195 | |||
196 | static inline void __putback_balloon_page(struct page *page) | ||
197 | { | ||
198 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | ||
199 | unsigned long flags; | ||
200 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | ||
201 | list_add(&page->lru, &b_dev_info->pages); | ||
202 | b_dev_info->isolated_pages--; | ||
203 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | ||
204 | } | ||
205 | |||
206 | static inline int __migrate_balloon_page(struct address_space *mapping, | ||
207 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
208 | { | ||
209 | return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); | ||
210 | } | ||
211 | |||
212 | /* __isolate_lru_page() counterpart for a ballooned page */ | ||
213 | bool balloon_page_isolate(struct page *page) | ||
214 | { | ||
215 | /* | ||
216 | * Avoid burning cycles with pages that are yet under __free_pages(), | ||
217 | * or just got freed under us. | ||
218 | * | ||
219 | * In case we 'win' a race for a balloon page being freed under us and | ||
220 | * raise its refcount preventing __free_pages() from doing its job | ||
221 | * the put_page() at the end of this block will take care of | ||
222 | * release this page, thus avoiding a nasty leakage. | ||
223 | */ | ||
224 | if (likely(get_page_unless_zero(page))) { | ||
225 | /* | ||
226 | * As balloon pages are not isolated from LRU lists, concurrent | ||
227 | * compaction threads can race against page migration functions | ||
228 | * as well as race against the balloon driver releasing a page. | ||
229 | * | ||
230 | * In order to avoid having an already isolated balloon page | ||
231 | * being (wrongly) re-isolated while it is under migration, | ||
232 | * or to avoid attempting to isolate pages being released by | ||
233 | * the balloon driver, lets be sure we have the page lock | ||
234 | * before proceeding with the balloon page isolation steps. | ||
235 | */ | ||
236 | if (likely(trylock_page(page))) { | ||
237 | /* | ||
238 | * A ballooned page, by default, has just one refcount. | ||
239 | * Prevent concurrent compaction threads from isolating | ||
240 | * an already isolated balloon page by refcount check. | ||
241 | */ | ||
242 | if (__is_movable_balloon_page(page) && | ||
243 | page_count(page) == 2) { | ||
244 | __isolate_balloon_page(page); | ||
245 | unlock_page(page); | ||
246 | return true; | ||
247 | } | ||
248 | unlock_page(page); | ||
249 | } | ||
250 | put_page(page); | ||
251 | } | ||
252 | return false; | ||
253 | } | ||
254 | |||
255 | /* putback_lru_page() counterpart for a ballooned page */ | ||
256 | void balloon_page_putback(struct page *page) | ||
257 | { | ||
258 | /* | ||
259 | * 'lock_page()' stabilizes the page and prevents races against | ||
260 | * concurrent isolation threads attempting to re-isolate it. | ||
261 | */ | ||
262 | lock_page(page); | ||
263 | |||
264 | if (__is_movable_balloon_page(page)) { | ||
265 | __putback_balloon_page(page); | ||
266 | /* drop the extra ref count taken for page isolation */ | ||
267 | put_page(page); | ||
268 | } else { | ||
269 | WARN_ON(1); | ||
270 | dump_page(page); | ||
271 | } | ||
272 | unlock_page(page); | ||
273 | } | ||
274 | |||
275 | /* move_to_new_page() counterpart for a ballooned page */ | ||
276 | int balloon_page_migrate(struct page *newpage, | ||
277 | struct page *page, enum migrate_mode mode) | ||
278 | { | ||
279 | struct address_space *mapping; | ||
280 | int rc = -EAGAIN; | ||
281 | |||
282 | /* | ||
283 | * Block others from accessing the 'newpage' when we get around to | ||
284 | * establishing additional references. We should be the only one | ||
285 | * holding a reference to the 'newpage' at this point. | ||
286 | */ | ||
287 | BUG_ON(!trylock_page(newpage)); | ||
288 | |||
289 | if (WARN_ON(!__is_movable_balloon_page(page))) { | ||
290 | dump_page(page); | ||
291 | unlock_page(newpage); | ||
292 | return rc; | ||
293 | } | ||
294 | |||
295 | mapping = page->mapping; | ||
296 | if (mapping) | ||
297 | rc = __migrate_balloon_page(mapping, newpage, page, mode); | ||
298 | |||
299 | unlock_page(newpage); | ||
300 | return rc; | ||
301 | } | ||
302 | #endif /* CONFIG_BALLOON_COMPACTION */ | ||
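For reference, a minimal sketch of how a balloon driver might consume the interface added by this new file. This is not part of the patch: the "my_"-prefixed identifiers are hypothetical, the migratepage stub elides the driver-specific host handshake and its concrete return code, and the in-tree consumer targeted by this series is the virtio_balloon driver (outside mm/).

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/balloon_compaction.h>

static int my_balloon_migratepage(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	/*
	 * Driver-specific work goes here: tell the host about 'newpage',
	 * then, under b_dev_info->pages_lock, balloon_page_insert() the
	 * new page and balloon_page_delete() the old one. The value
	 * returned must follow the migrate status convention expected by
	 * balloon_page_migrate() above; 0 is only a placeholder.
	 */
	return 0;
}

static const struct address_space_operations my_balloon_aops = {
	.migratepage = my_balloon_migratepage,
};

static int my_balloon_probe(void *balloon_dev)
{
	struct balloon_dev_info *b_dev_info;
	struct address_space *mapping;

	b_dev_info = balloon_devinfo_alloc(balloon_dev);
	if (IS_ERR(b_dev_info))
		return PTR_ERR(b_dev_info);

	mapping = balloon_mapping_alloc(b_dev_info, &my_balloon_aops);
	if (IS_ERR(mapping)) {
		kfree(b_dev_info);
		return PTR_ERR(mapping);
	}
	return 0;
}

/* Inflate: take one page away from the guest. */
static void my_balloon_inflate_one(struct balloon_dev_info *b_dev_info)
{
	struct page *page = balloon_page_enqueue(b_dev_info);

	if (page) {
		/* driver-specific: report page_to_pfn(page) to the host */
	}
}

/* Deflate: hand one page back to the guest. */
static void my_balloon_deflate_one(struct balloon_dev_info *b_dev_info)
{
	struct page *page = balloon_page_dequeue(b_dev_info);

	if (page) {
		/* driver-specific: notify the host, then release the page */
		balloon_page_free(page);
	}
}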
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..1324cd74faec 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
147 | 147 | ||
148 | /* | 148 | /* |
149 | * free_bootmem_late - free bootmem pages directly to page allocator | 149 | * free_bootmem_late - free bootmem pages directly to page allocator |
150 | * @addr: starting address of the range | 150 | * @addr: starting physical address of the range |
151 | * @size: size of the range in bytes | 151 | * @size: size of the range in bytes |
152 | * | 152 | * |
153 | * This is only useful when the bootmem allocator has already been torn | 153 | * This is only useful when the bootmem allocator has already been torn |
154 | * down, but we are still initializing the system. Pages are given directly | 154 | * down, but we are still initializing the system. Pages are given directly |
155 | * to the page allocator, no bootmem metadata is updated because it is gone. | 155 | * to the page allocator, no bootmem metadata is updated because it is gone. |
156 | */ | 156 | */ |
157 | void __init free_bootmem_late(unsigned long addr, unsigned long size) | 157 | void __init free_bootmem_late(unsigned long physaddr, unsigned long size) |
158 | { | 158 | { |
159 | unsigned long cursor, end; | 159 | unsigned long cursor, end; |
160 | 160 | ||
161 | kmemleak_free_part(__va(addr), size); | 161 | kmemleak_free_part(__va(physaddr), size); |
162 | 162 | ||
163 | cursor = PFN_UP(addr); | 163 | cursor = PFN_UP(physaddr); |
164 | end = PFN_DOWN(addr + size); | 164 | end = PFN_DOWN(physaddr + size); |
165 | 165 | ||
166 | for (; cursor < end; cursor++) { | 166 | for (; cursor < end; cursor++) { |
167 | __free_pages_bootmem(pfn_to_page(cursor), 0); | 167 | __free_pages_bootmem(pfn_to_page(cursor), 0); |
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
229 | return count; | 229 | return count; |
230 | } | 230 | } |
231 | 231 | ||
232 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | ||
233 | { | ||
234 | struct zone *z; | ||
235 | |||
236 | /* | ||
237 | * In free_area_init_core(), highmem zone's managed_pages is set to | ||
238 | * present_pages, and bootmem allocator doesn't allocate from highmem | ||
239 | * zones. So there's no need to recalculate managed_pages because all | ||
240 | * highmem pages will be managed by the buddy system. Here highmem | ||
241 | * zone also includes highmem movable zone. | ||
242 | */ | ||
243 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | ||
244 | if (!is_highmem(z)) | ||
245 | z->managed_pages = 0; | ||
246 | } | ||
247 | |||
232 | /** | 248 | /** |
233 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 249 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
234 | * @pgdat: node to be released | 250 | * @pgdat: node to be released |
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
238 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 254 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
239 | { | 255 | { |
240 | register_page_bootmem_info_node(pgdat); | 256 | register_page_bootmem_info_node(pgdat); |
257 | reset_node_lowmem_managed_pages(pgdat); | ||
241 | return free_all_bootmem_core(pgdat->bdata); | 258 | return free_all_bootmem_core(pgdat->bdata); |
242 | } | 259 | } |
243 | 260 | ||
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void) | |||
250 | { | 267 | { |
251 | unsigned long total_pages = 0; | 268 | unsigned long total_pages = 0; |
252 | bootmem_data_t *bdata; | 269 | bootmem_data_t *bdata; |
270 | struct pglist_data *pgdat; | ||
271 | |||
272 | for_each_online_pgdat(pgdat) | ||
273 | reset_node_lowmem_managed_pages(pgdat); | ||
253 | 274 | ||
254 | list_for_each_entry(bdata, &bdata_list, list) | 275 | list_for_each_entry(bdata, &bdata_list, list) |
255 | total_pages += free_all_bootmem_core(bdata); | 276 | total_pages += free_all_bootmem_core(bdata); |
@@ -377,21 +398,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
377 | 398 | ||
378 | /** | 399 | /** |
379 | * free_bootmem - mark a page range as usable | 400 | * free_bootmem - mark a page range as usable |
380 | * @addr: starting address of the range | 401 | * @addr: starting physical address of the range |
381 | * @size: size of the range in bytes | 402 | * @size: size of the range in bytes |
382 | * | 403 | * |
383 | * Partial pages will be considered reserved and left as they are. | 404 | * Partial pages will be considered reserved and left as they are. |
384 | * | 405 | * |
385 | * The range must be contiguous but may span node boundaries. | 406 | * The range must be contiguous but may span node boundaries. |
386 | */ | 407 | */ |
387 | void __init free_bootmem(unsigned long addr, unsigned long size) | 408 | void __init free_bootmem(unsigned long physaddr, unsigned long size) |
388 | { | 409 | { |
389 | unsigned long start, end; | 410 | unsigned long start, end; |
390 | 411 | ||
391 | kmemleak_free_part(__va(addr), size); | 412 | kmemleak_free_part(__va(physaddr), size); |
392 | 413 | ||
393 | start = PFN_UP(addr); | 414 | start = PFN_UP(physaddr); |
394 | end = PFN_DOWN(addr + size); | 415 | end = PFN_DOWN(physaddr + size); |
395 | 416 | ||
396 | mark_bootmem(start, end, 0, 0); | 417 | mark_bootmem(start, end, 0, 0); |
397 | } | 418 | } |
@@ -439,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, | |||
439 | return mark_bootmem(start, end, 1, flags); | 460 | return mark_bootmem(start, end, 1, flags); |
440 | } | 461 | } |
441 | 462 | ||
442 | int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
443 | int flags) | ||
444 | { | ||
445 | return reserve_bootmem(phys, len, flags); | ||
446 | } | ||
447 | |||
448 | static unsigned long __init align_idx(struct bootmem_data *bdata, | 463 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
449 | unsigned long idx, unsigned long step) | 464 | unsigned long idx, unsigned long step) |
450 | { | 465 | { |
@@ -575,27 +590,6 @@ find_block: | |||
575 | return NULL; | 590 | return NULL; |
576 | } | 591 | } |
577 | 592 | ||
578 | static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | ||
579 | unsigned long size, unsigned long align, | ||
580 | unsigned long goal, unsigned long limit) | ||
581 | { | ||
582 | if (WARN_ON_ONCE(slab_is_available())) | ||
583 | return kzalloc(size, GFP_NOWAIT); | ||
584 | |||
585 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM | ||
586 | { | ||
587 | bootmem_data_t *p_bdata; | ||
588 | |||
589 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | ||
590 | goal, limit); | ||
591 | if (p_bdata) | ||
592 | return alloc_bootmem_bdata(p_bdata, size, align, | ||
593 | goal, limit); | ||
594 | } | ||
595 | #endif | ||
596 | return NULL; | ||
597 | } | ||
598 | |||
599 | static void * __init alloc_bootmem_core(unsigned long size, | 593 | static void * __init alloc_bootmem_core(unsigned long size, |
600 | unsigned long align, | 594 | unsigned long align, |
601 | unsigned long goal, | 595 | unsigned long goal, |
@@ -604,9 +598,8 @@ static void * __init alloc_bootmem_core(unsigned long size, | |||
604 | bootmem_data_t *bdata; | 598 | bootmem_data_t *bdata; |
605 | void *region; | 599 | void *region; |
606 | 600 | ||
607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); | 601 | if (WARN_ON_ONCE(slab_is_available())) |
608 | if (region) | 602 | return kzalloc(size, GFP_NOWAIT); |
609 | return region; | ||
610 | 603 | ||
611 | list_for_each_entry(bdata, &bdata_list, list) { | 604 | list_for_each_entry(bdata, &bdata_list, list) { |
612 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) | 605 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) |
@@ -704,11 +697,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | |||
704 | { | 697 | { |
705 | void *ptr; | 698 | void *ptr; |
706 | 699 | ||
700 | if (WARN_ON_ONCE(slab_is_available())) | ||
701 | return kzalloc(size, GFP_NOWAIT); | ||
707 | again: | 702 | again: |
708 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, | ||
709 | align, goal, limit); | ||
710 | if (ptr) | ||
711 | return ptr; | ||
712 | 703 | ||
713 | /* do not panic in alloc_bootmem_bdata() */ | 704 | /* do not panic in alloc_bootmem_bdata() */ |
714 | if (limit && goal + size > limit) | 705 | if (limit && goal + size > limit) |
diff --git a/mm/compaction.c b/mm/compaction.c
index 694eaabaaebd..5ad7f4f4d6f7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/backing-dev.h> | 14 | #include <linux/backing-dev.h> |
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include <linux/balloon_compaction.h> | ||
17 | #include "internal.h" | 18 | #include "internal.h" |
18 | 19 | ||
19 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 20 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
@@ -214,60 +215,6 @@ static bool suitable_migration_target(struct page *page) | |||
214 | return false; | 215 | return false; |
215 | } | 216 | } |
216 | 217 | ||
217 | static void compact_capture_page(struct compact_control *cc) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | int mtype, mtype_low, mtype_high; | ||
221 | |||
222 | if (!cc->page || *cc->page) | ||
223 | return; | ||
224 | |||
225 | /* | ||
226 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
227 | * regardless of the migratetype of the freelist is is captured from. | ||
228 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
229 | * allocation is typically at least a pageblock size and overall | ||
230 | * fragmentation is not impaired. Other allocation types must | ||
231 | * capture pages from their own migratelist because otherwise they | ||
232 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
233 | * difficult to move pages and making fragmentation worse overall. | ||
234 | */ | ||
235 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
236 | mtype_low = 0; | ||
237 | mtype_high = MIGRATE_PCPTYPES; | ||
238 | } else { | ||
239 | mtype_low = cc->migratetype; | ||
240 | mtype_high = cc->migratetype + 1; | ||
241 | } | ||
242 | |||
243 | /* Speculatively examine the free lists without zone lock */ | ||
244 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
245 | int order; | ||
246 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
247 | struct page *page; | ||
248 | struct free_area *area; | ||
249 | area = &(cc->zone->free_area[order]); | ||
250 | if (list_empty(&area->free_list[mtype])) | ||
251 | continue; | ||
252 | |||
253 | /* Take the lock and attempt capture of the page */ | ||
254 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
255 | return; | ||
256 | if (!list_empty(&area->free_list[mtype])) { | ||
257 | page = list_entry(area->free_list[mtype].next, | ||
258 | struct page, lru); | ||
259 | if (capture_free_page(page, cc->order, mtype)) { | ||
260 | spin_unlock_irqrestore(&cc->zone->lock, | ||
261 | flags); | ||
262 | *cc->page = page; | ||
263 | return; | ||
264 | } | ||
265 | } | ||
266 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
267 | } | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* | 218 | /* |
272 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 219 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
273 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 220 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
@@ -356,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
356 | if (blockpfn == end_pfn) | 303 | if (blockpfn == end_pfn) |
357 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 304 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
358 | 305 | ||
306 | count_vm_events(COMPACTFREE_SCANNED, nr_scanned); | ||
307 | if (total_isolated) | ||
308 | count_vm_events(COMPACTISOLATED, total_isolated); | ||
309 | |||
359 | return total_isolated; | 310 | return total_isolated; |
360 | } | 311 | } |
361 | 312 | ||
@@ -565,9 +516,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
565 | goto next_pageblock; | 516 | goto next_pageblock; |
566 | } | 517 | } |
567 | 518 | ||
568 | /* Check may be lockless but that's ok as we recheck later */ | 519 | /* |
569 | if (!PageLRU(page)) | 520 | * Check may be lockless but that's ok as we recheck later. |
521 | * It's possible to migrate LRU pages and balloon pages | ||
522 | * Skip any other type of page | ||
523 | */ | ||
524 | if (!PageLRU(page)) { | ||
525 | if (unlikely(balloon_page_movable(page))) { | ||
526 | if (locked && balloon_page_isolate(page)) { | ||
527 | /* Successfully isolated */ | ||
528 | cc->finished_update_migrate = true; | ||
529 | list_add(&page->lru, migratelist); | ||
530 | cc->nr_migratepages++; | ||
531 | nr_isolated++; | ||
532 | goto check_compact_cluster; | ||
533 | } | ||
534 | } | ||
570 | continue; | 535 | continue; |
536 | } | ||
571 | 537 | ||
572 | /* | 538 | /* |
573 | * PageLRU is set. lru_lock normally excludes isolation | 539 | * PageLRU is set. lru_lock normally excludes isolation |
@@ -621,6 +587,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
621 | cc->nr_migratepages++; | 587 | cc->nr_migratepages++; |
622 | nr_isolated++; | 588 | nr_isolated++; |
623 | 589 | ||
590 | check_compact_cluster: | ||
624 | /* Avoid isolating too much */ | 591 | /* Avoid isolating too much */ |
625 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | 592 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
626 | ++low_pfn; | 593 | ++low_pfn; |
@@ -646,6 +613,10 @@ next_pageblock: | |||
646 | 613 | ||
647 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 614 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
648 | 615 | ||
616 | count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); | ||
617 | if (nr_isolated) | ||
618 | count_vm_events(COMPACTISOLATED, nr_isolated); | ||
619 | |||
649 | return low_pfn; | 620 | return low_pfn; |
650 | } | 621 | } |
651 | 622 | ||
@@ -936,6 +907,60 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
936 | return COMPACT_CONTINUE; | 907 | return COMPACT_CONTINUE; |
937 | } | 908 | } |
938 | 909 | ||
910 | static void compact_capture_page(struct compact_control *cc) | ||
911 | { | ||
912 | unsigned long flags; | ||
913 | int mtype, mtype_low, mtype_high; | ||
914 | |||
915 | if (!cc->page || *cc->page) | ||
916 | return; | ||
917 | |||
918 | /* | ||
919 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
920 | * regardless of the migratetype of the freelist is is captured from. | ||
921 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
922 | * allocation is typically at least a pageblock size and overall | ||
923 | * fragmentation is not impaired. Other allocation types must | ||
924 | * capture pages from their own migratelist because otherwise they | ||
925 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
926 | * difficult to move pages and making fragmentation worse overall. | ||
927 | */ | ||
928 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
929 | mtype_low = 0; | ||
930 | mtype_high = MIGRATE_PCPTYPES; | ||
931 | } else { | ||
932 | mtype_low = cc->migratetype; | ||
933 | mtype_high = cc->migratetype + 1; | ||
934 | } | ||
935 | |||
936 | /* Speculatively examine the free lists without zone lock */ | ||
937 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
938 | int order; | ||
939 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
940 | struct page *page; | ||
941 | struct free_area *area; | ||
942 | area = &(cc->zone->free_area[order]); | ||
943 | if (list_empty(&area->free_list[mtype])) | ||
944 | continue; | ||
945 | |||
946 | /* Take the lock and attempt capture of the page */ | ||
947 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
948 | return; | ||
949 | if (!list_empty(&area->free_list[mtype])) { | ||
950 | page = list_entry(area->free_list[mtype].next, | ||
951 | struct page, lru); | ||
952 | if (capture_free_page(page, cc->order, mtype)) { | ||
953 | spin_unlock_irqrestore(&cc->zone->lock, | ||
954 | flags); | ||
955 | *cc->page = page; | ||
956 | return; | ||
957 | } | ||
958 | } | ||
959 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
960 | } | ||
961 | } | ||
962 | } | ||
963 | |||
939 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 964 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
940 | { | 965 | { |
941 | int ret; | 966 | int ret; |
@@ -986,7 +1011,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
986 | switch (isolate_migratepages(zone, cc)) { | 1011 | switch (isolate_migratepages(zone, cc)) { |
987 | case ISOLATE_ABORT: | 1012 | case ISOLATE_ABORT: |
988 | ret = COMPACT_PARTIAL; | 1013 | ret = COMPACT_PARTIAL; |
989 | putback_lru_pages(&cc->migratepages); | 1014 | putback_movable_pages(&cc->migratepages); |
990 | cc->nr_migratepages = 0; | 1015 | cc->nr_migratepages = 0; |
991 | goto out; | 1016 | goto out; |
992 | case ISOLATE_NONE: | 1017 | case ISOLATE_NONE: |
@@ -998,20 +1023,17 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
998 | nr_migrate = cc->nr_migratepages; | 1023 | nr_migrate = cc->nr_migratepages; |
999 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1024 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1000 | (unsigned long)cc, false, | 1025 | (unsigned long)cc, false, |
1001 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 1026 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
1027 | MR_COMPACTION); | ||
1002 | update_nr_listpages(cc); | 1028 | update_nr_listpages(cc); |
1003 | nr_remaining = cc->nr_migratepages; | 1029 | nr_remaining = cc->nr_migratepages; |
1004 | 1030 | ||
1005 | count_vm_event(COMPACTBLOCKS); | ||
1006 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | ||
1007 | if (nr_remaining) | ||
1008 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | ||
1009 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1031 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
1010 | nr_remaining); | 1032 | nr_remaining); |
1011 | 1033 | ||
1012 | /* Release LRU pages not migrated */ | 1034 | /* Release isolated pages not migrated */ |
1013 | if (err) { | 1035 | if (err) { |
1014 | putback_lru_pages(&cc->migratepages); | 1036 | putback_movable_pages(&cc->migratepages); |
1015 | cc->nr_migratepages = 0; | 1037 | cc->nr_migratepages = 0; |
1016 | if (err == -ENOMEM) { | 1038 | if (err == -ENOMEM) { |
1017 | ret = COMPACT_PARTIAL; | 1039 | ret = COMPACT_PARTIAL; |
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */ | |||
50 | size_t allocation; | 50 | size_t allocation; |
51 | size_t boundary; | 51 | size_t boundary; |
52 | char name[32]; | 52 | char name[32]; |
53 | wait_queue_head_t waitq; | ||
54 | struct list_head pools; | 53 | struct list_head pools; |
55 | }; | 54 | }; |
56 | 55 | ||
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ | |||
62 | unsigned int offset; | 61 | unsigned int offset; |
63 | }; | 62 | }; |
64 | 63 | ||
65 | #define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) | ||
66 | |||
67 | static DEFINE_MUTEX(pools_lock); | 64 | static DEFINE_MUTEX(pools_lock); |
68 | 65 | ||
69 | static ssize_t | 66 | static ssize_t |
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
172 | retval->size = size; | 169 | retval->size = size; |
173 | retval->boundary = boundary; | 170 | retval->boundary = boundary; |
174 | retval->allocation = allocation; | 171 | retval->allocation = allocation; |
175 | init_waitqueue_head(&retval->waitq); | ||
176 | 172 | ||
177 | if (dev) { | 173 | if (dev) { |
178 | int ret; | 174 | int ret; |
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) | |||
227 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); | 223 | memset(page->vaddr, POOL_POISON_FREED, pool->allocation); |
228 | #endif | 224 | #endif |
229 | pool_initialise_page(pool, page); | 225 | pool_initialise_page(pool, page); |
230 | list_add(&page->page_list, &pool->page_list); | ||
231 | page->in_use = 0; | 226 | page->in_use = 0; |
232 | page->offset = 0; | 227 | page->offset = 0; |
233 | } else { | 228 | } else { |
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
315 | might_sleep_if(mem_flags & __GFP_WAIT); | 310 | might_sleep_if(mem_flags & __GFP_WAIT); |
316 | 311 | ||
317 | spin_lock_irqsave(&pool->lock, flags); | 312 | spin_lock_irqsave(&pool->lock, flags); |
318 | restart: | ||
319 | list_for_each_entry(page, &pool->page_list, page_list) { | 313 | list_for_each_entry(page, &pool->page_list, page_list) { |
320 | if (page->offset < pool->allocation) | 314 | if (page->offset < pool->allocation) |
321 | goto ready; | 315 | goto ready; |
322 | } | 316 | } |
323 | page = pool_alloc_page(pool, GFP_ATOMIC); | ||
324 | if (!page) { | ||
325 | if (mem_flags & __GFP_WAIT) { | ||
326 | DECLARE_WAITQUEUE(wait, current); | ||
327 | 317 | ||
328 | __set_current_state(TASK_UNINTERRUPTIBLE); | 318 | /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ |
329 | __add_wait_queue(&pool->waitq, &wait); | 319 | spin_unlock_irqrestore(&pool->lock, flags); |
330 | spin_unlock_irqrestore(&pool->lock, flags); | ||
331 | 320 | ||
332 | schedule_timeout(POOL_TIMEOUT_JIFFIES); | 321 | page = pool_alloc_page(pool, mem_flags); |
322 | if (!page) | ||
323 | return NULL; | ||
333 | 324 | ||
334 | spin_lock_irqsave(&pool->lock, flags); | 325 | spin_lock_irqsave(&pool->lock, flags); |
335 | __remove_wait_queue(&pool->waitq, &wait); | ||
336 | goto restart; | ||
337 | } | ||
338 | retval = NULL; | ||
339 | goto done; | ||
340 | } | ||
341 | 326 | ||
327 | list_add(&page->page_list, &pool->page_list); | ||
342 | ready: | 328 | ready: |
343 | page->in_use++; | 329 | page->in_use++; |
344 | offset = page->offset; | 330 | offset = page->offset; |
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
346 | retval = offset + page->vaddr; | 332 | retval = offset + page->vaddr; |
347 | *handle = offset + page->dma; | 333 | *handle = offset + page->dma; |
348 | #ifdef DMAPOOL_DEBUG | 334 | #ifdef DMAPOOL_DEBUG |
335 | { | ||
336 | int i; | ||
337 | u8 *data = retval; | ||
338 | /* page->offset is stored in first 4 bytes */ | ||
339 | for (i = sizeof(page->offset); i < pool->size; i++) { | ||
340 | if (data[i] == POOL_POISON_FREED) | ||
341 | continue; | ||
342 | if (pool->dev) | ||
343 | dev_err(pool->dev, | ||
344 | "dma_pool_alloc %s, %p (corruped)\n", | ||
345 | pool->name, retval); | ||
346 | else | ||
347 | pr_err("dma_pool_alloc %s, %p (corruped)\n", | ||
348 | pool->name, retval); | ||
349 | |||
350 | /* | ||
351 | * Dump the first 4 bytes even if they are not | ||
352 | * POOL_POISON_FREED | ||
353 | */ | ||
354 | print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, | ||
355 | data, pool->size, 1); | ||
356 | break; | ||
357 | } | ||
358 | } | ||
349 | memset(retval, POOL_POISON_ALLOCATED, pool->size); | 359 | memset(retval, POOL_POISON_ALLOCATED, pool->size); |
350 | #endif | 360 | #endif |
351 | done: | ||
352 | spin_unlock_irqrestore(&pool->lock, flags); | 361 | spin_unlock_irqrestore(&pool->lock, flags); |
353 | return retval; | 362 | return retval; |
354 | } | 363 | } |
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) | |||
435 | page->in_use--; | 444 | page->in_use--; |
436 | *(int *)vaddr = page->offset; | 445 | *(int *)vaddr = page->offset; |
437 | page->offset = offset; | 446 | page->offset = offset; |
438 | if (waitqueue_active(&pool->waitq)) | ||
439 | wake_up_locked(&pool->waitq); | ||
440 | /* | 447 | /* |
441 | * Resist a temptation to do | 448 | * Resist a temptation to do |
442 | * if (!is_page_busy(page)) pool_free_page(pool, page); | 449 | * if (!is_page_busy(page)) pool_free_page(pool, page); |
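With the waitqueue removed, dma_pool_alloc() behaves differently when no free block is available: a __GFP_WAIT caller now drops pool->lock and allocates a fresh page with its own gfp flags (sleeping if necessary) instead of waiting on the pool's waitqueue with a 100 ms timeout, while non-blocking callers simply get NULL on failure. A usage sketch under that model follows; the device pointer my_dev, the pool name, and the block sizes are illustrative only, not taken from the patch.

#include <linux/device.h>
#include <linux/dmapool.h>

static int my_setup_descriptors(struct device *my_dev)
{
	struct dma_pool *pool;
	dma_addr_t dma;
	void *vaddr;

	/* 64-byte blocks, 8-byte aligned, no boundary-crossing constraint */
	pool = dma_pool_create("my-descriptors", my_dev, 64, 8, 0);
	if (!pool)
		return -ENOMEM;

	/* GFP_KERNEL: may sleep in the page allocator if the pool is empty */
	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (!vaddr) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... program 'dma' into the device, access the block via 'vaddr' ... */

	dma_pool_free(pool, vaddr, dma);
	dma_pool_destroy(pool);
	return 0;
}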
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..d999077431df 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr) | |||
99 | unsigned long addr = (unsigned long)vaddr; | 99 | unsigned long addr = (unsigned long)vaddr; |
100 | 100 | ||
101 | if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { | 101 | if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { |
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | 102 | int i = PKMAP_NR(addr); |
103 | return pte_page(pkmap_page_table[i]); | 103 | return pte_page(pkmap_page_table[i]); |
104 | } | 104 | } |
105 | 105 | ||
@@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void) | |||
137 | * So no dangers, even with speculative execution. | 137 | * So no dangers, even with speculative execution. |
138 | */ | 138 | */ |
139 | page = pte_page(pkmap_page_table[i]); | 139 | page = pte_page(pkmap_page_table[i]); |
140 | pte_clear(&init_mm, (unsigned long)page_address(page), | 140 | pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); |
141 | &pkmap_page_table[i]); | ||
142 | 141 | ||
143 | set_page_address(page, NULL); | 142 | set_page_address(page, NULL); |
144 | need_flush = 1; | 143 | need_flush = 1; |
@@ -324,11 +323,7 @@ struct page_address_map { | |||
324 | struct list_head list; | 323 | struct list_head list; |
325 | }; | 324 | }; |
326 | 325 | ||
327 | /* | 326 | static struct page_address_map page_address_maps[LAST_PKMAP]; |
328 | * page_address_map freelist, allocated from page_address_maps. | ||
329 | */ | ||
330 | static struct list_head page_address_pool; /* freelist */ | ||
331 | static spinlock_t pool_lock; /* protects page_address_pool */ | ||
332 | 327 | ||
333 | /* | 328 | /* |
334 | * Hash table bucket | 329 | * Hash table bucket |
@@ -393,14 +388,7 @@ void set_page_address(struct page *page, void *virtual) | |||
393 | 388 | ||
394 | pas = page_slot(page); | 389 | pas = page_slot(page); |
395 | if (virtual) { /* Add */ | 390 | if (virtual) { /* Add */ |
396 | BUG_ON(list_empty(&page_address_pool)); | 391 | pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; |
397 | |||
398 | spin_lock_irqsave(&pool_lock, flags); | ||
399 | pam = list_entry(page_address_pool.next, | ||
400 | struct page_address_map, list); | ||
401 | list_del(&pam->list); | ||
402 | spin_unlock_irqrestore(&pool_lock, flags); | ||
403 | |||
404 | pam->page = page; | 392 | pam->page = page; |
405 | pam->virtual = virtual; | 393 | pam->virtual = virtual; |
406 | 394 | ||
@@ -413,9 +401,6 @@ void set_page_address(struct page *page, void *virtual) | |||
413 | if (pam->page == page) { | 401 | if (pam->page == page) { |
414 | list_del(&pam->list); | 402 | list_del(&pam->list); |
415 | spin_unlock_irqrestore(&pas->lock, flags); | 403 | spin_unlock_irqrestore(&pas->lock, flags); |
416 | spin_lock_irqsave(&pool_lock, flags); | ||
417 | list_add_tail(&pam->list, &page_address_pool); | ||
418 | spin_unlock_irqrestore(&pool_lock, flags); | ||
419 | goto done; | 404 | goto done; |
420 | } | 405 | } |
421 | } | 406 | } |
@@ -425,20 +410,14 @@ done: | |||
425 | return; | 410 | return; |
426 | } | 411 | } |
427 | 412 | ||
428 | static struct page_address_map page_address_maps[LAST_PKMAP]; | ||
429 | |||
430 | void __init page_address_init(void) | 413 | void __init page_address_init(void) |
431 | { | 414 | { |
432 | int i; | 415 | int i; |
433 | 416 | ||
434 | INIT_LIST_HEAD(&page_address_pool); | ||
435 | for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) | ||
436 | list_add(&page_address_maps[i].list, &page_address_pool); | ||
437 | for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { | 417 | for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { |
438 | INIT_LIST_HEAD(&page_address_htable[i].lh); | 418 | INIT_LIST_HEAD(&page_address_htable[i].lh); |
439 | spin_lock_init(&page_address_htable[i].lock); | 419 | spin_lock_init(&page_address_htable[i].lock); |
440 | } | 420 | } |
441 | spin_lock_init(&pool_lock); | ||
442 | } | 421 | } |
443 | 422 | ||
444 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ | 423 | #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..32754eece63e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,15 @@ | |||
12 | #include <linux/mmu_notifier.h> | 12 | #include <linux/mmu_notifier.h> |
13 | #include <linux/rmap.h> | 13 | #include <linux/rmap.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/shrinker.h> | ||
15 | #include <linux/mm_inline.h> | 16 | #include <linux/mm_inline.h> |
16 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
17 | #include <linux/khugepaged.h> | 18 | #include <linux/khugepaged.h> |
18 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
19 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
20 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | ||
23 | |||
21 | #include <asm/tlb.h> | 24 | #include <asm/tlb.h> |
22 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
23 | #include "internal.h" | 26 | #include "internal.h" |
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly = | |||
37 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | 40 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| |
38 | #endif | 41 | #endif |
39 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | 42 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| |
40 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 43 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| |
44 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
41 | 45 | ||
42 | /* default scan 8*512 pte (or vmas) every 30 second */ | 46 | /* default scan 8*512 pte (or vmas) every 30 second */ |
43 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | 47 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; |
@@ -159,6 +163,77 @@ static int start_khugepaged(void) | |||
159 | return err; | 163 | return err; |
160 | } | 164 | } |
161 | 165 | ||
166 | static atomic_t huge_zero_refcount; | ||
167 | static unsigned long huge_zero_pfn __read_mostly; | ||
168 | |||
169 | static inline bool is_huge_zero_pfn(unsigned long pfn) | ||
170 | { | ||
171 | unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); | ||
172 | return zero_pfn && pfn == zero_pfn; | ||
173 | } | ||
174 | |||
175 | static inline bool is_huge_zero_pmd(pmd_t pmd) | ||
176 | { | ||
177 | return is_huge_zero_pfn(pmd_pfn(pmd)); | ||
178 | } | ||
179 | |||
180 | static unsigned long get_huge_zero_page(void) | ||
181 | { | ||
182 | struct page *zero_page; | ||
183 | retry: | ||
184 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | ||
185 | return ACCESS_ONCE(huge_zero_pfn); | ||
186 | |||
187 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | ||
188 | HPAGE_PMD_ORDER); | ||
189 | if (!zero_page) { | ||
190 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); | ||
191 | return 0; | ||
192 | } | ||
193 | count_vm_event(THP_ZERO_PAGE_ALLOC); | ||
194 | preempt_disable(); | ||
195 | if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { | ||
196 | preempt_enable(); | ||
197 | __free_page(zero_page); | ||
198 | goto retry; | ||
199 | } | ||
200 | |||
201 | /* We take additional reference here. It will be put back by shrinker */ | ||
202 | atomic_set(&huge_zero_refcount, 2); | ||
203 | preempt_enable(); | ||
204 | return ACCESS_ONCE(huge_zero_pfn); | ||
205 | } | ||
206 | |||
207 | static void put_huge_zero_page(void) | ||
208 | { | ||
209 | /* | ||
210 | * Counter should never go to zero here. Only shrinker can put | ||
211 | * last reference. | ||
212 | */ | ||
213 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); | ||
214 | } | ||
215 | |||
216 | static int shrink_huge_zero_page(struct shrinker *shrink, | ||
217 | struct shrink_control *sc) | ||
218 | { | ||
219 | if (!sc->nr_to_scan) | ||
220 | /* we can free zero page only if last reference remains */ | ||
221 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; | ||
222 | |||
223 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { | ||
224 | unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); | ||
225 | BUG_ON(zero_pfn == 0); | ||
226 | __free_page(__pfn_to_page(zero_pfn)); | ||
227 | } | ||
228 | |||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | static struct shrinker huge_zero_page_shrinker = { | ||
233 | .shrink = shrink_huge_zero_page, | ||
234 | .seeks = DEFAULT_SEEKS, | ||
235 | }; | ||
236 | |||
162 | #ifdef CONFIG_SYSFS | 237 | #ifdef CONFIG_SYSFS |
163 | 238 | ||
164 | static ssize_t double_flag_show(struct kobject *kobj, | 239 | static ssize_t double_flag_show(struct kobject *kobj, |
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj, | |||
284 | static struct kobj_attribute defrag_attr = | 359 | static struct kobj_attribute defrag_attr = |
285 | __ATTR(defrag, 0644, defrag_show, defrag_store); | 360 | __ATTR(defrag, 0644, defrag_show, defrag_store); |
286 | 361 | ||
362 | static ssize_t use_zero_page_show(struct kobject *kobj, | ||
363 | struct kobj_attribute *attr, char *buf) | ||
364 | { | ||
365 | return single_flag_show(kobj, attr, buf, | ||
366 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
367 | } | ||
368 | static ssize_t use_zero_page_store(struct kobject *kobj, | ||
369 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
370 | { | ||
371 | return single_flag_store(kobj, attr, buf, count, | ||
372 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
373 | } | ||
374 | static struct kobj_attribute use_zero_page_attr = | ||
375 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); | ||
287 | #ifdef CONFIG_DEBUG_VM | 376 | #ifdef CONFIG_DEBUG_VM |
288 | static ssize_t debug_cow_show(struct kobject *kobj, | 377 | static ssize_t debug_cow_show(struct kobject *kobj, |
289 | struct kobj_attribute *attr, char *buf) | 378 | struct kobj_attribute *attr, char *buf) |
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr = | |||
305 | static struct attribute *hugepage_attr[] = { | 394 | static struct attribute *hugepage_attr[] = { |
306 | &enabled_attr.attr, | 395 | &enabled_attr.attr, |
307 | &defrag_attr.attr, | 396 | &defrag_attr.attr, |
397 | &use_zero_page_attr.attr, | ||
308 | #ifdef CONFIG_DEBUG_VM | 398 | #ifdef CONFIG_DEBUG_VM |
309 | &debug_cow_attr.attr, | 399 | &debug_cow_attr.attr, |
310 | #endif | 400 | #endif |
@@ -550,6 +640,8 @@ static int __init hugepage_init(void) | |||
550 | goto out; | 640 | goto out; |
551 | } | 641 | } |
552 | 642 | ||
643 | register_shrinker(&huge_zero_page_shrinker); | ||
644 | |||
553 | /* | 645 | /* |
554 | * By default disable transparent hugepages on smaller systems, | 646 | * By default disable transparent hugepages on smaller systems, |
555 | * where the extra memory used could hurt more than TLB overhead | 647 | * where the extra memory used could hurt more than TLB overhead |
@@ -599,13 +691,22 @@ out: | |||
599 | } | 691 | } |
600 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 692 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
601 | 693 | ||
602 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
603 | { | 695 | { |
604 | if (likely(vma->vm_flags & VM_WRITE)) | 696 | if (likely(vma->vm_flags & VM_WRITE)) |
605 | pmd = pmd_mkwrite(pmd); | 697 | pmd = pmd_mkwrite(pmd); |
606 | return pmd; | 698 | return pmd; |
607 | } | 699 | } |
608 | 700 | ||
701 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) | ||
702 | { | ||
703 | pmd_t entry; | ||
704 | entry = mk_pmd(page, vma->vm_page_prot); | ||
705 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
706 | entry = pmd_mkhuge(entry); | ||
707 | return entry; | ||
708 | } | ||
709 | |||
609 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 710 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
610 | struct vm_area_struct *vma, | 711 | struct vm_area_struct *vma, |
611 | unsigned long haddr, pmd_t *pmd, | 712 | unsigned long haddr, pmd_t *pmd, |
@@ -629,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
629 | pte_free(mm, pgtable); | 730 | pte_free(mm, pgtable); |
630 | } else { | 731 | } else { |
631 | pmd_t entry; | 732 | pmd_t entry; |
632 | entry = mk_pmd(page, vma->vm_page_prot); | 733 | entry = mk_huge_pmd(page, vma); |
633 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
634 | entry = pmd_mkhuge(entry); | ||
635 | /* | 734 | /* |
636 | * The spinlocking to take the lru_lock inside | 735 | * The spinlocking to take the lru_lock inside |
637 | * page_add_new_anon_rmap() acts as a full memory | 736 | * page_add_new_anon_rmap() acts as a full memory |
@@ -671,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag) | |||
671 | } | 770 | } |
672 | #endif | 771 | #endif |
673 | 772 | ||
773 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | ||
774 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | ||
775 | unsigned long zero_pfn) | ||
776 | { | ||
777 | pmd_t entry; | ||
778 | if (!pmd_none(*pmd)) | ||
779 | return false; | ||
780 | entry = pfn_pmd(zero_pfn, vma->vm_page_prot); | ||
781 | entry = pmd_wrprotect(entry); | ||
782 | entry = pmd_mkhuge(entry); | ||
783 | set_pmd_at(mm, haddr, pmd, entry); | ||
784 | pgtable_trans_huge_deposit(mm, pgtable); | ||
785 | mm->nr_ptes++; | ||
786 | return true; | ||
787 | } | ||
788 | |||
674 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 789 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
675 | unsigned long address, pmd_t *pmd, | 790 | unsigned long address, pmd_t *pmd, |
676 | unsigned int flags) | 791 | unsigned int flags) |
@@ -684,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
684 | return VM_FAULT_OOM; | 799 | return VM_FAULT_OOM; |
685 | if (unlikely(khugepaged_enter(vma))) | 800 | if (unlikely(khugepaged_enter(vma))) |
686 | return VM_FAULT_OOM; | 801 | return VM_FAULT_OOM; |
802 | if (!(flags & FAULT_FLAG_WRITE) && | ||
803 | transparent_hugepage_use_zero_page()) { | ||
804 | pgtable_t pgtable; | ||
805 | unsigned long zero_pfn; | ||
806 | bool set; | ||
807 | pgtable = pte_alloc_one(mm, haddr); | ||
808 | if (unlikely(!pgtable)) | ||
809 | return VM_FAULT_OOM; | ||
810 | zero_pfn = get_huge_zero_page(); | ||
811 | if (unlikely(!zero_pfn)) { | ||
812 | pte_free(mm, pgtable); | ||
813 | count_vm_event(THP_FAULT_FALLBACK); | ||
814 | goto out; | ||
815 | } | ||
816 | spin_lock(&mm->page_table_lock); | ||
817 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | ||
818 | zero_pfn); | ||
819 | spin_unlock(&mm->page_table_lock); | ||
820 | if (!set) { | ||
821 | pte_free(mm, pgtable); | ||
822 | put_huge_zero_page(); | ||
823 | } | ||
824 | return 0; | ||
825 | } | ||
687 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 826 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
688 | vma, haddr, numa_node_id(), 0); | 827 | vma, haddr, numa_node_id(), 0); |
689 | if (unlikely(!page)) { | 828 | if (unlikely(!page)) { |
@@ -710,7 +849,8 @@ out: | |||
710 | * run pte_offset_map on the pmd, if an huge pmd could | 849 | * run pte_offset_map on the pmd, if an huge pmd could |
711 | * materialize from under us from a different thread. | 850 | * materialize from under us from a different thread. |
712 | */ | 851 | */ |
713 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | 852 | if (unlikely(pmd_none(*pmd)) && |
853 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
714 | return VM_FAULT_OOM; | 854 | return VM_FAULT_OOM; |
715 | /* if an huge pmd materialized from under us just retry later */ | 855 | /* if an huge pmd materialized from under us just retry later */ |
716 | if (unlikely(pmd_trans_huge(*pmd))) | 856 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -748,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
748 | pte_free(dst_mm, pgtable); | 888 | pte_free(dst_mm, pgtable); |
749 | goto out_unlock; | 889 | goto out_unlock; |
750 | } | 890 | } |
891 | /* | ||
892 | * mm->page_table_lock is enough to be sure that huge zero pmd is not | ||
893 | * under splitting since we don't split the page itself, only pmd to | ||
894 | * a page table. | ||
895 | */ | ||
896 | if (is_huge_zero_pmd(pmd)) { | ||
897 | unsigned long zero_pfn; | ||
898 | bool set; | ||
899 | /* | ||
900 | * get_huge_zero_page() will never allocate a new page here, | ||
901 | * since we already have a zero page to copy. It just takes a | ||
902 | * reference. | ||
903 | */ | ||
904 | zero_pfn = get_huge_zero_page(); | ||
905 | set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, | ||
906 | zero_pfn); | ||
907 | BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ | ||
908 | ret = 0; | ||
909 | goto out_unlock; | ||
910 | } | ||
751 | if (unlikely(pmd_trans_splitting(pmd))) { | 911 | if (unlikely(pmd_trans_splitting(pmd))) { |
752 | /* split huge page running from under us */ | 912 | /* split huge page running from under us */ |
753 | spin_unlock(&src_mm->page_table_lock); | 913 | spin_unlock(&src_mm->page_table_lock); |
@@ -777,6 +937,102 @@ out: | |||
777 | return ret; | 937 | return ret; |
778 | } | 938 | } |
779 | 939 | ||
940 | void huge_pmd_set_accessed(struct mm_struct *mm, | ||
941 | struct vm_area_struct *vma, | ||
942 | unsigned long address, | ||
943 | pmd_t *pmd, pmd_t orig_pmd, | ||
944 | int dirty) | ||
945 | { | ||
946 | pmd_t entry; | ||
947 | unsigned long haddr; | ||
948 | |||
949 | spin_lock(&mm->page_table_lock); | ||
950 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
951 | goto unlock; | ||
952 | |||
953 | entry = pmd_mkyoung(orig_pmd); | ||
954 | haddr = address & HPAGE_PMD_MASK; | ||
955 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) | ||
956 | update_mmu_cache_pmd(vma, address, pmd); | ||
957 | |||
958 | unlock: | ||
959 | spin_unlock(&mm->page_table_lock); | ||
960 | } | ||
961 | |||
962 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | ||
963 | struct vm_area_struct *vma, unsigned long address, | ||
964 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) | ||
965 | { | ||
966 | pgtable_t pgtable; | ||
967 | pmd_t _pmd; | ||
968 | struct page *page; | ||
969 | int i, ret = 0; | ||
970 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
971 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
972 | |||
973 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
974 | if (!page) { | ||
975 | ret |= VM_FAULT_OOM; | ||
976 | goto out; | ||
977 | } | ||
978 | |||
979 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | ||
980 | put_page(page); | ||
981 | ret |= VM_FAULT_OOM; | ||
982 | goto out; | ||
983 | } | ||
984 | |||
985 | clear_user_highpage(page, address); | ||
986 | __SetPageUptodate(page); | ||
987 | |||
988 | mmun_start = haddr; | ||
989 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
990 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
991 | |||
992 | spin_lock(&mm->page_table_lock); | ||
993 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
994 | goto out_free_page; | ||
995 | |||
996 | pmdp_clear_flush(vma, haddr, pmd); | ||
997 | /* leave pmd empty until pte is filled */ | ||
998 | |||
999 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
1000 | pmd_populate(mm, &_pmd, pgtable); | ||
1001 | |||
1002 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
1003 | pte_t *pte, entry; | ||
1004 | if (haddr == (address & PAGE_MASK)) { | ||
1005 | entry = mk_pte(page, vma->vm_page_prot); | ||
1006 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1007 | page_add_new_anon_rmap(page, vma, haddr); | ||
1008 | } else { | ||
1009 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
1010 | entry = pte_mkspecial(entry); | ||
1011 | } | ||
1012 | pte = pte_offset_map(&_pmd, haddr); | ||
1013 | VM_BUG_ON(!pte_none(*pte)); | ||
1014 | set_pte_at(mm, haddr, pte, entry); | ||
1015 | pte_unmap(pte); | ||
1016 | } | ||
1017 | smp_wmb(); /* make pte visible before pmd */ | ||
1018 | pmd_populate(mm, pmd, pgtable); | ||
1019 | spin_unlock(&mm->page_table_lock); | ||
1020 | put_huge_zero_page(); | ||
1021 | inc_mm_counter(mm, MM_ANONPAGES); | ||
1022 | |||
1023 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1024 | |||
1025 | ret |= VM_FAULT_WRITE; | ||
1026 | out: | ||
1027 | return ret; | ||
1028 | out_free_page: | ||
1029 | spin_unlock(&mm->page_table_lock); | ||
1030 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1031 | mem_cgroup_uncharge_page(page); | ||
1032 | put_page(page); | ||
1033 | goto out; | ||
1034 | } | ||
1035 | |||
780 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 1036 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
781 | struct vm_area_struct *vma, | 1037 | struct vm_area_struct *vma, |
782 | unsigned long address, | 1038 | unsigned long address, |
@@ -883,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
883 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | 1139 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) |
884 | { | 1140 | { |
885 | int ret = 0; | 1141 | int ret = 0; |
886 | struct page *page, *new_page; | 1142 | struct page *page = NULL, *new_page; |
887 | unsigned long haddr; | 1143 | unsigned long haddr; |
888 | unsigned long mmun_start; /* For mmu_notifiers */ | 1144 | unsigned long mmun_start; /* For mmu_notifiers */ |
889 | unsigned long mmun_end; /* For mmu_notifiers */ | 1145 | unsigned long mmun_end; /* For mmu_notifiers */ |
890 | 1146 | ||
891 | VM_BUG_ON(!vma->anon_vma); | 1147 | VM_BUG_ON(!vma->anon_vma); |
1148 | haddr = address & HPAGE_PMD_MASK; | ||
1149 | if (is_huge_zero_pmd(orig_pmd)) | ||
1150 | goto alloc; | ||
892 | spin_lock(&mm->page_table_lock); | 1151 | spin_lock(&mm->page_table_lock); |
893 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1152 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
894 | goto out_unlock; | 1153 | goto out_unlock; |
895 | 1154 | ||
896 | page = pmd_page(orig_pmd); | 1155 | page = pmd_page(orig_pmd); |
897 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | 1156 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); |
898 | haddr = address & HPAGE_PMD_MASK; | ||
899 | if (page_mapcount(page) == 1) { | 1157 | if (page_mapcount(page) == 1) { |
900 | pmd_t entry; | 1158 | pmd_t entry; |
901 | entry = pmd_mkyoung(orig_pmd); | 1159 | entry = pmd_mkyoung(orig_pmd); |
@@ -907,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
907 | } | 1165 | } |
908 | get_page(page); | 1166 | get_page(page); |
909 | spin_unlock(&mm->page_table_lock); | 1167 | spin_unlock(&mm->page_table_lock); |
910 | 1168 | alloc: | |
911 | if (transparent_hugepage_enabled(vma) && | 1169 | if (transparent_hugepage_enabled(vma) && |
912 | !transparent_hugepage_debug_cow()) | 1170 | !transparent_hugepage_debug_cow()) |
913 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1171 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
@@ -917,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
917 | 1175 | ||
918 | if (unlikely(!new_page)) { | 1176 | if (unlikely(!new_page)) { |
919 | count_vm_event(THP_FAULT_FALLBACK); | 1177 | count_vm_event(THP_FAULT_FALLBACK); |
920 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 1178 | if (is_huge_zero_pmd(orig_pmd)) { |
921 | pmd, orig_pmd, page, haddr); | 1179 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
922 | if (ret & VM_FAULT_OOM) | 1180 | address, pmd, orig_pmd, haddr); |
923 | split_huge_page(page); | 1181 | } else { |
924 | put_page(page); | 1182 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
1183 | pmd, orig_pmd, page, haddr); | ||
1184 | if (ret & VM_FAULT_OOM) | ||
1185 | split_huge_page(page); | ||
1186 | put_page(page); | ||
1187 | } | ||
925 | goto out; | 1188 | goto out; |
926 | } | 1189 | } |
927 | count_vm_event(THP_FAULT_ALLOC); | 1190 | count_vm_event(THP_FAULT_ALLOC); |
928 | 1191 | ||
929 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1192 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
930 | put_page(new_page); | 1193 | put_page(new_page); |
931 | split_huge_page(page); | 1194 | if (page) { |
932 | put_page(page); | 1195 | split_huge_page(page); |
1196 | put_page(page); | ||
1197 | } | ||
933 | ret |= VM_FAULT_OOM; | 1198 | ret |= VM_FAULT_OOM; |
934 | goto out; | 1199 | goto out; |
935 | } | 1200 | } |
936 | 1201 | ||
937 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 1202 | if (is_huge_zero_pmd(orig_pmd)) |
1203 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | ||
1204 | else | ||
1205 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | ||
938 | __SetPageUptodate(new_page); | 1206 | __SetPageUptodate(new_page); |
939 | 1207 | ||
940 | mmun_start = haddr; | 1208 | mmun_start = haddr; |
@@ -942,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
942 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1210 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
943 | 1211 | ||
944 | spin_lock(&mm->page_table_lock); | 1212 | spin_lock(&mm->page_table_lock); |
945 | put_page(page); | 1213 | if (page) |
1214 | put_page(page); | ||
946 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1215 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
947 | spin_unlock(&mm->page_table_lock); | 1216 | spin_unlock(&mm->page_table_lock); |
948 | mem_cgroup_uncharge_page(new_page); | 1217 | mem_cgroup_uncharge_page(new_page); |
@@ -950,16 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
950 | goto out_mn; | 1219 | goto out_mn; |
951 | } else { | 1220 | } else { |
952 | pmd_t entry; | 1221 | pmd_t entry; |
953 | VM_BUG_ON(!PageHead(page)); | 1222 | entry = mk_huge_pmd(new_page, vma); |
954 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
955 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
956 | entry = pmd_mkhuge(entry); | ||
957 | pmdp_clear_flush(vma, haddr, pmd); | 1223 | pmdp_clear_flush(vma, haddr, pmd); |
958 | page_add_new_anon_rmap(new_page, vma, haddr); | 1224 | page_add_new_anon_rmap(new_page, vma, haddr); |
959 | set_pmd_at(mm, haddr, pmd, entry); | 1225 | set_pmd_at(mm, haddr, pmd, entry); |
960 | update_mmu_cache_pmd(vma, address, pmd); | 1226 | update_mmu_cache_pmd(vma, address, pmd); |
961 | page_remove_rmap(page); | 1227 | if (is_huge_zero_pmd(orig_pmd)) { |
962 | put_page(page); | 1228 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1229 | put_huge_zero_page(); | ||
1230 | } else { | ||
1231 | VM_BUG_ON(!PageHead(page)); | ||
1232 | page_remove_rmap(page); | ||
1233 | put_page(page); | ||
1234 | } | ||
963 | ret |= VM_FAULT_WRITE; | 1235 | ret |= VM_FAULT_WRITE; |
964 | } | 1236 | } |
965 | spin_unlock(&mm->page_table_lock); | 1237 | spin_unlock(&mm->page_table_lock); |
@@ -1017,6 +1289,81 @@ out: | |||
1017 | return page; | 1289 | return page; |
1018 | } | 1290 | } |
1019 | 1291 | ||
1292 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1295 | { | ||
1296 | struct page *page; | ||
1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1298 | int target_nid; | ||
1299 | int current_nid = -1; | ||
1300 | bool migrated; | ||
1301 | bool page_locked = false; | ||
1302 | |||
1303 | spin_lock(&mm->page_table_lock); | ||
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1305 | goto out_unlock; | ||
1306 | |||
1307 | page = pmd_page(pmd); | ||
1308 | get_page(page); | ||
1309 | current_nid = page_to_nid(page); | ||
1310 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
1311 | if (current_nid == numa_node_id()) | ||
1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
1313 | |||
1314 | target_nid = mpol_misplaced(page, vma, haddr); | ||
1315 | if (target_nid == -1) { | ||
1316 | put_page(page); | ||
1317 | goto clear_pmdnuma; | ||
1318 | } | ||
1319 | |||
1320 | /* Acquire the page lock to serialise THP migrations */ | ||
1321 | spin_unlock(&mm->page_table_lock); | ||
1322 | lock_page(page); | ||
1323 | page_locked = true; | ||
1324 | |||
1325 | /* Confirm the PTE did not change while locked */ | ||
1326 | spin_lock(&mm->page_table_lock); | ||
1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1328 | unlock_page(page); | ||
1329 | put_page(page); | ||
1330 | goto out_unlock; | ||
1331 | } | ||
1332 | spin_unlock(&mm->page_table_lock); | ||
1333 | |||
1334 | /* Migrate the THP to the requested node */ | ||
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | ||
1336 | pmdp, pmd, addr, | ||
1337 | page, target_nid); | ||
1338 | if (migrated) | ||
1339 | current_nid = target_nid; | ||
1340 | else { | ||
1341 | spin_lock(&mm->page_table_lock); | ||
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1343 | unlock_page(page); | ||
1344 | goto out_unlock; | ||
1345 | } | ||
1346 | goto clear_pmdnuma; | ||
1347 | } | ||
1348 | |||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1350 | return 0; | ||
1351 | |||
1352 | clear_pmdnuma: | ||
1353 | pmd = pmd_mknonnuma(pmd); | ||
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1357 | if (page_locked) | ||
1358 | unlock_page(page); | ||
1359 | |||
1360 | out_unlock: | ||
1361 | spin_unlock(&mm->page_table_lock); | ||
1362 | if (current_nid != -1) | ||
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1364 | return 0; | ||
1365 | } | ||
1366 | |||
1020 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1021 | pmd_t *pmd, unsigned long addr) | 1368 | pmd_t *pmd, unsigned long addr) |
1022 | { | 1369 | { |
@@ -1028,15 +1375,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1028 | pmd_t orig_pmd; | 1375 | pmd_t orig_pmd; |
1029 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); | 1376 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1030 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); | 1377 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1031 | page = pmd_page(orig_pmd); | ||
1032 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1378 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1033 | page_remove_rmap(page); | 1379 | if (is_huge_zero_pmd(orig_pmd)) { |
1034 | VM_BUG_ON(page_mapcount(page) < 0); | 1380 | tlb->mm->nr_ptes--; |
1035 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1381 | spin_unlock(&tlb->mm->page_table_lock); |
1036 | VM_BUG_ON(!PageHead(page)); | 1382 | put_huge_zero_page(); |
1037 | tlb->mm->nr_ptes--; | 1383 | } else { |
1038 | spin_unlock(&tlb->mm->page_table_lock); | 1384 | page = pmd_page(orig_pmd); |
1039 | tlb_remove_page(tlb, page); | 1385 | page_remove_rmap(page); |
1386 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1387 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1388 | VM_BUG_ON(!PageHead(page)); | ||
1389 | tlb->mm->nr_ptes--; | ||
1390 | spin_unlock(&tlb->mm->page_table_lock); | ||
1391 | tlb_remove_page(tlb, page); | ||
1392 | } | ||
1040 | pte_free(tlb->mm, pgtable); | 1393 | pte_free(tlb->mm, pgtable); |
1041 | ret = 1; | 1394 | ret = 1; |
1042 | } | 1395 | } |
@@ -1099,7 +1452,7 @@ out: | |||
1099 | } | 1452 | } |
1100 | 1453 | ||
1101 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1102 | unsigned long addr, pgprot_t newprot) | 1455 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1103 | { | 1456 | { |
1104 | struct mm_struct *mm = vma->vm_mm; | 1457 | struct mm_struct *mm = vma->vm_mm; |
1105 | int ret = 0; | 1458 | int ret = 0; |
@@ -1107,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1107 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1108 | pmd_t entry; | 1461 | pmd_t entry; |
1109 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1462 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1110 | entry = pmd_modify(entry, newprot); | 1463 | if (!prot_numa) { |
1464 | entry = pmd_modify(entry, newprot); | ||
1465 | BUG_ON(pmd_write(entry)); | ||
1466 | } else { | ||
1467 | struct page *page = pmd_page(*pmd); | ||
1468 | |||
1469 | /* only check non-shared pages */ | ||
1470 | if (page_mapcount(page) == 1 && | ||
1471 | !pmd_numa(*pmd)) { | ||
1472 | entry = pmd_mknuma(entry); | ||
1473 | } | ||
1474 | } | ||
1111 | set_pmd_at(mm, addr, pmd, entry); | 1475 | set_pmd_at(mm, addr, pmd, entry); |
1112 | spin_unlock(&vma->vm_mm->page_table_lock); | 1476 | spin_unlock(&vma->vm_mm->page_table_lock); |
1113 | ret = 1; | 1477 | ret = 1; |
@@ -1146,22 +1510,14 @@ pmd_t *page_check_address_pmd(struct page *page, | |||
1146 | unsigned long address, | 1510 | unsigned long address, |
1147 | enum page_check_address_pmd_flag flag) | 1511 | enum page_check_address_pmd_flag flag) |
1148 | { | 1512 | { |
1149 | pgd_t *pgd; | ||
1150 | pud_t *pud; | ||
1151 | pmd_t *pmd, *ret = NULL; | 1513 | pmd_t *pmd, *ret = NULL; |
1152 | 1514 | ||
1153 | if (address & ~HPAGE_PMD_MASK) | 1515 | if (address & ~HPAGE_PMD_MASK) |
1154 | goto out; | 1516 | goto out; |
1155 | 1517 | ||
1156 | pgd = pgd_offset(mm, address); | 1518 | pmd = mm_find_pmd(mm, address); |
1157 | if (!pgd_present(*pgd)) | 1519 | if (!pmd) |
1158 | goto out; | ||
1159 | |||
1160 | pud = pud_offset(pgd, address); | ||
1161 | if (!pud_present(*pud)) | ||
1162 | goto out; | 1520 | goto out; |
1163 | |||
1164 | pmd = pmd_offset(pud, address); | ||
1165 | if (pmd_none(*pmd)) | 1521 | if (pmd_none(*pmd)) |
1166 | goto out; | 1522 | goto out; |
1167 | if (pmd_page(*pmd) != page) | 1523 | if (pmd_page(*pmd) != page) |
@@ -1205,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1205 | * We can't temporarily set the pmd to null in order | 1561 | * We can't temporarily set the pmd to null in order |
1206 | * to split it, the pmd must remain marked huge at all | 1562 | * to split it, the pmd must remain marked huge at all |
1207 | * times or the VM won't take the pmd_trans_huge paths | 1563 | * times or the VM won't take the pmd_trans_huge paths |
1208 | * and it won't wait on the anon_vma->root->mutex to | 1564 | * and it won't wait on the anon_vma->root->rwsem to |
1209 | * serialize against split_huge_page*. | 1565 | * serialize against split_huge_page*. |
1210 | */ | 1566 | */ |
1211 | pmdp_splitting_flush(vma, address, pmd); | 1567 | pmdp_splitting_flush(vma, address, pmd); |
@@ -1296,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1296 | page_tail->mapping = page->mapping; | 1652 | page_tail->mapping = page->mapping; |
1297 | 1653 | ||
1298 | page_tail->index = page->index + i; | 1654 | page_tail->index = page->index + i; |
1655 | page_xchg_last_nid(page_tail, page_last_nid(page)); | ||
1299 | 1656 | ||
1300 | BUG_ON(!PageAnon(page_tail)); | 1657 | BUG_ON(!PageAnon(page_tail)); |
1301 | BUG_ON(!PageUptodate(page_tail)); | 1658 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1363,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, | |||
1363 | BUG_ON(page_mapcount(page) != 1); | 1720 | BUG_ON(page_mapcount(page) != 1); |
1364 | if (!pmd_young(*pmd)) | 1721 | if (!pmd_young(*pmd)) |
1365 | entry = pte_mkold(entry); | 1722 | entry = pte_mkold(entry); |
1723 | if (pmd_numa(*pmd)) | ||
1724 | entry = pte_mknuma(entry); | ||
1366 | pte = pte_offset_map(&_pmd, haddr); | 1725 | pte = pte_offset_map(&_pmd, haddr); |
1367 | BUG_ON(!pte_none(*pte)); | 1726 | BUG_ON(!pte_none(*pte)); |
1368 | set_pte_at(mm, haddr, pte, entry); | 1727 | set_pte_at(mm, haddr, pte, entry); |
@@ -1405,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, | |||
1405 | return ret; | 1764 | return ret; |
1406 | } | 1765 | } |
1407 | 1766 | ||
1408 | /* must be called with anon_vma->root->mutex hold */ | 1767 | /* must be called with anon_vma->root->rwsem held */ |
1409 | static void __split_huge_page(struct page *page, | 1768 | static void __split_huge_page(struct page *page, |
1410 | struct anon_vma *anon_vma) | 1769 | struct anon_vma *anon_vma) |
1411 | { | 1770 | { |
@@ -1458,8 +1817,9 @@ int split_huge_page(struct page *page) | |||
1458 | struct anon_vma *anon_vma; | 1817 | struct anon_vma *anon_vma; |
1459 | int ret = 1; | 1818 | int ret = 1; |
1460 | 1819 | ||
1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | ||
1461 | BUG_ON(!PageAnon(page)); | 1821 | BUG_ON(!PageAnon(page)); |
1462 | anon_vma = page_lock_anon_vma(page); | 1822 | anon_vma = page_lock_anon_vma_read(page); |
1463 | if (!anon_vma) | 1823 | if (!anon_vma) |
1464 | goto out; | 1824 | goto out; |
1465 | ret = 0; | 1825 | ret = 0; |
@@ -1472,7 +1832,7 @@ int split_huge_page(struct page *page) | |||
1472 | 1832 | ||
1473 | BUG_ON(PageCompound(page)); | 1833 | BUG_ON(PageCompound(page)); |
1474 | out_unlock: | 1834 | out_unlock: |
1475 | page_unlock_anon_vma(anon_vma); | 1835 | page_unlock_anon_vma_read(anon_vma); |
1476 | out: | 1836 | out: |
1477 | return ret; | 1837 | return ret; |
1478 | } | 1838 | } |
@@ -1701,64 +2061,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) | |||
1701 | } | 2061 | } |
1702 | } | 2062 | } |
1703 | 2063 | ||
1704 | static void release_all_pte_pages(pte_t *pte) | ||
1705 | { | ||
1706 | release_pte_pages(pte, pte + HPAGE_PMD_NR); | ||
1707 | } | ||
1708 | |||
1709 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | 2064 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, |
1710 | unsigned long address, | 2065 | unsigned long address, |
1711 | pte_t *pte) | 2066 | pte_t *pte) |
1712 | { | 2067 | { |
1713 | struct page *page; | 2068 | struct page *page; |
1714 | pte_t *_pte; | 2069 | pte_t *_pte; |
1715 | int referenced = 0, isolated = 0, none = 0; | 2070 | int referenced = 0, none = 0; |
1716 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2071 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
1717 | _pte++, address += PAGE_SIZE) { | 2072 | _pte++, address += PAGE_SIZE) { |
1718 | pte_t pteval = *_pte; | 2073 | pte_t pteval = *_pte; |
1719 | if (pte_none(pteval)) { | 2074 | if (pte_none(pteval)) { |
1720 | if (++none <= khugepaged_max_ptes_none) | 2075 | if (++none <= khugepaged_max_ptes_none) |
1721 | continue; | 2076 | continue; |
1722 | else { | 2077 | else |
1723 | release_pte_pages(pte, _pte); | ||
1724 | goto out; | 2078 | goto out; |
1725 | } | ||
1726 | } | 2079 | } |
1727 | if (!pte_present(pteval) || !pte_write(pteval)) { | 2080 | if (!pte_present(pteval) || !pte_write(pteval)) |
1728 | release_pte_pages(pte, _pte); | ||
1729 | goto out; | 2081 | goto out; |
1730 | } | ||
1731 | page = vm_normal_page(vma, address, pteval); | 2082 | page = vm_normal_page(vma, address, pteval); |
1732 | if (unlikely(!page)) { | 2083 | if (unlikely(!page)) |
1733 | release_pte_pages(pte, _pte); | ||
1734 | goto out; | 2084 | goto out; |
1735 | } | 2085 | |
1736 | VM_BUG_ON(PageCompound(page)); | 2086 | VM_BUG_ON(PageCompound(page)); |
1737 | BUG_ON(!PageAnon(page)); | 2087 | BUG_ON(!PageAnon(page)); |
1738 | VM_BUG_ON(!PageSwapBacked(page)); | 2088 | VM_BUG_ON(!PageSwapBacked(page)); |
1739 | 2089 | ||
1740 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2090 | /* cannot use mapcount: can't collapse if there's a gup pin */ |
1741 | if (page_count(page) != 1) { | 2091 | if (page_count(page) != 1) |
1742 | release_pte_pages(pte, _pte); | ||
1743 | goto out; | 2092 | goto out; |
1744 | } | ||
1745 | /* | 2093 | /* |
1746 | * We can do it before isolate_lru_page because the | 2094 | * We can do it before isolate_lru_page because the |
1747 | * page can't be freed from under us. NOTE: PG_lock | 2095 | * page can't be freed from under us. NOTE: PG_lock |
1748 | * is needed to serialize against split_huge_page | 2096 | * is needed to serialize against split_huge_page |
1749 | * when invoked from the VM. | 2097 | * when invoked from the VM. |
1750 | */ | 2098 | */ |
1751 | if (!trylock_page(page)) { | 2099 | if (!trylock_page(page)) |
1752 | release_pte_pages(pte, _pte); | ||
1753 | goto out; | 2100 | goto out; |
1754 | } | ||
1755 | /* | 2101 | /* |
1756 | * Isolate the page to avoid collapsing an hugepage | 2102 | * Isolate the page to avoid collapsing an hugepage |
1757 | * currently in use by the VM. | 2103 | * currently in use by the VM. |
1758 | */ | 2104 | */ |
1759 | if (isolate_lru_page(page)) { | 2105 | if (isolate_lru_page(page)) { |
1760 | unlock_page(page); | 2106 | unlock_page(page); |
1761 | release_pte_pages(pte, _pte); | ||
1762 | goto out; | 2107 | goto out; |
1763 | } | 2108 | } |
1764 | /* 0 stands for page_is_file_cache(page) == false */ | 2109 | /* 0 stands for page_is_file_cache(page) == false */ |
@@ -1771,12 +2116,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
1771 | mmu_notifier_test_young(vma->vm_mm, address)) | 2116 | mmu_notifier_test_young(vma->vm_mm, address)) |
1772 | referenced = 1; | 2117 | referenced = 1; |
1773 | } | 2118 | } |
1774 | if (unlikely(!referenced)) | 2119 | if (likely(referenced)) |
1775 | release_all_pte_pages(pte); | 2120 | return 1; |
1776 | else | ||
1777 | isolated = 1; | ||
1778 | out: | 2121 | out: |
1779 | return isolated; | 2122 | release_pte_pages(pte, _pte); |
2123 | return 0; | ||
1780 | } | 2124 | } |
1781 | 2125 | ||
1782 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | 2126 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, |
@@ -1918,14 +2262,26 @@ static struct page | |||
1918 | } | 2262 | } |
1919 | #endif | 2263 | #endif |
1920 | 2264 | ||
2265 | static bool hugepage_vma_check(struct vm_area_struct *vma) | ||
2266 | { | ||
2267 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
2268 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
2269 | return false; | ||
2270 | |||
2271 | if (!vma->anon_vma || vma->vm_ops) | ||
2272 | return false; | ||
2273 | if (is_vma_temporary_stack(vma)) | ||
2274 | return false; | ||
2275 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | ||
2276 | return true; | ||
2277 | } | ||
2278 | |||
1921 | static void collapse_huge_page(struct mm_struct *mm, | 2279 | static void collapse_huge_page(struct mm_struct *mm, |
1922 | unsigned long address, | 2280 | unsigned long address, |
1923 | struct page **hpage, | 2281 | struct page **hpage, |
1924 | struct vm_area_struct *vma, | 2282 | struct vm_area_struct *vma, |
1925 | int node) | 2283 | int node) |
1926 | { | 2284 | { |
1927 | pgd_t *pgd; | ||
1928 | pud_t *pud; | ||
1929 | pmd_t *pmd, _pmd; | 2285 | pmd_t *pmd, _pmd; |
1930 | pte_t *pte; | 2286 | pte_t *pte; |
1931 | pgtable_t pgtable; | 2287 | pgtable_t pgtable; |
@@ -1960,31 +2316,15 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1960 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2316 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1961 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | 2317 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) |
1962 | goto out; | 2318 | goto out; |
1963 | 2319 | if (!hugepage_vma_check(vma)) | |
1964 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
1965 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
1966 | goto out; | ||
1967 | |||
1968 | if (!vma->anon_vma || vma->vm_ops) | ||
1969 | goto out; | 2320 | goto out; |
1970 | if (is_vma_temporary_stack(vma)) | 2321 | pmd = mm_find_pmd(mm, address); |
2322 | if (!pmd) | ||
1971 | goto out; | 2323 | goto out; |
1972 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2324 | if (pmd_trans_huge(*pmd)) |
1973 | |||
1974 | pgd = pgd_offset(mm, address); | ||
1975 | if (!pgd_present(*pgd)) | ||
1976 | goto out; | ||
1977 | |||
1978 | pud = pud_offset(pgd, address); | ||
1979 | if (!pud_present(*pud)) | ||
1980 | goto out; | ||
1981 | |||
1982 | pmd = pmd_offset(pud, address); | ||
1983 | /* pmd can't go away or become huge under us */ | ||
1984 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1985 | goto out; | 2325 | goto out; |
1986 | 2326 | ||
1987 | anon_vma_lock(vma->anon_vma); | 2327 | anon_vma_lock_write(vma->anon_vma); |
1988 | 2328 | ||
1989 | pte = pte_offset_map(pmd, address); | 2329 | pte = pte_offset_map(pmd, address); |
1990 | ptl = pte_lockptr(mm, pmd); | 2330 | ptl = pte_lockptr(mm, pmd); |
@@ -2028,9 +2368,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2028 | __SetPageUptodate(new_page); | 2368 | __SetPageUptodate(new_page); |
2029 | pgtable = pmd_pgtable(_pmd); | 2369 | pgtable = pmd_pgtable(_pmd); |
2030 | 2370 | ||
2031 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | 2371 | _pmd = mk_huge_pmd(new_page, vma); |
2032 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
2033 | _pmd = pmd_mkhuge(_pmd); | ||
2034 | 2372 | ||
2035 | /* | 2373 | /* |
2036 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2374 | * spin_lock() below is not the equivalent of smp_wmb(), so |
@@ -2064,8 +2402,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2064 | unsigned long address, | 2402 | unsigned long address, |
2065 | struct page **hpage) | 2403 | struct page **hpage) |
2066 | { | 2404 | { |
2067 | pgd_t *pgd; | ||
2068 | pud_t *pud; | ||
2069 | pmd_t *pmd; | 2405 | pmd_t *pmd; |
2070 | pte_t *pte, *_pte; | 2406 | pte_t *pte, *_pte; |
2071 | int ret = 0, referenced = 0, none = 0; | 2407 | int ret = 0, referenced = 0, none = 0; |
@@ -2076,16 +2412,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2076 | 2412 | ||
2077 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2413 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2078 | 2414 | ||
2079 | pgd = pgd_offset(mm, address); | 2415 | pmd = mm_find_pmd(mm, address); |
2080 | if (!pgd_present(*pgd)) | 2416 | if (!pmd) |
2081 | goto out; | 2417 | goto out; |
2082 | 2418 | if (pmd_trans_huge(*pmd)) | |
2083 | pud = pud_offset(pgd, address); | ||
2084 | if (!pud_present(*pud)) | ||
2085 | goto out; | ||
2086 | |||
2087 | pmd = pmd_offset(pud, address); | ||
2088 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
2089 | goto out; | 2419 | goto out; |
2090 | 2420 | ||
2091 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2421 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
@@ -2193,20 +2523,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2193 | progress++; | 2523 | progress++; |
2194 | break; | 2524 | break; |
2195 | } | 2525 | } |
2196 | 2526 | if (!hugepage_vma_check(vma)) { | |
2197 | if ((!(vma->vm_flags & VM_HUGEPAGE) && | 2527 | skip: |
2198 | !khugepaged_always()) || | ||
2199 | (vma->vm_flags & VM_NOHUGEPAGE)) { | ||
2200 | skip: | ||
2201 | progress++; | 2528 | progress++; |
2202 | continue; | 2529 | continue; |
2203 | } | 2530 | } |
2204 | if (!vma->anon_vma || vma->vm_ops) | ||
2205 | goto skip; | ||
2206 | if (is_vma_temporary_stack(vma)) | ||
2207 | goto skip; | ||
2208 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | ||
2209 | |||
2210 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2531 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2211 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2532 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2212 | if (hstart >= hend) | 2533 | if (hstart >= hend) |
@@ -2356,19 +2677,65 @@ static int khugepaged(void *none) | |||
2356 | return 0; | 2677 | return 0; |
2357 | } | 2678 | } |
2358 | 2679 | ||
2359 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | 2680 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, |
2681 | unsigned long haddr, pmd_t *pmd) | ||
2682 | { | ||
2683 | struct mm_struct *mm = vma->vm_mm; | ||
2684 | pgtable_t pgtable; | ||
2685 | pmd_t _pmd; | ||
2686 | int i; | ||
2687 | |||
2688 | pmdp_clear_flush(vma, haddr, pmd); | ||
2689 | /* leave pmd empty until pte is filled */ | ||
2690 | |||
2691 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
2692 | pmd_populate(mm, &_pmd, pgtable); | ||
2693 | |||
2694 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
2695 | pte_t *pte, entry; | ||
2696 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
2697 | entry = pte_mkspecial(entry); | ||
2698 | pte = pte_offset_map(&_pmd, haddr); | ||
2699 | VM_BUG_ON(!pte_none(*pte)); | ||
2700 | set_pte_at(mm, haddr, pte, entry); | ||
2701 | pte_unmap(pte); | ||
2702 | } | ||
2703 | smp_wmb(); /* make pte visible before pmd */ | ||
2704 | pmd_populate(mm, pmd, pgtable); | ||
2705 | put_huge_zero_page(); | ||
2706 | } | ||
2707 | |||
2708 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | ||
2709 | pmd_t *pmd) | ||
2360 | { | 2710 | { |
2361 | struct page *page; | 2711 | struct page *page; |
2712 | struct mm_struct *mm = vma->vm_mm; | ||
2713 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
2714 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2715 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2716 | |||
2717 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); | ||
2362 | 2718 | ||
2719 | mmun_start = haddr; | ||
2720 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
2721 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2363 | spin_lock(&mm->page_table_lock); | 2722 | spin_lock(&mm->page_table_lock); |
2364 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2723 | if (unlikely(!pmd_trans_huge(*pmd))) { |
2365 | spin_unlock(&mm->page_table_lock); | 2724 | spin_unlock(&mm->page_table_lock); |
2725 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2726 | return; | ||
2727 | } | ||
2728 | if (is_huge_zero_pmd(*pmd)) { | ||
2729 | __split_huge_zero_page_pmd(vma, haddr, pmd); | ||
2730 | spin_unlock(&mm->page_table_lock); | ||
2731 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2366 | return; | 2732 | return; |
2367 | } | 2733 | } |
2368 | page = pmd_page(*pmd); | 2734 | page = pmd_page(*pmd); |
2369 | VM_BUG_ON(!page_count(page)); | 2735 | VM_BUG_ON(!page_count(page)); |
2370 | get_page(page); | 2736 | get_page(page); |
2371 | spin_unlock(&mm->page_table_lock); | 2737 | spin_unlock(&mm->page_table_lock); |
2738 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2372 | 2739 | ||
2373 | split_huge_page(page); | 2740 | split_huge_page(page); |
2374 | 2741 | ||
@@ -2376,31 +2743,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | |||
2376 | BUG_ON(pmd_trans_huge(*pmd)); | 2743 | BUG_ON(pmd_trans_huge(*pmd)); |
2377 | } | 2744 | } |
2378 | 2745 | ||
2746 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | ||
2747 | pmd_t *pmd) | ||
2748 | { | ||
2749 | struct vm_area_struct *vma; | ||
2750 | |||
2751 | vma = find_vma(mm, address); | ||
2752 | BUG_ON(vma == NULL); | ||
2753 | split_huge_page_pmd(vma, address, pmd); | ||
2754 | } | ||
2755 | |||
2379 | static void split_huge_page_address(struct mm_struct *mm, | 2756 | static void split_huge_page_address(struct mm_struct *mm, |
2380 | unsigned long address) | 2757 | unsigned long address) |
2381 | { | 2758 | { |
2382 | pgd_t *pgd; | ||
2383 | pud_t *pud; | ||
2384 | pmd_t *pmd; | 2759 | pmd_t *pmd; |
2385 | 2760 | ||
2386 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | 2761 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); |
2387 | 2762 | ||
2388 | pgd = pgd_offset(mm, address); | 2763 | pmd = mm_find_pmd(mm, address); |
2389 | if (!pgd_present(*pgd)) | 2764 | if (!pmd) |
2390 | return; | ||
2391 | |||
2392 | pud = pud_offset(pgd, address); | ||
2393 | if (!pud_present(*pud)) | ||
2394 | return; | ||
2395 | |||
2396 | pmd = pmd_offset(pud, address); | ||
2397 | if (!pmd_present(*pmd)) | ||
2398 | return; | 2765 | return; |
2399 | /* | 2766 | /* |
2400 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2767 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
2401 | * materialize from under us. | 2768 | * materialize from under us. |
2402 | */ | 2769 | */ |
2403 | split_huge_page_pmd(mm, pmd); | 2770 | split_huge_page_pmd_mm(mm, address, pmd); |
2404 | } | 2771 | } |
2405 | 2772 | ||
2406 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | 2773 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, |
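[Editor's note, not part of the commit] A minimal userspace sketch of the two huge_memory.c paths reworked above, assuming an x86-64 kernel with transparent hugepages and the zero-page optimisation (transparent_hugepage_use_zero_page()) enabled; if either is off, the program still runs and simply falls back to ordinary 4 KiB faults. The first, read-only touch of a PMD-aligned anonymous range lets do_huge_pmd_anonymous_page() install the shared huge zero page; the later write goes through do_huge_pmd_wp_page(), which now detects is_huge_zero_pmd() and copies away from it.

	/* Editorial sketch only — exercises, but does not belong to, the patch above. */
	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <sys/mman.h>

	#define PMD_SIZE (2UL << 20)	/* 2 MiB PMD-sized huge pages assumed (x86-64) */

	int main(void)
	{
		/* Over-map so a PMD-aligned window is guaranteed to exist inside the VMA. */
		size_t len = 2 * PMD_SIZE;
		char *raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (raw == MAP_FAILED)
			return 1;
		char *p = (char *)(((uintptr_t)raw + PMD_SIZE - 1) & ~(PMD_SIZE - 1));

		volatile char c = p[0];		/* read fault: huge zero page candidate */
		(void)c;
		memset(p, 0xaa, PMD_SIZE);	/* write fault: COW away from the zero page */
		printf("p[0] = 0x%02x\n", (unsigned char)p[0]);

		munmap(raw, len);
		return 0;
	}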
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59a0059b39e2..4f3ea0b1e57c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Generic hugetlb support. | 2 | * Generic hugetlb support. |
3 | * (C) William Irwin, April 2004 | 3 | * (C) Nadia Yvette Chambers, April 2004 |
4 | */ | 4 | */ |
5 | #include <linux/list.h> | 5 | #include <linux/list.h> |
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
1057 | * on-line nodes with memory and will handle the hstate accounting. | 1057 | * on-line nodes with memory and will handle the hstate accounting. |
1058 | */ | 1058 | */ |
1059 | while (nr_pages--) { | 1059 | while (nr_pages--) { |
1060 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) | 1060 | if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) |
1061 | break; | 1061 | break; |
1062 | } | 1062 | } |
1063 | } | 1063 | } |
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1181 | { | 1181 | { |
1182 | struct huge_bootmem_page *m; | 1182 | struct huge_bootmem_page *m; |
1183 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 1183 | int nr_nodes = nodes_weight(node_states[N_MEMORY]); |
1184 | 1184 | ||
1185 | while (nr_nodes) { | 1185 | while (nr_nodes) { |
1186 | void *addr; | 1186 | void *addr; |
1187 | 1187 | ||
1188 | addr = __alloc_bootmem_node_nopanic( | 1188 | addr = __alloc_bootmem_node_nopanic( |
1189 | NODE_DATA(hstate_next_node_to_alloc(h, | 1189 | NODE_DATA(hstate_next_node_to_alloc(h, |
1190 | &node_states[N_HIGH_MEMORY])), | 1190 | &node_states[N_MEMORY])), |
1191 | huge_page_size(h), huge_page_size(h), 0); | 1191 | huge_page_size(h), huge_page_size(h), 0); |
1192 | 1192 | ||
1193 | if (addr) { | 1193 | if (addr) { |
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1259 | if (!alloc_bootmem_huge_page(h)) | 1259 | if (!alloc_bootmem_huge_page(h)) |
1260 | break; | 1260 | break; |
1261 | } else if (!alloc_fresh_huge_page(h, | 1261 | } else if (!alloc_fresh_huge_page(h, |
1262 | &node_states[N_HIGH_MEMORY])) | 1262 | &node_states[N_MEMORY])) |
1263 | break; | 1263 | break; |
1264 | } | 1264 | } |
1265 | h->max_huge_pages = i; | 1265 | h->max_huge_pages = i; |
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1527 | if (!(obey_mempolicy && | 1527 | if (!(obey_mempolicy && |
1528 | init_nodemask_of_mempolicy(nodes_allowed))) { | 1528 | init_nodemask_of_mempolicy(nodes_allowed))) { |
1529 | NODEMASK_FREE(nodes_allowed); | 1529 | NODEMASK_FREE(nodes_allowed); |
1530 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1530 | nodes_allowed = &node_states[N_MEMORY]; |
1531 | } | 1531 | } |
1532 | } else if (nodes_allowed) { | 1532 | } else if (nodes_allowed) { |
1533 | /* | 1533 | /* |
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | 1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; |
1538 | init_nodemask_of_node(nodes_allowed, nid); | 1538 | init_nodemask_of_node(nodes_allowed, nid); |
1539 | } else | 1539 | } else |
1540 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1540 | nodes_allowed = &node_states[N_MEMORY]; |
1541 | 1541 | ||
1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | 1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); |
1543 | 1543 | ||
1544 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 1544 | if (nodes_allowed != &node_states[N_MEMORY]) |
1545 | NODEMASK_FREE(nodes_allowed); | 1545 | NODEMASK_FREE(nodes_allowed); |
1546 | 1546 | ||
1547 | return len; | 1547 | return len; |
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void) | |||
1800 | * remove hstate attributes from any nodes that have them. | 1800 | * remove hstate attributes from any nodes that have them. |
1801 | */ | 1801 | */ |
1802 | for (nid = 0; nid < nr_node_ids; nid++) | 1802 | for (nid = 0; nid < nr_node_ids; nid++) |
1803 | hugetlb_unregister_node(&node_devices[nid]); | 1803 | hugetlb_unregister_node(node_devices[nid]); |
1804 | } | 1804 | } |
1805 | 1805 | ||
1806 | /* | 1806 | /* |
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void) | |||
1844 | { | 1844 | { |
1845 | int nid; | 1845 | int nid; |
1846 | 1846 | ||
1847 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1847 | for_each_node_state(nid, N_MEMORY) { |
1848 | struct node *node = &node_devices[nid]; | 1848 | struct node *node = node_devices[nid]; |
1849 | if (node->dev.id == nid) | 1849 | if (node->dev.id == nid) |
1850 | hugetlb_register_node(node); | 1850 | hugetlb_register_node(node); |
1851 | } | 1851 | } |
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) | |||
1906 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1906 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1907 | 1907 | ||
1908 | hugetlb_init_hstates(); | 1908 | hugetlb_init_hstates(); |
1909 | |||
1910 | gather_bootmem_prealloc(); | 1909 | gather_bootmem_prealloc(); |
1911 | |||
1912 | report_hugepages(); | 1910 | report_hugepages(); |
1913 | 1911 | ||
1914 | hugetlb_sysfs_init(); | 1912 | hugetlb_sysfs_init(); |
1915 | |||
1916 | hugetlb_register_all_nodes(); | 1913 | hugetlb_register_all_nodes(); |
1914 | hugetlb_cgroup_file_init(); | ||
1917 | 1915 | ||
1918 | return 0; | 1916 | return 0; |
1919 | } | 1917 | } |
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1939 | for (i = 0; i < MAX_NUMNODES; ++i) | 1937 | for (i = 0; i < MAX_NUMNODES; ++i) |
1940 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1938 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1941 | INIT_LIST_HEAD(&h->hugepage_activelist); | 1939 | INIT_LIST_HEAD(&h->hugepage_activelist); |
1942 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1940 | h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); |
1943 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1941 | h->next_nid_to_free = first_node(node_states[N_MEMORY]); |
1944 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1942 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1945 | huge_page_size(h)/1024); | 1943 | huge_page_size(h)/1024); |
1946 | /* | ||
1947 | * Add cgroup control files only if the huge page consists | ||
1948 | * of more than two normal pages. This is because we use | ||
1949 | * page[2].lru.next for storing cgroup details. | ||
1950 | */ | ||
1951 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1952 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1953 | 1944 | ||
1954 | parsed_hstate = h; | 1945 | parsed_hstate = h; |
1955 | } | 1946 | } |
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2035 | if (!(obey_mempolicy && | 2026 | if (!(obey_mempolicy && |
2036 | init_nodemask_of_mempolicy(nodes_allowed))) { | 2027 | init_nodemask_of_mempolicy(nodes_allowed))) { |
2037 | NODEMASK_FREE(nodes_allowed); | 2028 | NODEMASK_FREE(nodes_allowed); |
2038 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 2029 | nodes_allowed = &node_states[N_MEMORY]; |
2039 | } | 2030 | } |
2040 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | 2031 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); |
2041 | 2032 | ||
2042 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 2033 | if (nodes_allowed != &node_states[N_MEMORY]) |
2043 | NODEMASK_FREE(nodes_allowed); | 2034 | NODEMASK_FREE(nodes_allowed); |
2044 | } | 2035 | } |
2045 | out: | 2036 | out: |
@@ -2386,8 +2377,10 @@ again: | |||
2386 | /* | 2377 | /* |
2387 | * HWPoisoned hugepage is already unmapped and dropped reference | 2378 | * HWPoisoned hugepage is already unmapped and dropped reference |
2388 | */ | 2379 | */ |
2389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) | 2380 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { |
2381 | pte_clear(mm, address, ptep); | ||
2390 | continue; | 2382 | continue; |
2383 | } | ||
2391 | 2384 | ||
2392 | page = pte_page(pte); | 2385 | page = pte_page(pte); |
2393 | /* | 2386 | /* |
@@ -3014,7 +3007,7 @@ same_page: | |||
3014 | return i ? i : -EFAULT; | 3007 | return i ? i : -EFAULT; |
3015 | } | 3008 | } |
3016 | 3009 | ||
3017 | void hugetlb_change_protection(struct vm_area_struct *vma, | 3010 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
3018 | unsigned long address, unsigned long end, pgprot_t newprot) | 3011 | unsigned long address, unsigned long end, pgprot_t newprot) |
3019 | { | 3012 | { |
3020 | struct mm_struct *mm = vma->vm_mm; | 3013 | struct mm_struct *mm = vma->vm_mm; |
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3022 | pte_t *ptep; | 3015 | pte_t *ptep; |
3023 | pte_t pte; | 3016 | pte_t pte; |
3024 | struct hstate *h = hstate_vma(vma); | 3017 | struct hstate *h = hstate_vma(vma); |
3018 | unsigned long pages = 0; | ||
3025 | 3019 | ||
3026 | BUG_ON(address >= end); | 3020 | BUG_ON(address >= end); |
3027 | flush_cache_range(vma, address, end); | 3021 | flush_cache_range(vma, address, end); |
@@ -3032,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3032 | ptep = huge_pte_offset(mm, address); | 3026 | ptep = huge_pte_offset(mm, address); |
3033 | if (!ptep) | 3027 | if (!ptep) |
3034 | continue; | 3028 | continue; |
3035 | if (huge_pmd_unshare(mm, &address, ptep)) | 3029 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3030 | pages++; | ||
3036 | continue; | 3031 | continue; |
3032 | } | ||
3037 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3033 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
3038 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3034 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3039 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 3035 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
3040 | set_huge_pte_at(mm, address, ptep, pte); | 3036 | set_huge_pte_at(mm, address, ptep, pte); |
3037 | pages++; | ||
3041 | } | 3038 | } |
3042 | } | 3039 | } |
3043 | spin_unlock(&mm->page_table_lock); | 3040 | spin_unlock(&mm->page_table_lock); |
@@ -3049,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3049 | */ | 3046 | */ |
3050 | flush_tlb_range(vma, start, end); | 3047 | flush_tlb_range(vma, start, end); |
3051 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3048 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3049 | |||
3050 | return pages << h->order; | ||
3052 | } | 3051 | } |
3053 | 3052 | ||
3054 | int hugetlb_reserve_pages(struct inode *inode, | 3053 | int hugetlb_reserve_pages(struct inode *inode, |
@@ -3170,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
3170 | 3169 | ||
3171 | spin_lock(&hugetlb_lock); | 3170 | spin_lock(&hugetlb_lock); |
3172 | if (is_hugepage_on_freelist(hpage)) { | 3171 | if (is_hugepage_on_freelist(hpage)) { |
3173 | list_del(&hpage->lru); | 3172 | /* |
3173 | * Hwpoisoned hugepage isn't linked to activelist or freelist, | ||
3174 | * but dangling hpage->lru can trigger list-debug warnings | ||
3175 | * (this happens when we call unpoison_memory() on it), | ||
3176 | * so let it point to itself with list_del_init(). | ||
3177 | */ | ||
3178 | list_del_init(&hpage->lru); | ||
3174 | set_page_refcounted(hpage); | 3179 | set_page_refcounted(hpage); |
3175 | h->free_huge_pages--; | 3180 | h->free_huge_pages--; |
3176 | h->free_huge_pages_node[nid]--; | 3181 | h->free_huge_pages_node[nid]--; |
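[Editor's note, not part of the commit] The new return value of hugetlb_change_protection() is in base-page units: each huge PTE updated (and each shared PMD unshared) counts once and is then scaled by "pages << h->order". A trivial sketch of that arithmetic, assuming 4 KiB base pages and 2 MiB hugepages (order 9):

	/* Editorial sketch only: mirrors the "pages << h->order" conversion above. */
	#include <stdio.h>

	static unsigned long huge_to_base_pages(unsigned long hugepages,
						unsigned int order)
	{
		return hugepages << order;
	}

	int main(void)
	{
		/* Three 2 MiB hugepages changed -> 3 << 9 = 1536 base pages reported. */
		printf("%lu\n", huge_to_base_pages(3, 9));
		return 0;
	}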
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a3f358fb8a0c..9cea7de22ffb 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) | |||
77 | return false; | 77 | return false; |
78 | } | 78 | } |
79 | 79 | ||
80 | static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | 80 | static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) |
81 | { | 81 | { |
82 | int idx; | 82 | int idx; |
83 | struct cgroup *parent_cgroup; | 83 | struct cgroup *parent_cgroup; |
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | |||
101 | return &h_cgroup->css; | 101 | return &h_cgroup->css; |
102 | } | 102 | } |
103 | 103 | ||
104 | static void hugetlb_cgroup_destroy(struct cgroup *cgroup) | 104 | static void hugetlb_cgroup_css_free(struct cgroup *cgroup) |
105 | { | 105 | { |
106 | struct hugetlb_cgroup *h_cgroup; | 106 | struct hugetlb_cgroup *h_cgroup; |
107 | 107 | ||
@@ -155,18 +155,13 @@ out: | |||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | 155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to |
156 | * the parent cgroup. | 156 | * the parent cgroup. |
157 | */ | 157 | */ |
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | 158 | static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) |
159 | { | 159 | { |
160 | struct hstate *h; | 160 | struct hstate *h; |
161 | struct page *page; | 161 | struct page *page; |
162 | int ret = 0, idx = 0; | 162 | int idx = 0; |
163 | 163 | ||
164 | do { | 164 | do { |
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | 165 | for_each_hstate(h) { |
171 | spin_lock(&hugetlb_lock); | 166 | spin_lock(&hugetlb_lock); |
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | 167 | list_for_each_entry(page, &h->hugepage_activelist, lru) |
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | |||
177 | } | 172 | } |
178 | cond_resched(); | 173 | cond_resched(); |
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | 174 | } while (hugetlb_cgroup_have_usage(cgroup)); |
180 | out: | ||
181 | return ret; | ||
182 | } | 175 | } |
183 | 176 | ||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | 177 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, |
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) | |||
340 | return buf; | 333 | return buf; |
341 | } | 334 | } |
342 | 335 | ||
343 | int __init hugetlb_cgroup_file_init(int idx) | 336 | static void __init __hugetlb_cgroup_file_init(int idx) |
344 | { | 337 | { |
345 | char buf[32]; | 338 | char buf[32]; |
346 | struct cftype *cft; | 339 | struct cftype *cft; |
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) | |||
382 | 375 | ||
383 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | 376 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); |
384 | 377 | ||
385 | return 0; | 378 | return; |
379 | } | ||
380 | |||
381 | void __init hugetlb_cgroup_file_init(void) | ||
382 | { | ||
383 | struct hstate *h; | ||
384 | |||
385 | for_each_hstate(h) { | ||
386 | /* | ||
387 | * Add cgroup control files only if the huge page consists | ||
388 | * of more than two normal pages. This is because we use | ||
389 | * page[2].lru.next for storing cgroup details. | ||
390 | */ | ||
391 | if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) | ||
392 | __hugetlb_cgroup_file_init(hstate_index(h)); | ||
393 | } | ||
386 | } | 394 | } |
387 | 395 | ||
388 | /* | 396 | /* |
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | |||
411 | 419 | ||
412 | struct cgroup_subsys hugetlb_subsys = { | 420 | struct cgroup_subsys hugetlb_subsys = { |
413 | .name = "hugetlb", | 421 | .name = "hugetlb", |
414 | .create = hugetlb_cgroup_create, | 422 | .css_alloc = hugetlb_cgroup_css_alloc, |
415 | .pre_destroy = hugetlb_cgroup_pre_destroy, | 423 | .css_offline = hugetlb_cgroup_css_offline, |
416 | .destroy = hugetlb_cgroup_destroy, | 424 | .css_free = hugetlb_cgroup_css_free, |
417 | .subsys_id = hugetlb_subsys_id, | 425 | .subsys_id = hugetlb_subsys_id, |
418 | }; | 426 | }; |
diff --git a/mm/internal.h b/mm/internal.h index a4fa284f6bc2..d597f94cc205 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page); | |||
92 | extern void putback_lru_page(struct page *page); | 92 | extern void putback_lru_page(struct page *page); |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * in mm/rmap.c: | ||
96 | */ | ||
97 | extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); | ||
98 | |||
99 | /* | ||
95 | * in mm/page_alloc.c | 100 | * in mm/page_alloc.c |
96 | */ | 101 | */ |
97 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 102 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
@@ -212,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
212 | { | 217 | { |
213 | if (TestClearPageMlocked(page)) { | 218 | if (TestClearPageMlocked(page)) { |
214 | unsigned long flags; | 219 | unsigned long flags; |
220 | int nr_pages = hpage_nr_pages(page); | ||
215 | 221 | ||
216 | local_irq_save(flags); | 222 | local_irq_save(flags); |
217 | __dec_zone_page_state(page, NR_MLOCK); | 223 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
218 | SetPageMlocked(newpage); | 224 | SetPageMlocked(newpage); |
219 | __inc_zone_page_state(newpage, NR_MLOCK); | 225 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); |
220 | local_irq_restore(flags); | 226 | local_irq_restore(flags); |
221 | } | 227 | } |
222 | } | 228 | } |
223 | 229 | ||
230 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); | ||
231 | |||
224 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 232 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
225 | extern unsigned long vma_address(struct page *page, | 233 | extern unsigned long vma_address(struct page *page, |
226 | struct vm_area_struct *vma); | 234 | struct vm_area_struct *vma); |
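[Editor's note, not part of the commit] mm_find_pmd() itself is added in mm/rmap.c, which is outside the hunks shown here; a sketch of its expected shape, reconstructed from the open-coded pgd/pud/pmd walks it replaces in huge_memory.c above and ksm.c below, is:

	/*
	 * Editorial sketch only — reconstructed from the removed walks, not
	 * copied from the patch; the real helper lives in mm/rmap.c.
	 */
	pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
	{
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd = NULL;

		pgd = pgd_offset(mm, address);
		if (!pgd_present(*pgd))
			goto out;

		pud = pud_offset(pgd, address);
		if (!pud_present(*pud))
			goto out;

		pmd = pmd_offset(pud, address);
		if (!pmd_present(*pmd))
			pmd = NULL;	/* callers only need a NULL check */
	out:
		return pmd;
	}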
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a217cc544060..752a705c77c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) | |||
1556 | struct kmemleak_object *object; | 1556 | struct kmemleak_object *object; |
1557 | unsigned long addr; | 1557 | unsigned long addr; |
1558 | 1558 | ||
1559 | addr= simple_strtoul(str, NULL, 0); | 1559 | if (kstrtoul(str, 0, &addr)) |
1560 | return -EINVAL; | ||
1560 | object = find_and_get_object(addr, 0); | 1561 | object = find_and_get_object(addr, 0); |
1561 | if (!object) { | 1562 | if (!object) { |
1562 | pr_info("Unknown object at 0x%08lx\n", addr); | 1563 | pr_info("Unknown object at 0x%08lx\n", addr); |
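The kmemleak hunk swaps simple_strtoul(), which silently yields 0 for garbage, for kstrtoul(), which rejects malformed input with -EINVAL before any object lookup happens. A rough userspace analogue of that stricter contract, built on plain strtoul() with endptr/errno checks rather than the kernel helpers:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Store the value and return 0 on success, -EINVAL on malformed input,
 * loosely mirroring kstrtoul()'s contract. */
static int parse_ulong(const char *str, unsigned long *out)
{
	char *end;
	unsigned long val;

	errno = 0;
	val = strtoul(str, &end, 0);
	if (errno || end == str || *end != '\0')
		return -EINVAL;
	*out = val;
	return 0;
}

int main(void)
{
	const char *inputs[] = { "0xffff880012345678", "123", "bogus" };
	unsigned long addr;

	for (int i = 0; i < 3; i++) {
		if (parse_ulong(inputs[i], &addr))
			printf("%-20s -> rejected (-EINVAL)\n", inputs[i]);
		else
			printf("%-20s -> 0x%lx\n", inputs[i], addr);
	}
	return 0;
}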
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
778 | struct page *kpage, pte_t orig_pte) | 778 | struct page *kpage, pte_t orig_pte) |
779 | { | 779 | { |
780 | struct mm_struct *mm = vma->vm_mm; | 780 | struct mm_struct *mm = vma->vm_mm; |
781 | pgd_t *pgd; | ||
782 | pud_t *pud; | ||
783 | pmd_t *pmd; | 781 | pmd_t *pmd; |
784 | pte_t *ptep; | 782 | pte_t *ptep; |
785 | spinlock_t *ptl; | 783 | spinlock_t *ptl; |
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
792 | if (addr == -EFAULT) | 790 | if (addr == -EFAULT) |
793 | goto out; | 791 | goto out; |
794 | 792 | ||
795 | pgd = pgd_offset(mm, addr); | 793 | pmd = mm_find_pmd(mm, addr); |
796 | if (!pgd_present(*pgd)) | 794 | if (!pmd) |
797 | goto out; | 795 | goto out; |
798 | |||
799 | pud = pud_offset(pgd, addr); | ||
800 | if (!pud_present(*pud)) | ||
801 | goto out; | ||
802 | |||
803 | pmd = pmd_offset(pud, addr); | ||
804 | BUG_ON(pmd_trans_huge(*pmd)); | 796 | BUG_ON(pmd_trans_huge(*pmd)); |
805 | if (!pmd_present(*pmd)) | ||
806 | goto out; | ||
807 | 797 | ||
808 | mmun_start = addr; | 798 | mmun_start = addr; |
809 | mmun_end = addr + PAGE_SIZE; | 799 | mmun_end = addr + PAGE_SIZE; |
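replace_page() above drops its open-coded pgd/pud/pmd descent in favour of the new mm_find_pmd() helper declared in mm/internal.h. A standalone model of the lookup being consolidated: three nullable levels, and the innermost entry is returned only if every level is populated. The struct names are invented for illustration; they are not the kernel's page-table types.

#include <stdio.h>
#include <stddef.h>

/* Toy three-level table: pgd -> pud -> pmd, any level may be absent. */
struct pmd { int present; };
struct pud { int present; struct pmd *pmd; };
struct pgd { int present; struct pud *pud; };

/* Collapse the walk into one helper, as mm_find_pmd() does for callers
 * like replace_page(): return the pmd, or NULL if any level is missing. */
static struct pmd *find_pmd(struct pgd *pgd)
{
	if (!pgd || !pgd->present)
		return NULL;
	if (!pgd->pud || !pgd->pud->present)
		return NULL;
	if (!pgd->pud->pmd || !pgd->pud->pmd->present)
		return NULL;
	return pgd->pud->pmd;
}

int main(void)
{
	struct pmd pmd = { .present = 1 };
	struct pud pud = { .present = 1, .pmd = &pmd };
	struct pgd full = { .present = 1, .pud = &pud };
	struct pgd hole = { .present = 1, .pud = NULL };

	printf("full walk   -> %s\n", find_pmd(&full) ? "pmd found" : "NULL");
	printf("missing pud -> %s\n", find_pmd(&hole) ? "pmd found" : "NULL");
	return 0;
}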
@@ -1634,7 +1624,7 @@ again: | |||
1634 | struct anon_vma_chain *vmac; | 1624 | struct anon_vma_chain *vmac; |
1635 | struct vm_area_struct *vma; | 1625 | struct vm_area_struct *vma; |
1636 | 1626 | ||
1637 | anon_vma_lock(anon_vma); | 1627 | anon_vma_lock_read(anon_vma); |
1638 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1639 | 0, ULONG_MAX) { | 1629 | 0, ULONG_MAX) { |
1640 | vma = vmac->vma; | 1630 | vma = vmac->vma; |
@@ -1658,7 +1648,7 @@ again: | |||
1658 | if (!search_new_forks || !mapcount) | 1648 | if (!search_new_forks || !mapcount) |
1659 | break; | 1649 | break; |
1660 | } | 1650 | } |
1661 | anon_vma_unlock(anon_vma); | 1651 | anon_vma_unlock_read(anon_vma); |
1662 | if (!mapcount) | 1652 | if (!mapcount) |
1663 | goto out; | 1653 | goto out; |
1664 | } | 1654 | } |
@@ -1688,7 +1678,7 @@ again: | |||
1688 | struct anon_vma_chain *vmac; | 1678 | struct anon_vma_chain *vmac; |
1689 | struct vm_area_struct *vma; | 1679 | struct vm_area_struct *vma; |
1690 | 1680 | ||
1691 | anon_vma_lock(anon_vma); | 1681 | anon_vma_lock_read(anon_vma); |
1692 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1693 | 0, ULONG_MAX) { | 1683 | 0, ULONG_MAX) { |
1694 | vma = vmac->vma; | 1684 | vma = vmac->vma; |
@@ -1707,11 +1697,11 @@ again: | |||
1707 | ret = try_to_unmap_one(page, vma, | 1697 | ret = try_to_unmap_one(page, vma, |
1708 | rmap_item->address, flags); | 1698 | rmap_item->address, flags); |
1709 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | 1699 | if (ret != SWAP_AGAIN || !page_mapped(page)) { |
1710 | anon_vma_unlock(anon_vma); | 1700 | anon_vma_unlock_read(anon_vma); |
1711 | goto out; | 1701 | goto out; |
1712 | } | 1702 | } |
1713 | } | 1703 | } |
1714 | anon_vma_unlock(anon_vma); | 1704 | anon_vma_unlock_read(anon_vma); |
1715 | } | 1705 | } |
1716 | if (!search_new_forks++) | 1706 | if (!search_new_forks++) |
1717 | goto again; | 1707 | goto again; |
@@ -1741,7 +1731,7 @@ again: | |||
1741 | struct anon_vma_chain *vmac; | 1731 | struct anon_vma_chain *vmac; |
1742 | struct vm_area_struct *vma; | 1732 | struct vm_area_struct *vma; |
1743 | 1733 | ||
1744 | anon_vma_lock(anon_vma); | 1734 | anon_vma_lock_read(anon_vma); |
1745 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1746 | 0, ULONG_MAX) { | 1736 | 0, ULONG_MAX) { |
1747 | vma = vmac->vma; | 1737 | vma = vmac->vma; |
@@ -1759,11 +1749,11 @@ again: | |||
1759 | 1749 | ||
1760 | ret = rmap_one(page, vma, rmap_item->address, arg); | 1750 | ret = rmap_one(page, vma, rmap_item->address, arg); |
1761 | if (ret != SWAP_AGAIN) { | 1751 | if (ret != SWAP_AGAIN) { |
1762 | anon_vma_unlock(anon_vma); | 1752 | anon_vma_unlock_read(anon_vma); |
1763 | goto out; | 1753 | goto out; |
1764 | } | 1754 | } |
1765 | } | 1755 | } |
1766 | anon_vma_unlock(anon_vma); | 1756 | anon_vma_unlock_read(anon_vma); |
1767 | } | 1757 | } |
1768 | if (!search_new_forks++) | 1758 | if (!search_new_forks++) |
1769 | goto again; | 1759 | goto again; |
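The ksm hunks above switch the anon_vma lock/unlock calls to the _read variants, so rmap walkers take the anon_vma lock shared and can proceed concurrently, while writers still get exclusion. A loose userspace analogy with a pthread rwlock (build with -pthread); the vma_count variable merely stands in for the interval tree being walked.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static int vma_count = 3;	/* stands in for the interval tree contents */

/* Read-side walker: several may hold the lock at once, which is the point
 * of moving ksm's rmap walks to the _read variants. */
static void *walk(void *arg)
{
	pthread_rwlock_rdlock(&tree_lock);
	printf("walker %ld sees %d vmas\n", (long)arg, vma_count);
	pthread_rwlock_unlock(&tree_lock);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	for (long i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, walk, (void *)i);
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);

	/* A writer (e.g. vma insertion into the tree) takes the write side. */
	pthread_rwlock_wrlock(&tree_lock);
	vma_count++;
	pthread_rwlock_unlock(&tree_lock);
	return 0;
}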
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1929 | if (ksm_run != flags) { | 1919 | if (ksm_run != flags) { |
1930 | ksm_run = flags; | 1920 | ksm_run = flags; |
1931 | if (flags & KSM_RUN_UNMERGE) { | 1921 | if (flags & KSM_RUN_UNMERGE) { |
1932 | int oom_score_adj; | 1922 | set_current_oom_origin(); |
1933 | |||
1934 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | ||
1935 | err = unmerge_and_remove_all_rmap_items(); | 1923 | err = unmerge_and_remove_all_rmap_items(); |
1936 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, | 1924 | clear_current_oom_origin(); |
1937 | oom_score_adj); | ||
1938 | if (err) { | 1925 | if (err) { |
1939 | ksm_run = KSM_RUN_STOP; | 1926 | ksm_run = KSM_RUN_STOP; |
1940 | count = err; | 1927 | count = err; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b31..f3009b4bae51 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -10,6 +10,10 @@ | |||
10 | * Copyright (C) 2009 Nokia Corporation | 10 | * Copyright (C) 2009 Nokia Corporation |
11 | * Author: Kirill A. Shutemov | 11 | * Author: Kirill A. Shutemov |
12 | * | 12 | * |
13 | * Kernel Memory Controller | ||
14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. | ||
15 | * Authors: Glauber Costa and Suleiman Souhlal | ||
16 | * | ||
13 | * This program is free software; you can redistribute it and/or modify | 17 | * This program is free software; you can redistribute it and/or modify |
14 | * it under the terms of the GNU General Public License as published by | 18 | * it under the terms of the GNU General Public License as published by |
15 | * the Free Software Foundation; either version 2 of the License, or | 19 | * the Free Software Foundation; either version 2 of the License, or |
@@ -59,6 +63,8 @@ | |||
59 | #include <trace/events/vmscan.h> | 63 | #include <trace/events/vmscan.h> |
60 | 64 | ||
61 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 65 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
66 | EXPORT_SYMBOL(mem_cgroup_subsys); | ||
67 | |||
62 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 68 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
63 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 69 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
64 | 70 | ||
@@ -266,6 +272,10 @@ struct mem_cgroup { | |||
266 | }; | 272 | }; |
267 | 273 | ||
268 | /* | 274 | /* |
275 | * the counter to account for kernel memory usage. | ||
276 | */ | ||
277 | struct res_counter kmem; | ||
278 | /* | ||
269 | * Per cgroup active and inactive list, similar to the | 279 | * Per cgroup active and inactive list, similar to the |
270 | * per zone LRU lists. | 280 | * per zone LRU lists. |
271 | */ | 281 | */ |
@@ -280,6 +290,7 @@ struct mem_cgroup { | |||
280 | * Should the accounting and control be hierarchical, per subtree? | 290 | * Should the accounting and control be hierarchical, per subtree? |
281 | */ | 291 | */ |
282 | bool use_hierarchy; | 292 | bool use_hierarchy; |
293 | unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ | ||
283 | 294 | ||
284 | bool oom_lock; | 295 | bool oom_lock; |
285 | atomic_t under_oom; | 296 | atomic_t under_oom; |
@@ -330,8 +341,61 @@ struct mem_cgroup { | |||
330 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 341 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
331 | struct tcp_memcontrol tcp_mem; | 342 | struct tcp_memcontrol tcp_mem; |
332 | #endif | 343 | #endif |
344 | #if defined(CONFIG_MEMCG_KMEM) | ||
345 | /* analogous to slab_common's slab_caches list. per-memcg */ | ||
346 | struct list_head memcg_slab_caches; | ||
347 | /* Not a spinlock, we can take a lot of time walking the list */ | ||
348 | struct mutex slab_caches_mutex; | ||
349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | ||
350 | int kmemcg_id; | ||
351 | #endif | ||
333 | }; | 352 | }; |
334 | 353 | ||
354 | /* internal only representation about the status of kmem accounting. */ | ||
355 | enum { | ||
356 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | ||
357 | KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ | ||
358 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | ||
359 | }; | ||
360 | |||
361 | /* We account when limit is on, but only after call sites are patched */ | ||
362 | #define KMEM_ACCOUNTED_MASK \ | ||
363 | ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) | ||
364 | |||
365 | #ifdef CONFIG_MEMCG_KMEM | ||
366 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | ||
367 | { | ||
368 | set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
369 | } | ||
370 | |||
371 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | ||
372 | { | ||
373 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
374 | } | ||
375 | |||
376 | static void memcg_kmem_set_activated(struct mem_cgroup *memcg) | ||
377 | { | ||
378 | set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
379 | } | ||
380 | |||
381 | static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) | ||
382 | { | ||
383 | clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
384 | } | ||
385 | |||
386 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | ||
387 | { | ||
388 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) | ||
389 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); | ||
390 | } | ||
391 | |||
392 | static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | ||
393 | { | ||
394 | return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, | ||
395 | &memcg->kmem_account_flags); | ||
396 | } | ||
397 | #endif | ||
398 | |||
335 | /* Stuffs for move charges at task migration. */ | 399 | /* Stuffs for move charges at task migration. */ |
336 | /* | 400 | /* |
337 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a | 401 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a |
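The new KMEM_ACCOUNTED_* bits above are driven by set_bit/test_bit/test_and_clear_bit on kmem_account_flags, and KMEM_ACCOUNTED_MASK groups ACTIVE with ACTIVATED. A standalone sketch of the same flag arithmetic using ordinary (non-atomic) C bit operations:

#include <stdio.h>

enum {
	KMEM_ACCOUNTED_ACTIVE = 0,	/* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED,	/* static key enabled */
	KMEM_ACCOUNTED_DEAD,		/* dead memcg with pending charges */
};

#define KMEM_ACCOUNTED_MASK \
	((1UL << KMEM_ACCOUNTED_ACTIVE) | (1UL << KMEM_ACCOUNTED_ACTIVATED))

int main(void)
{
	unsigned long flags = 0;

	/* memcg_kmem_set_activated() + memcg_kmem_set_active() equivalents */
	flags |= 1UL << KMEM_ACCOUNTED_ACTIVATED;
	flags |= 1UL << KMEM_ACCOUNTED_ACTIVE;
	printf("accounting on: %s\n",
	       (flags & KMEM_ACCOUNTED_MASK) ? "yes" : "no");

	/* memcg_kmem_mark_dead(): only a still-active group is marked dead */
	if (flags & (1UL << KMEM_ACCOUNTED_ACTIVE))
		flags |= 1UL << KMEM_ACCOUNTED_DEAD;

	/* memcg_kmem_test_and_clear_dead() equivalent */
	int was_dead = !!(flags & (1UL << KMEM_ACCOUNTED_DEAD));
	flags &= ~(1UL << KMEM_ACCOUNTED_DEAD);
	printf("was dead: %d, dead bit now: %lu\n",
	       was_dead, flags & (1UL << KMEM_ACCOUNTED_DEAD));
	return 0;
}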
@@ -386,9 +450,13 @@ enum charge_type { | |||
386 | }; | 450 | }; |
387 | 451 | ||
388 | /* for encoding cft->private value on file */ | 452 | /* for encoding cft->private value on file */ |
389 | #define _MEM (0) | 453 | enum res_type { |
390 | #define _MEMSWAP (1) | 454 | _MEM, |
391 | #define _OOM_TYPE (2) | 455 | _MEMSWAP, |
456 | _OOM_TYPE, | ||
457 | _KMEM, | ||
458 | }; | ||
459 | |||
392 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) | 460 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
393 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) | 461 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) |
394 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 462 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
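The cft->private encoding above packs a res_type into the upper 16 bits and an attribute into the lower 16, and the new _KMEM type simply extends the enum. A quick standalone check of the pack/unpack round trip; the attribute value 2 is arbitrary and stands in for one of the RES_* constants rather than quoting it:

#include <stdio.h>

enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, _KMEM };

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

int main(void)
{
	/* pack a hypothetical attribute (2) against the new _KMEM type */
	int priv = MEMFILE_PRIVATE(_KMEM, 2);

	printf("packed: 0x%08x\n", (unsigned)priv);
	printf("type:   %d (expect %d for _KMEM)\n", MEMFILE_TYPE(priv), _KMEM);
	printf("attr:   %d (expect 2)\n", MEMFILE_ATTR(priv));
	return 0;
}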
@@ -485,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) | |||
485 | } | 553 | } |
486 | #endif | 554 | #endif |
487 | 555 | ||
556 | #ifdef CONFIG_MEMCG_KMEM | ||
557 | /* | ||
558 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | ||
559 | * There are two main reasons for not using the css_id for this: | ||
560 | * 1) this works better in sparse environments, where we have a lot of memcgs, | ||
561 | * but only a few kmem-limited. Or also, if we have, for instance, 200 | ||
562 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a | ||
563 | * 200 entry array for that. | ||
564 | * | ||
565 | * 2) In order not to violate the cgroup API, we would like to do all memory | ||
566 | * allocation in ->create(). At that point, we haven't yet allocated the | ||
567 | * css_id. Having a separate index prevents us from messing with the cgroup | ||
568 | * core for this | ||
569 | * | ||
570 | * The current size of the caches array is stored in | ||
571 | * memcg_limited_groups_array_size. It will double each time we have to | ||
572 | * increase it. | ||
573 | */ | ||
574 | static DEFINE_IDA(kmem_limited_groups); | ||
575 | int memcg_limited_groups_array_size; | ||
576 | |||
577 | /* | ||
578 | * MIN_SIZE is different than 1, because we would like to avoid going through | ||
579 | * the alloc/free process all the time. In a small machine, 4 kmem-limited | ||
580 | * cgroups is a reasonable guess. In the future, it could be a parameter or | ||
581 | * tunable, but that is strictly not necessary. | ||
582 | * | ||
583 | * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get | ||
584 | * this constant directly from cgroup, but it is understandable that this is | ||
585 | * better kept as an internal representation in cgroup.c. In any case, the | ||
586 | * css_id space is not getting any smaller, and we don't have to necessarily | ||
587 | * increase ours as well if it increases. | ||
588 | */ | ||
589 | #define MEMCG_CACHES_MIN_SIZE 4 | ||
590 | #define MEMCG_CACHES_MAX_SIZE 65535 | ||
591 | |||
592 | /* | ||
593 | * A lot of the calls to the cache allocation functions are expected to be | ||
594 | * inlined by the compiler. Since the calls to memcg_kmem_get_cache are | ||
595 | * conditional to this static branch, we'll have to allow modules that do | ||
596 | * kmem_cache_alloc and the like to see this symbol as well | ||

597 | */ | ||
598 | struct static_key memcg_kmem_enabled_key; | ||
599 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | ||
600 | |||
601 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
602 | { | ||
603 | if (memcg_kmem_is_active(memcg)) { | ||
604 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
605 | ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); | ||
606 | } | ||
607 | /* | ||
608 | * This check can't live in kmem destruction function, | ||
609 | * since the charges will outlive the cgroup | ||
610 | */ | ||
611 | WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); | ||
612 | } | ||
613 | #else | ||
614 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
615 | { | ||
616 | } | ||
617 | #endif /* CONFIG_MEMCG_KMEM */ | ||
618 | |||
619 | static void disarm_static_keys(struct mem_cgroup *memcg) | ||
620 | { | ||
621 | disarm_sock_keys(memcg); | ||
622 | disarm_kmem_keys(memcg); | ||
623 | } | ||
624 | |||
488 | static void drain_all_stock_async(struct mem_cgroup *memcg); | 625 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
489 | 626 | ||
490 | static struct mem_cgroup_per_zone * | 627 | static struct mem_cgroup_per_zone * |
@@ -800,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | |||
800 | int nid; | 937 | int nid; |
801 | u64 total = 0; | 938 | u64 total = 0; |
802 | 939 | ||
803 | for_each_node_state(nid, N_HIGH_MEMORY) | 940 | for_each_node_state(nid, N_MEMORY) |
804 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); | 941 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); |
805 | return total; | 942 | return total; |
806 | } | 943 | } |
@@ -1015,13 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
1015 | iter != NULL; \ | 1152 | iter != NULL; \ |
1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1153 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1017 | 1154 | ||
1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1155 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1019 | { | 1156 | { |
1020 | struct mem_cgroup *memcg; | 1157 | struct mem_cgroup *memcg; |
1021 | 1158 | ||
1022 | if (!mm) | ||
1023 | return; | ||
1024 | |||
1025 | rcu_read_lock(); | 1159 | rcu_read_lock(); |
1026 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1160 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
1027 | if (unlikely(!memcg)) | 1161 | if (unlikely(!memcg)) |
@@ -1040,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | |||
1040 | out: | 1174 | out: |
1041 | rcu_read_unlock(); | 1175 | rcu_read_unlock(); |
1042 | } | 1176 | } |
1043 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); | 1177 | EXPORT_SYMBOL(__mem_cgroup_count_vm_event); |
1044 | 1178 | ||
1045 | /** | 1179 | /** |
1046 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 1180 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
@@ -1454,6 +1588,10 @@ done: | |||
1454 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1588 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, |
1455 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1589 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, |
1456 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1590 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); |
1591 | printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", | ||
1592 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | ||
1593 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | ||
1594 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | ||
1457 | } | 1595 | } |
1458 | 1596 | ||
1459 | /* | 1597 | /* |
@@ -1498,8 +1636,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1498 | return limit; | 1636 | return limit; |
1499 | } | 1637 | } |
1500 | 1638 | ||
1501 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | 1639 | static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
1502 | int order) | 1640 | int order) |
1503 | { | 1641 | { |
1504 | struct mem_cgroup *iter; | 1642 | struct mem_cgroup *iter; |
1505 | unsigned long chosen_points = 0; | 1643 | unsigned long chosen_points = 0; |
@@ -1644,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) | |||
1644 | return; | 1782 | return; |
1645 | 1783 | ||
1646 | /* make a nodemask where this memcg uses memory from */ | 1784 | /* make a nodemask where this memcg uses memory from */ |
1647 | memcg->scan_nodes = node_states[N_HIGH_MEMORY]; | 1785 | memcg->scan_nodes = node_states[N_MEMORY]; |
1648 | 1786 | ||
1649 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1787 | for_each_node_mask(nid, node_states[N_MEMORY]) { |
1650 | 1788 | ||
1651 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) | 1789 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) |
1652 | node_clear(nid, memcg->scan_nodes); | 1790 | node_clear(nid, memcg->scan_nodes); |
@@ -1717,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | |||
1717 | /* | 1855 | /* |
1718 | * Check rest of nodes. | 1856 | * Check rest of nodes. |
1719 | */ | 1857 | */ |
1720 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1858 | for_each_node_state(nid, N_MEMORY) { |
1721 | if (node_isset(nid, memcg->scan_nodes)) | 1859 | if (node_isset(nid, memcg->scan_nodes)) |
1722 | continue; | 1860 | continue; |
1723 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | 1861 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
@@ -2061,20 +2199,28 @@ struct memcg_stock_pcp { | |||
2061 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 2199 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
2062 | static DEFINE_MUTEX(percpu_charge_mutex); | 2200 | static DEFINE_MUTEX(percpu_charge_mutex); |
2063 | 2201 | ||
2064 | /* | 2202 | /** |
2065 | * Try to consume stocked charge on this cpu. If success, one page is consumed | 2203 | * consume_stock: Try to consume stocked charge on this cpu. |
2066 | * from local stock and true is returned. If the stock is 0 or charges from a | 2204 | * @memcg: memcg to consume from. |
2067 | * cgroup which is not current target, returns false. This stock will be | 2205 | * @nr_pages: how many pages to charge. |
2068 | * refilled. | 2206 | * |
2207 | * The charges will only happen if @memcg matches the current cpu's memcg | ||
2208 | * stock, and at least @nr_pages are available in that stock. Failure to | ||
2209 | * service an allocation will refill the stock. | ||
2210 | * | ||
2211 | * returns true if successful, false otherwise. | ||
2069 | */ | 2212 | */ |
2070 | static bool consume_stock(struct mem_cgroup *memcg) | 2213 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2071 | { | 2214 | { |
2072 | struct memcg_stock_pcp *stock; | 2215 | struct memcg_stock_pcp *stock; |
2073 | bool ret = true; | 2216 | bool ret = true; |
2074 | 2217 | ||
2218 | if (nr_pages > CHARGE_BATCH) | ||
2219 | return false; | ||
2220 | |||
2075 | stock = &get_cpu_var(memcg_stock); | 2221 | stock = &get_cpu_var(memcg_stock); |
2076 | if (memcg == stock->cached && stock->nr_pages) | 2222 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) |
2077 | stock->nr_pages--; | 2223 | stock->nr_pages -= nr_pages; |
2078 | else /* need to call res_counter_charge */ | 2224 | else /* need to call res_counter_charge */ |
2079 | ret = false; | 2225 | ret = false; |
2080 | put_cpu_var(memcg_stock); | 2226 | put_cpu_var(memcg_stock); |
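The reworked consume_stock() above only drains the per-cpu stock when the cached memcg matches and at least nr_pages are stocked, and it refuses anything larger than CHARGE_BATCH outright. A single-threaded model of that decision; per-cpu access and the res_counter fallback are omitted, and the batch value of 32 is assumed from the usual define rather than shown in this hunk:

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U	/* assumed batch size */

struct memcg { const char *name; };

struct stock {
	struct memcg *cached;
	unsigned int nr_pages;
};

/* Mirrors the new consume_stock(): succeed only for a matching memcg with
 * enough stocked pages, and never for requests above the batch size. */
static bool consume_stock(struct stock *st, struct memcg *memcg,
			  unsigned int nr_pages)
{
	if (nr_pages > CHARGE_BATCH)
		return false;
	if (st->cached == memcg && st->nr_pages >= nr_pages) {
		st->nr_pages -= nr_pages;
		return true;
	}
	return false;	/* caller falls back to res_counter_charge() */
}

int main(void)
{
	struct memcg a = { "A" }, b = { "B" };
	struct stock st = { .cached = &a, .nr_pages = 16 };

	printf("A wants 8:   %d (stock left %u)\n",
	       consume_stock(&st, &a, 8), st.nr_pages);
	printf("A wants 512: %d (huge page, above batch)\n",
	       consume_stock(&st, &a, 512));
	printf("B wants 4:   %d (different memcg cached)\n",
	       consume_stock(&st, &b, 4));
	return 0;
}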
@@ -2251,7 +2397,8 @@ enum { | |||
2251 | }; | 2397 | }; |
2252 | 2398 | ||
2253 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2399 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2254 | unsigned int nr_pages, bool oom_check) | 2400 | unsigned int nr_pages, unsigned int min_pages, |
2401 | bool oom_check) | ||
2255 | { | 2402 | { |
2256 | unsigned long csize = nr_pages * PAGE_SIZE; | 2403 | unsigned long csize = nr_pages * PAGE_SIZE; |
2257 | struct mem_cgroup *mem_over_limit; | 2404 | struct mem_cgroup *mem_over_limit; |
@@ -2274,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2274 | } else | 2421 | } else |
2275 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2422 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
2276 | /* | 2423 | /* |
2277 | * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch | ||
2278 | * of regular pages (CHARGE_BATCH), or a single regular page (1). | ||
2279 | * | ||
2280 | * Never reclaim on behalf of optional batching, retry with a | 2424 | * Never reclaim on behalf of optional batching, retry with a |
2281 | * single page instead. | 2425 | * single page instead. |
2282 | */ | 2426 | */ |
2283 | if (nr_pages == CHARGE_BATCH) | 2427 | if (nr_pages > min_pages) |
2284 | return CHARGE_RETRY; | 2428 | return CHARGE_RETRY; |
2285 | 2429 | ||
2286 | if (!(gfp_mask & __GFP_WAIT)) | 2430 | if (!(gfp_mask & __GFP_WAIT)) |
2287 | return CHARGE_WOULDBLOCK; | 2431 | return CHARGE_WOULDBLOCK; |
2288 | 2432 | ||
2433 | if (gfp_mask & __GFP_NORETRY) | ||
2434 | return CHARGE_NOMEM; | ||
2435 | |||
2289 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); | 2436 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); |
2290 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2437 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2291 | return CHARGE_RETRY; | 2438 | return CHARGE_RETRY; |
@@ -2298,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2298 | * unlikely to succeed so close to the limit, and we fall back | 2445 | * unlikely to succeed so close to the limit, and we fall back |
2299 | * to regular pages anyway in case of failure. | 2446 | * to regular pages anyway in case of failure. |
2300 | */ | 2447 | */ |
2301 | if (nr_pages == 1 && ret) | 2448 | if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) |
2302 | return CHARGE_RETRY; | 2449 | return CHARGE_RETRY; |
2303 | 2450 | ||
2304 | /* | 2451 | /* |
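The hunk above widens the "reclaim freed something, retry" case from a single page to anything up to 1 << PAGE_ALLOC_COSTLY_ORDER, so modest multi-page charges retry while THP-sized ones still bail out early. A quick check of where that cut-off lands; PAGE_ALLOC_COSTLY_ORDER is taken as 3 here, the common value, but treat it as an assumption:

#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3	/* assumed; matches common configs */
#define HPAGE_PMD_NR		512	/* illustrative 2MB THP in 4KB pages */

/* Mirrors the new condition: only small charges retry after reclaim. */
static int small_enough_to_retry(unsigned int nr_pages)
{
	return nr_pages <= (1u << PAGE_ALLOC_COSTLY_ORDER);
}

int main(void)
{
	unsigned int sizes[] = { 1, 8, 9, HPAGE_PMD_NR };

	for (int i = 0; i < 4; i++)
		printf("%3u pages -> %s\n", sizes[i],
		       small_enough_to_retry(sizes[i]) ?
		       "retry after reclaim" : "no size-based retry");
	return 0;
}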
@@ -2370,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2370 | again: | 2517 | again: |
2371 | if (*ptr) { /* css should be a valid one */ | 2518 | if (*ptr) { /* css should be a valid one */ |
2372 | memcg = *ptr; | 2519 | memcg = *ptr; |
2373 | VM_BUG_ON(css_is_removed(&memcg->css)); | ||
2374 | if (mem_cgroup_is_root(memcg)) | 2520 | if (mem_cgroup_is_root(memcg)) |
2375 | goto done; | 2521 | goto done; |
2376 | if (nr_pages == 1 && consume_stock(memcg)) | 2522 | if (consume_stock(memcg, nr_pages)) |
2377 | goto done; | 2523 | goto done; |
2378 | css_get(&memcg->css); | 2524 | css_get(&memcg->css); |
2379 | } else { | 2525 | } else { |
@@ -2398,7 +2544,7 @@ again: | |||
2398 | rcu_read_unlock(); | 2544 | rcu_read_unlock(); |
2399 | goto done; | 2545 | goto done; |
2400 | } | 2546 | } |
2401 | if (nr_pages == 1 && consume_stock(memcg)) { | 2547 | if (consume_stock(memcg, nr_pages)) { |
2402 | /* | 2548 | /* |
2403 | * It seems dagerous to access memcg without css_get(). | 2549 | * It seems dagerous to access memcg without css_get(). |
2404 | * But considering how consume_stok works, it's not | 2550 | * But considering how consume_stok works, it's not |
@@ -2433,7 +2579,8 @@ again: | |||
2433 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2579 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2434 | } | 2580 | } |
2435 | 2581 | ||
2436 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); | 2582 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, |
2583 | oom_check); | ||
2437 | switch (ret) { | 2584 | switch (ret) { |
2438 | case CHARGE_OK: | 2585 | case CHARGE_OK: |
2439 | break; | 2586 | break; |
@@ -2510,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | |||
2510 | 2657 | ||
2511 | /* | 2658 | /* |
2512 | * A helper function to get mem_cgroup from ID. must be called under | 2659 | * A helper function to get mem_cgroup from ID. must be called under |
2513 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 2660 | * rcu_read_lock(). The caller is responsible for calling css_tryget if |
2514 | * it's concern. (dropping refcnt from swap can be called against removed | 2661 | * the mem_cgroup is used for charging. (dropping refcnt from swap can be |
2515 | * memcg.) | 2662 | * called against removed memcg.) |
2516 | */ | 2663 | */ |
2517 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | 2664 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) |
2518 | { | 2665 | { |
@@ -2626,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2626 | memcg_check_events(memcg, page); | 2773 | memcg_check_events(memcg, page); |
2627 | } | 2774 | } |
2628 | 2775 | ||
2776 | static DEFINE_MUTEX(set_limit_mutex); | ||
2777 | |||
2778 | #ifdef CONFIG_MEMCG_KMEM | ||
2779 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | ||
2780 | { | ||
2781 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | ||
2782 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | ||
2783 | } | ||
2784 | |||
2785 | /* | ||
2786 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | ||
2787 | * in the memcg_cache_params struct. | ||
2788 | */ | ||
2789 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | ||
2790 | { | ||
2791 | struct kmem_cache *cachep; | ||
2792 | |||
2793 | VM_BUG_ON(p->is_root_cache); | ||
2794 | cachep = p->root_cache; | ||
2795 | return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; | ||
2796 | } | ||
2797 | |||
2798 | #ifdef CONFIG_SLABINFO | ||
2799 | static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, | ||
2800 | struct seq_file *m) | ||
2801 | { | ||
2802 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
2803 | struct memcg_cache_params *params; | ||
2804 | |||
2805 | if (!memcg_can_account_kmem(memcg)) | ||
2806 | return -EIO; | ||
2807 | |||
2808 | print_slabinfo_header(m); | ||
2809 | |||
2810 | mutex_lock(&memcg->slab_caches_mutex); | ||
2811 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | ||
2812 | cache_show(memcg_params_to_cache(params), m); | ||
2813 | mutex_unlock(&memcg->slab_caches_mutex); | ||
2814 | |||
2815 | return 0; | ||
2816 | } | ||
2817 | #endif | ||
2818 | |||
2819 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | ||
2820 | { | ||
2821 | struct res_counter *fail_res; | ||
2822 | struct mem_cgroup *_memcg; | ||
2823 | int ret = 0; | ||
2824 | bool may_oom; | ||
2825 | |||
2826 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | ||
2827 | if (ret) | ||
2828 | return ret; | ||
2829 | |||
2830 | /* | ||
2831 | * Conditions under which we can wait for the oom_killer. Those are | ||
2832 | * the same conditions tested by the core page allocator | ||
2833 | */ | ||
2834 | may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); | ||
2835 | |||
2836 | _memcg = memcg; | ||
2837 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, | ||
2838 | &_memcg, may_oom); | ||
2839 | |||
2840 | if (ret == -EINTR) { | ||
2841 | /* | ||
2842 | * __mem_cgroup_try_charge() chose to bypass to root due to | ||
2843 | * OOM kill or fatal signal. Since our only options are to | ||
2844 | * either fail the allocation or charge it to this cgroup, do | ||
2845 | * it as a temporary condition. But we can't fail. From a | ||
2846 | * kmem/slab perspective, the cache has already been selected, | ||
2847 | * by mem_cgroup_kmem_get_cache(), so it is too late to change | ||
2848 | * our minds. | ||
2849 | * | ||
2850 | * This condition will only trigger if the task entered | ||
2851 | * memcg_charge_kmem in a sane state, but was OOM-killed during | ||
2852 | * __mem_cgroup_try_charge() above. Tasks that were already | ||
2853 | * dying when the allocation triggers should have been already | ||
2854 | * directed to the root cgroup in memcontrol.h | ||
2855 | */ | ||
2856 | res_counter_charge_nofail(&memcg->res, size, &fail_res); | ||
2857 | if (do_swap_account) | ||
2858 | res_counter_charge_nofail(&memcg->memsw, size, | ||
2859 | &fail_res); | ||
2860 | ret = 0; | ||
2861 | } else if (ret) | ||
2862 | res_counter_uncharge(&memcg->kmem, size); | ||
2863 | |||
2864 | return ret; | ||
2865 | } | ||
2866 | |||
2867 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | ||
2868 | { | ||
2869 | res_counter_uncharge(&memcg->res, size); | ||
2870 | if (do_swap_account) | ||
2871 | res_counter_uncharge(&memcg->memsw, size); | ||
2872 | |||
2873 | /* Not down to 0 */ | ||
2874 | if (res_counter_uncharge(&memcg->kmem, size)) | ||
2875 | return; | ||
2876 | |||
2877 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
2878 | mem_cgroup_put(memcg); | ||
2879 | } | ||
2880 | |||
2881 | void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) | ||
2882 | { | ||
2883 | if (!memcg) | ||
2884 | return; | ||
2885 | |||
2886 | mutex_lock(&memcg->slab_caches_mutex); | ||
2887 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
2888 | mutex_unlock(&memcg->slab_caches_mutex); | ||
2889 | } | ||
2890 | |||
2891 | /* | ||
2892 | * helper for accessing a memcg's index. It will be used as an index in the | ||
2893 | * child cache array in kmem_cache, and also to derive its name. This function | ||
2894 | * will return -1 when this is not a kmem-limited memcg. | ||
2895 | */ | ||
2896 | int memcg_cache_id(struct mem_cgroup *memcg) | ||
2897 | { | ||
2898 | return memcg ? memcg->kmemcg_id : -1; | ||
2899 | } | ||
2900 | |||
2901 | /* | ||
2902 | * This ends up being protected by the set_limit mutex, during normal | ||
2903 | * operation, because that is its main call site. | ||
2904 | * | ||
2905 | * But when we create a new cache, we can call this as well if its parent | ||
2906 | * is kmem-limited. That will have to hold set_limit_mutex as well. | ||
2907 | */ | ||
2908 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | ||
2909 | { | ||
2910 | int num, ret; | ||
2911 | |||
2912 | num = ida_simple_get(&kmem_limited_groups, | ||
2913 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
2914 | if (num < 0) | ||
2915 | return num; | ||
2916 | /* | ||
2917 | * After this point, kmem_accounted (that we test atomically in | ||
2918 | * the beginning of this conditional), is no longer 0. This | ||
2919 | * guarantees only one process will set the following boolean | ||
2920 | * to true. We don't need test_and_set because we're protected | ||
2921 | * by the set_limit_mutex anyway. | ||
2922 | */ | ||
2923 | memcg_kmem_set_activated(memcg); | ||
2924 | |||
2925 | ret = memcg_update_all_caches(num+1); | ||
2926 | if (ret) { | ||
2927 | ida_simple_remove(&kmem_limited_groups, num); | ||
2928 | memcg_kmem_clear_activated(memcg); | ||
2929 | return ret; | ||
2930 | } | ||
2931 | |||
2932 | memcg->kmemcg_id = num; | ||
2933 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
2934 | mutex_init(&memcg->slab_caches_mutex); | ||
2935 | return 0; | ||
2936 | } | ||
2937 | |||
2938 | static size_t memcg_caches_array_size(int num_groups) | ||
2939 | { | ||
2940 | ssize_t size; | ||
2941 | if (num_groups <= 0) | ||
2942 | return 0; | ||
2943 | |||
2944 | size = 2 * num_groups; | ||
2945 | if (size < MEMCG_CACHES_MIN_SIZE) | ||
2946 | size = MEMCG_CACHES_MIN_SIZE; | ||
2947 | else if (size > MEMCG_CACHES_MAX_SIZE) | ||
2948 | size = MEMCG_CACHES_MAX_SIZE; | ||
2949 | |||
2950 | return size; | ||
2951 | } | ||
2952 | |||
2953 | /* | ||
2954 | * We should update the current array size iff all caches updates succeed. This | ||
2955 | * can only be done from the slab side. The slab mutex needs to be held when | ||
2956 | * calling this. | ||
2957 | */ | ||
2958 | void memcg_update_array_size(int num) | ||
2959 | { | ||
2960 | if (num > memcg_limited_groups_array_size) | ||
2961 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | ||
2962 | } | ||
2963 | |||
2964 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | ||
2965 | { | ||
2966 | struct memcg_cache_params *cur_params = s->memcg_params; | ||
2967 | |||
2968 | VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); | ||
2969 | |||
2970 | if (num_groups > memcg_limited_groups_array_size) { | ||
2971 | int i; | ||
2972 | ssize_t size = memcg_caches_array_size(num_groups); | ||
2973 | |||
2974 | size *= sizeof(void *); | ||
2975 | size += sizeof(struct memcg_cache_params); | ||
2976 | |||
2977 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
2978 | if (!s->memcg_params) { | ||
2979 | s->memcg_params = cur_params; | ||
2980 | return -ENOMEM; | ||
2981 | } | ||
2982 | |||
2983 | s->memcg_params->is_root_cache = true; | ||
2984 | |||
2985 | /* | ||
2986 | * There is the chance it will be bigger than | ||
2987 | * memcg_limited_groups_array_size, if we failed an allocation | ||
2988 | * in a cache, in which case all caches updated before it, will | ||
2989 | * have a bigger array. | ||
2990 | * | ||
2991 | * But if that is the case, the data after | ||
2992 | * memcg_limited_groups_array_size is certainly unused | ||
2993 | */ | ||
2994 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
2995 | if (!cur_params->memcg_caches[i]) | ||
2996 | continue; | ||
2997 | s->memcg_params->memcg_caches[i] = | ||
2998 | cur_params->memcg_caches[i]; | ||
2999 | } | ||
3000 | |||
3001 | /* | ||
3002 | * Ideally, we would wait until all caches succeed, and only | ||
3003 | * then free the old one. But this is not worth the extra | ||
3004 | * pointer per-cache we'd have to have for this. | ||
3005 | * | ||
3006 | * It is not a big deal if some caches are left with a size | ||
3007 | * bigger than the others. And all updates will reset this | ||
3008 | * anyway. | ||
3009 | */ | ||
3010 | kfree(cur_params); | ||
3011 | } | ||
3012 | return 0; | ||
3013 | } | ||
3014 | |||
3015 | int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | ||
3016 | struct kmem_cache *root_cache) | ||
3017 | { | ||
3018 | size_t size = sizeof(struct memcg_cache_params); | ||
3019 | |||
3020 | if (!memcg_kmem_enabled()) | ||
3021 | return 0; | ||
3022 | |||
3023 | if (!memcg) | ||
3024 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
3025 | |||
3026 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
3027 | if (!s->memcg_params) | ||
3028 | return -ENOMEM; | ||
3029 | |||
3030 | if (memcg) { | ||
3031 | s->memcg_params->memcg = memcg; | ||
3032 | s->memcg_params->root_cache = root_cache; | ||
3033 | } | ||
3034 | return 0; | ||
3035 | } | ||
3036 | |||
3037 | void memcg_release_cache(struct kmem_cache *s) | ||
3038 | { | ||
3039 | struct kmem_cache *root; | ||
3040 | struct mem_cgroup *memcg; | ||
3041 | int id; | ||
3042 | |||
3043 | /* | ||
3044 | * This happens, for instance, when a root cache goes away before we | ||
3045 | * add any memcg. | ||
3046 | */ | ||
3047 | if (!s->memcg_params) | ||
3048 | return; | ||
3049 | |||
3050 | if (s->memcg_params->is_root_cache) | ||
3051 | goto out; | ||
3052 | |||
3053 | memcg = s->memcg_params->memcg; | ||
3054 | id = memcg_cache_id(memcg); | ||
3055 | |||
3056 | root = s->memcg_params->root_cache; | ||
3057 | root->memcg_params->memcg_caches[id] = NULL; | ||
3058 | mem_cgroup_put(memcg); | ||
3059 | |||
3060 | mutex_lock(&memcg->slab_caches_mutex); | ||
3061 | list_del(&s->memcg_params->list); | ||
3062 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3063 | |||
3064 | out: | ||
3065 | kfree(s->memcg_params); | ||
3066 | } | ||
3067 | |||
3068 | /* | ||
3069 | * During the creation of a new cache, we need to disable our accounting mechanism | ||
3070 | * altogether. This is true even if we are not creating, but rather just | ||
3071 | * enqueueing new caches to be created. | ||
3072 | * | ||
3073 | * This is because that process will trigger allocations; some visible, like | ||
3074 | * explicit kmallocs to auxiliary data structures, name strings and internal | ||
3075 | * cache structures; some well concealed, like INIT_WORK() that can allocate | ||
3076 | * objects during debug. | ||
3077 | * | ||
3078 | * If any allocation happens during memcg_kmem_get_cache, we will recurse back | ||
3079 | * to it. This may not be a bounded recursion: since the first cache creation | ||
3080 | * failed to complete (waiting on the allocation), we'll just try to create the | ||
3081 | * cache again, failing at the same point. | ||
3082 | * | ||
3083 | * memcg_kmem_get_cache is prepared to abort after seeing a positive count of | ||
3084 | * memcg_kmem_skip_account. So we enclose anything that might allocate memory | ||
3085 | * inside the following two functions. | ||
3086 | */ | ||
3087 | static inline void memcg_stop_kmem_account(void) | ||
3088 | { | ||
3089 | VM_BUG_ON(!current->mm); | ||
3090 | current->memcg_kmem_skip_account++; | ||
3091 | } | ||
3092 | |||
3093 | static inline void memcg_resume_kmem_account(void) | ||
3094 | { | ||
3095 | VM_BUG_ON(!current->mm); | ||
3096 | current->memcg_kmem_skip_account--; | ||
3097 | } | ||
3098 | |||
3099 | static void kmem_cache_destroy_work_func(struct work_struct *w) | ||
3100 | { | ||
3101 | struct kmem_cache *cachep; | ||
3102 | struct memcg_cache_params *p; | ||
3103 | |||
3104 | p = container_of(w, struct memcg_cache_params, destroy); | ||
3105 | |||
3106 | cachep = memcg_params_to_cache(p); | ||
3107 | |||
3108 | /* | ||
3109 | * If we get down to 0 after shrink, we could delete right away. | ||
3110 | * However, memcg_release_pages() already puts us back in the workqueue | ||
3111 | * in that case. If we proceed deleting, we'll get a dangling | ||
3112 | * reference, and removing the object from the workqueue in that case | ||
3113 | * is unnecessary complication. We are not a fast path. | ||
3114 | * | ||
3115 | * Note that this case is fundamentally different from racing with | ||
3116 | * shrink_slab(): if memcg_cgroup_destroy_cache() is called in | ||
3117 | * kmem_cache_shrink, not only we would be reinserting a dead cache | ||
3118 | * into the queue, but doing so from inside the worker racing to | ||
3119 | * destroy it. | ||
3120 | * | ||
3121 | * So if we aren't down to zero, we'll just schedule a worker and try | ||
3122 | * again | ||
3123 | */ | ||
3124 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { | ||
3125 | kmem_cache_shrink(cachep); | ||
3126 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | ||
3127 | return; | ||
3128 | } else | ||
3129 | kmem_cache_destroy(cachep); | ||
3130 | } | ||
3131 | |||
3132 | void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | ||
3133 | { | ||
3134 | if (!cachep->memcg_params->dead) | ||
3135 | return; | ||
3136 | |||
3137 | /* | ||
3138 | * There are many ways in which we can get here. | ||
3139 | * | ||
3140 | * We can get to a memory-pressure situation while the delayed work is | ||
3141 | * still pending to run. The vmscan shrinkers can then release all | ||
3142 | * cache memory and get us to destruction. If this is the case, we'll | ||
3143 | * be executed twice, which is a bug (the second time will execute over | ||
3144 | * bogus data). In this case, cancelling the work should be fine. | ||
3145 | * | ||
3146 | * But we can also get here from the worker itself, if | ||
3147 | * kmem_cache_shrink is enough to shake all the remaining objects and | ||
3148 | * get the page count to 0. In this case, we'll deadlock if we try to | ||
3149 | * cancel the work (the worker runs with an internal lock held, which | ||
3150 | * is the same lock we would hold for cancel_work_sync().) | ||
3151 | * | ||
3152 | * Since we can't possibly know who got us here, just refrain from | ||
3153 | * running if there is already work pending | ||
3154 | */ | ||
3155 | if (work_pending(&cachep->memcg_params->destroy)) | ||
3156 | return; | ||
3157 | /* | ||
3158 | * We have to defer the actual destroying to a workqueue, because | ||
3159 | * we might currently be in a context that cannot sleep. | ||
3160 | */ | ||
3161 | schedule_work(&cachep->memcg_params->destroy); | ||
3162 | } | ||
3163 | |||
3164 | static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) | ||
3165 | { | ||
3166 | char *name; | ||
3167 | struct dentry *dentry; | ||
3168 | |||
3169 | rcu_read_lock(); | ||
3170 | dentry = rcu_dereference(memcg->css.cgroup->dentry); | ||
3171 | rcu_read_unlock(); | ||
3172 | |||
3173 | BUG_ON(dentry == NULL); | ||
3174 | |||
3175 | name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, | ||
3176 | memcg_cache_id(memcg), dentry->d_name.name); | ||
3177 | |||
3178 | return name; | ||
3179 | } | ||
3180 | |||
3181 | static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | ||
3182 | struct kmem_cache *s) | ||
3183 | { | ||
3184 | char *name; | ||
3185 | struct kmem_cache *new; | ||
3186 | |||
3187 | name = memcg_cache_name(memcg, s); | ||
3188 | if (!name) | ||
3189 | return NULL; | ||
3190 | |||
3191 | new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, | ||
3192 | (s->flags & ~SLAB_PANIC), s->ctor, s); | ||
3193 | |||
3194 | if (new) | ||
3195 | new->allocflags |= __GFP_KMEMCG; | ||
3196 | |||
3197 | kfree(name); | ||
3198 | return new; | ||
3199 | } | ||
3200 | |||
3201 | /* | ||
3202 | * This lock protects updaters, not readers. We want readers to be as fast as | ||
3203 | * they can, and they will either see NULL or a valid cache value. Our model | ||
3204 | * allows them to see NULL, in which case the root memcg will be selected. | ||
3205 | * | ||
3206 | * We need this lock because multiple allocations to the same cache from a non | ||
3207 | * will span more than one worker. Only one of them can create the cache. | ||
3208 | */ | ||
3209 | static DEFINE_MUTEX(memcg_cache_mutex); | ||
3210 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | ||
3211 | struct kmem_cache *cachep) | ||
3212 | { | ||
3213 | struct kmem_cache *new_cachep; | ||
3214 | int idx; | ||
3215 | |||
3216 | BUG_ON(!memcg_can_account_kmem(memcg)); | ||
3217 | |||
3218 | idx = memcg_cache_id(memcg); | ||
3219 | |||
3220 | mutex_lock(&memcg_cache_mutex); | ||
3221 | new_cachep = cachep->memcg_params->memcg_caches[idx]; | ||
3222 | if (new_cachep) | ||
3223 | goto out; | ||
3224 | |||
3225 | new_cachep = kmem_cache_dup(memcg, cachep); | ||
3226 | if (new_cachep == NULL) { | ||
3227 | new_cachep = cachep; | ||
3228 | goto out; | ||
3229 | } | ||
3230 | |||
3231 | mem_cgroup_get(memcg); | ||
3232 | atomic_set(&new_cachep->memcg_params->nr_pages , 0); | ||
3233 | |||
3234 | cachep->memcg_params->memcg_caches[idx] = new_cachep; | ||
3235 | /* | ||
3236 | * the readers won't lock, make sure everybody sees the updated value, | ||
3237 | * so they won't put stuff in the queue again for no reason | ||
3238 | */ | ||
3239 | wmb(); | ||
3240 | out: | ||
3241 | mutex_unlock(&memcg_cache_mutex); | ||
3242 | return new_cachep; | ||
3243 | } | ||
3244 | |||
3245 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3246 | { | ||
3247 | struct kmem_cache *c; | ||
3248 | int i; | ||
3249 | |||
3250 | if (!s->memcg_params) | ||
3251 | return; | ||
3252 | if (!s->memcg_params->is_root_cache) | ||
3253 | return; | ||
3254 | |||
3255 | /* | ||
3256 | * If the cache is being destroyed, we trust that there is no one else | ||
3257 | * requesting objects from it. Even if there are, the sanity checks in | ||
3258 | * kmem_cache_destroy should catch this ill case. | ||
3259 | * | ||
3260 | * Still, we don't want anyone else freeing memcg_caches under our | ||
3261 | * noses, which can happen if a new memcg comes to life. As usual, | ||
3262 | * we'll take the set_limit_mutex to protect ourselves against this. | ||
3263 | */ | ||
3264 | mutex_lock(&set_limit_mutex); | ||
3265 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
3266 | c = s->memcg_params->memcg_caches[i]; | ||
3267 | if (!c) | ||
3268 | continue; | ||
3269 | |||
3270 | /* | ||
3271 | * We will now manually delete the caches, so to avoid races | ||
3272 | * we need to cancel all pending destruction workers and | ||
3273 | * proceed with destruction ourselves. | ||
3274 | * | ||
3275 | * kmem_cache_destroy() will call kmem_cache_shrink internally, | ||
3276 | * and that could spawn the workers again: it is likely that | ||
3277 | * the cache still has active pages until this very moment. | ||
3278 | * This would lead us back to mem_cgroup_destroy_cache. | ||
3279 | * | ||
3280 | * But that will not execute at all if the "dead" flag is not | ||
3281 | * set, so flip it down to guarantee we are in control. | ||
3282 | */ | ||
3283 | c->memcg_params->dead = false; | ||
3284 | cancel_work_sync(&c->memcg_params->destroy); | ||
3285 | kmem_cache_destroy(c); | ||
3286 | } | ||
3287 | mutex_unlock(&set_limit_mutex); | ||
3288 | } | ||
3289 | |||
3290 | struct create_work { | ||
3291 | struct mem_cgroup *memcg; | ||
3292 | struct kmem_cache *cachep; | ||
3293 | struct work_struct work; | ||
3294 | }; | ||
3295 | |||
3296 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | ||
3297 | { | ||
3298 | struct kmem_cache *cachep; | ||
3299 | struct memcg_cache_params *params; | ||
3300 | |||
3301 | if (!memcg_kmem_is_active(memcg)) | ||
3302 | return; | ||
3303 | |||
3304 | mutex_lock(&memcg->slab_caches_mutex); | ||
3305 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) { | ||
3306 | cachep = memcg_params_to_cache(params); | ||
3307 | cachep->memcg_params->dead = true; | ||
3308 | INIT_WORK(&cachep->memcg_params->destroy, | ||
3309 | kmem_cache_destroy_work_func); | ||
3310 | schedule_work(&cachep->memcg_params->destroy); | ||
3311 | } | ||
3312 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3313 | } | ||
3314 | |||
3315 | static void memcg_create_cache_work_func(struct work_struct *w) | ||
3316 | { | ||
3317 | struct create_work *cw; | ||
3318 | |||
3319 | cw = container_of(w, struct create_work, work); | ||
3320 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | ||
3321 | /* Drop the reference gotten when we enqueued. */ | ||
3322 | css_put(&cw->memcg->css); | ||
3323 | kfree(cw); | ||
3324 | } | ||
3325 | |||
3326 | /* | ||
3327 | * Enqueue the creation of a per-memcg kmem_cache. | ||
3328 | * Called with rcu_read_lock. | ||
3329 | */ | ||
3330 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | ||
3331 | struct kmem_cache *cachep) | ||
3332 | { | ||
3333 | struct create_work *cw; | ||
3334 | |||
3335 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); | ||
3336 | if (cw == NULL) | ||
3337 | return; | ||
3338 | |||
3339 | /* The corresponding put will be done in the workqueue. */ | ||
3340 | if (!css_tryget(&memcg->css)) { | ||
3341 | kfree(cw); | ||
3342 | return; | ||
3343 | } | ||
3344 | |||
3345 | cw->memcg = memcg; | ||
3346 | cw->cachep = cachep; | ||
3347 | |||
3348 | INIT_WORK(&cw->work, memcg_create_cache_work_func); | ||
3349 | schedule_work(&cw->work); | ||
3350 | } | ||
3351 | |||
3352 | static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | ||
3353 | struct kmem_cache *cachep) | ||
3354 | { | ||
3355 | /* | ||
3356 | * We need to stop accounting when we kmalloc, because if the | ||
3357 | * corresponding kmalloc cache is not yet created, the first allocation | ||
3358 | * in __memcg_create_cache_enqueue will recurse. | ||
3359 | * | ||
3360 | * However, it is better to enclose the whole function. Depending on | ||
3361 | * the debugging options enabled, INIT_WORK(), for instance, can | ||
3362 | * trigger an allocation. This too, will make us recurse. Because at | ||
3363 | * this point we can't allow ourselves back into memcg_kmem_get_cache, | ||
3364 | * the safest choice is to do it like this, wrapping the whole function. | ||
3365 | */ | ||
3366 | memcg_stop_kmem_account(); | ||
3367 | __memcg_create_cache_enqueue(memcg, cachep); | ||
3368 | memcg_resume_kmem_account(); | ||
3369 | } | ||
3370 | /* | ||
3371 | * Return the kmem_cache we're supposed to use for a slab allocation. | ||
3372 | * We try to use the current memcg's version of the cache. | ||
3373 | * | ||
3374 | * If the cache does not exist yet, if we are the first user of it, | ||
3375 | * we either create it immediately, if possible, or create it asynchronously | ||
3376 | * in a workqueue. | ||
3377 | * In the latter case, we will let the current allocation go through with | ||
3378 | * the original cache. | ||
3379 | * | ||
3380 | * Can't be called in interrupt context or from kernel threads. | ||
3381 | * This function needs to be called with rcu_read_lock() held. | ||
3382 | */ | ||
3383 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | ||
3384 | gfp_t gfp) | ||
3385 | { | ||
3386 | struct mem_cgroup *memcg; | ||
3387 | int idx; | ||
3388 | |||
3389 | VM_BUG_ON(!cachep->memcg_params); | ||
3390 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | ||
3391 | |||
3392 | if (!current->mm || current->memcg_kmem_skip_account) | ||
3393 | return cachep; | ||
3394 | |||
3395 | rcu_read_lock(); | ||
3396 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | ||
3397 | rcu_read_unlock(); | ||
3398 | |||
3399 | if (!memcg_can_account_kmem(memcg)) | ||
3400 | return cachep; | ||
3401 | |||
3402 | idx = memcg_cache_id(memcg); | ||
3403 | |||
3404 | /* | ||
3405 | * barrier to make sure we're always seeing the up-to-date value. The | ||
3406 | * code updating memcg_caches will issue a write barrier to match this. | ||
3407 | */ | ||
3408 | read_barrier_depends(); | ||
3409 | if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { | ||
3410 | /* | ||
3411 | * If we are in a safe context (can wait, and not in interrupt | ||
3412 | * context), we could be predictable and return right away. | ||
3413 | * This would guarantee that the allocation being performed | ||
3414 | * already belongs in the new cache. | ||
3415 | * | ||
3416 | * However, there are some clashes that can arise from locking. | ||
3417 | * For instance, because we acquire the slab_mutex while doing | ||
3418 | * kmem_cache_dup, this means no further allocation could happen | ||
3419 | * with the slab_mutex held. | ||
3420 | * | ||
3421 | * Also, because cache creation issues get_online_cpus(), this | ||
3422 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3423 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3424 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3425 | * better to defer everything. | ||
3426 | */ | ||
3427 | memcg_create_cache_enqueue(memcg, cachep); | ||
3428 | return cachep; | ||
3429 | } | ||
3430 | |||
3431 | return cachep->memcg_params->memcg_caches[idx]; | ||
3432 | } | ||
3433 | EXPORT_SYMBOL(__memcg_kmem_get_cache); | ||
3434 | |||
3435 | /* | ||
3436 | * We need to verify if the allocation against current->mm->owner's memcg is | ||
3437 | * possible for the given order. But the page is not allocated yet, so we'll | ||
3438 | * need a further commit step to do the final arrangements. | ||
3439 | * | ||
3440 | * It is possible for the task to switch cgroups in the meantime, so at | ||
3441 | * commit time, we can't rely on task conversion any longer. We'll then use | ||
3442 | * the handle argument to return to the caller which cgroup we should commit | ||
3443 | * against. We could also return the memcg directly and avoid the pointer | ||
3444 | * passing, but a boolean return value gives better semantics considering | ||
3445 | * the compiled-out case as well. | ||
3446 | * | ||
3447 | * Returning true means the allocation is possible. | ||
3448 | */ | ||
3449 | bool | ||
3450 | __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | ||
3451 | { | ||
3452 | struct mem_cgroup *memcg; | ||
3453 | int ret; | ||
3454 | |||
3455 | *_memcg = NULL; | ||
3456 | memcg = try_get_mem_cgroup_from_mm(current->mm); | ||
3457 | |||
3458 | /* | ||
3459 | * very rare case described in mem_cgroup_from_task. Unfortunately there | ||
3460 | * isn't much we can do without complicating this too much, and it would | ||
3461 | * be gfp-dependent anyway. Just let it go | ||
3462 | */ | ||
3463 | if (unlikely(!memcg)) | ||
3464 | return true; | ||
3465 | |||
3466 | if (!memcg_can_account_kmem(memcg)) { | ||
3467 | css_put(&memcg->css); | ||
3468 | return true; | ||
3469 | } | ||
3470 | |||
3471 | ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); | ||
3472 | if (!ret) | ||
3473 | *_memcg = memcg; | ||
3474 | |||
3475 | css_put(&memcg->css); | ||
3476 | return (ret == 0); | ||
3477 | } | ||
3478 | |||
3479 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | ||
3480 | int order) | ||
3481 | { | ||
3482 | struct page_cgroup *pc; | ||
3483 | |||
3484 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | ||
3485 | |||
3486 | /* The page allocation failed. Revert */ | ||
3487 | if (!page) { | ||
3488 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | ||
3489 | return; | ||
3490 | } | ||
3491 | |||
3492 | pc = lookup_page_cgroup(page); | ||
3493 | lock_page_cgroup(pc); | ||
3494 | pc->mem_cgroup = memcg; | ||
3495 | SetPageCgroupUsed(pc); | ||
3496 | unlock_page_cgroup(pc); | ||
3497 | } | ||
3498 | |||
3499 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | ||
3500 | { | ||
3501 | struct mem_cgroup *memcg = NULL; | ||
3502 | struct page_cgroup *pc; | ||
3503 | |||
3504 | |||
3505 | pc = lookup_page_cgroup(page); | ||
3506 | /* | ||
3507 | * Fast unlocked return. Theoretically might have changed, have to | ||
3508 | * check again after locking. | ||
3509 | */ | ||
3510 | if (!PageCgroupUsed(pc)) | ||
3511 | return; | ||
3512 | |||
3513 | lock_page_cgroup(pc); | ||
3514 | if (PageCgroupUsed(pc)) { | ||
3515 | memcg = pc->mem_cgroup; | ||
3516 | ClearPageCgroupUsed(pc); | ||
3517 | } | ||
3518 | unlock_page_cgroup(pc); | ||
3519 | |||
3520 | /* | ||
3521 | * We trust that only if there is a memcg associated with the page, it | ||
3522 | * is a valid allocation | ||
3523 | */ | ||
3524 | if (!memcg) | ||
3525 | return; | ||
3526 | |||
3527 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | ||
3528 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | ||
3529 | } | ||
3530 | #else | ||
3531 | static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | ||
3532 | { | ||
3533 | } | ||
3534 | #endif /* CONFIG_MEMCG_KMEM */ | ||
3535 | |||
2629 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 3536 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2630 | 3537 | ||
2631 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) | 3538 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) |
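The kmem accounting block added above ends with the page-level hooks __memcg_kmem_newpage_charge(), __memcg_kmem_commit_charge() and __memcg_kmem_uncharge_pages(): the charge is reserved before the allocation, bound to the page (or reverted) once the page is known, and dropped at free time. The real call sites live outside this diff, so the following is only a guess at the sequencing, with stand-in types and a plain counter in place of res_counter:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096L

/* Stand-ins: only the charge/commit/uncharge ordering is being modelled. */
struct mem_cgroup { long kmem_usage; long kmem_limit; };
struct page	  { struct mem_cgroup *memcg; };

static struct mem_cgroup current_memcg = { .kmem_usage = 0,
					   .kmem_limit = 64 * PAGE_SIZE };

/* ~ __memcg_kmem_newpage_charge(): reserve before allocating */
static bool newpage_charge(struct mem_cgroup **out, int order)
{
	long size = PAGE_SIZE << order;

	if (current_memcg.kmem_usage + size > current_memcg.kmem_limit)
		return false;
	current_memcg.kmem_usage += size;
	*out = &current_memcg;
	return true;
}

/* ~ __memcg_kmem_commit_charge(): bind the charge to the page, or revert */
static void commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
{
	if (!page) {
		memcg->kmem_usage -= PAGE_SIZE << order;
		return;
	}
	page->memcg = memcg;
}

/* ~ __memcg_kmem_uncharge_pages(): drop the charge at free time */
static void uncharge_pages(struct page *page, int order)
{
	if (page->memcg)
		page->memcg->kmem_usage -= PAGE_SIZE << order;
}

int main(void)
{
	struct mem_cgroup *memcg = NULL;
	struct page page = { 0 };
	int order = 2;

	if (!newpage_charge(&memcg, order))
		return 1;			/* charge refused: no alloc */
	/* the real page allocation would go here; pretend it succeeded */
	commit_charge(&page, memcg, order);
	printf("usage after commit: %ld bytes\n", current_memcg.kmem_usage);

	uncharge_pages(&page, order);
	printf("usage after free:   %ld bytes\n", current_memcg.kmem_usage);
	return 0;
}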
@@ -2709,13 +3616,6 @@ static int mem_cgroup_move_account(struct page *page, | |||
2709 | /* caller should have done css_get */ | 3616 | /* caller should have done css_get */ |
2710 | pc->mem_cgroup = to; | 3617 | pc->mem_cgroup = to; |
2711 | mem_cgroup_charge_statistics(to, anon, nr_pages); | 3618 | mem_cgroup_charge_statistics(to, anon, nr_pages); |
2712 | /* | ||
2713 | * We charges against "to" which may not have any tasks. Then, "to" | ||
2714 | * can be under rmdir(). But in current implementation, caller of | ||
2715 | * this function is just force_empty() and move charge, so it's | ||
2716 | * guaranteed that "to" is never removed. So, we don't check rmdir | ||
2717 | * status here. | ||
2718 | */ | ||
2719 | move_unlock_mem_cgroup(from, &flags); | 3619 | move_unlock_mem_cgroup(from, &flags); |
2720 | ret = 0; | 3620 | ret = 0; |
2721 | unlock: | 3621 | unlock: |
@@ -2729,10 +3629,27 @@ out: | |||
2729 | return ret; | 3629 | return ret; |
2730 | } | 3630 | } |
2731 | 3631 | ||
2732 | /* | 3632 | /** |
2733 | * move charges to its parent. | 3633 | * mem_cgroup_move_parent - moves page to the parent group |
3634 | * @page: the page to move | ||
3635 | * @pc: page_cgroup of the page | ||
3636 | * @child: page's cgroup | ||
3637 | * | ||
3638 | * move charges to its parent or the root cgroup if the group has no | ||
3639 | * parent (aka use_hierarchy==0). | ||
3640 | * Although this might fail (get_page_unless_zero, isolate_lru_page or | ||
3641 | * mem_cgroup_move_account fails) the failure is always temporary and | ||
3642 | * it signals a race with a page removal/uncharge or migration. In the | ||
3643 | * first case the page is on the way out and it will vanish from the LRU | ||
3644 | * on the next attempt and the call should be retried later. | ||
3645 | * Isolation from the LRU fails only if the page has been isolated from | ||
3646 | * the LRU since we looked at it and that usually means either global | ||
3647 | * reclaim or migration going on. The page will either get back to the | ||
3648 | * LRU or vanish. | ||
3649 | * Finally mem_cgroup_move_account fails only if the page got uncharged | ||
3650 | * (!PageCgroupUsed) or moved to a different group. The page will | ||
3651 | * disappear in the next attempt. | ||
2734 | */ | 3652 | */ |
2735 | |||
2736 | static int mem_cgroup_move_parent(struct page *page, | 3653 | static int mem_cgroup_move_parent(struct page *page, |
2737 | struct page_cgroup *pc, | 3654 | struct page_cgroup *pc, |
2738 | struct mem_cgroup *child) | 3655 | struct mem_cgroup *child) |
@@ -2742,9 +3659,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2742 | unsigned long uninitialized_var(flags); | 3659 | unsigned long uninitialized_var(flags); |
2743 | int ret; | 3660 | int ret; |
2744 | 3661 | ||
2745 | /* Is ROOT ? */ | 3662 | VM_BUG_ON(mem_cgroup_is_root(child)); |
2746 | if (mem_cgroup_is_root(child)) | ||
2747 | return -EINVAL; | ||
2748 | 3663 | ||
2749 | ret = -EBUSY; | 3664 | ret = -EBUSY; |
2750 | if (!get_page_unless_zero(page)) | 3665 | if (!get_page_unless_zero(page)) |
@@ -2761,8 +3676,10 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2761 | if (!parent) | 3676 | if (!parent) |
2762 | parent = root_mem_cgroup; | 3677 | parent = root_mem_cgroup; |
2763 | 3678 | ||
2764 | if (nr_pages > 1) | 3679 | if (nr_pages > 1) { |
3680 | VM_BUG_ON(!PageTransHuge(page)); | ||
2765 | flags = compound_lock_irqsave(page); | 3681 | flags = compound_lock_irqsave(page); |
3682 | } | ||
2766 | 3683 | ||
2767 | ret = mem_cgroup_move_account(page, nr_pages, | 3684 | ret = mem_cgroup_move_account(page, nr_pages, |
2768 | pc, child, parent); | 3685 | pc, child, parent); |
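The kernel-doc above promises that every failure of mem_cgroup_move_parent() is transient (a racing uncharge, migration, or LRU isolation), so a caller only has to retry until the page is either moved or has vanished. The snippet below sketches that retry contract; move_to_parent() is a hypothetical stand-in that fails a few times before succeeding, not the kernel function.

/* Sketch of the "failures are transient, just retry" contract described
 * in the kernel-doc above. */
#include <stdio.h>
#include <errno.h>

static int attempts_left = 3;

static int move_to_parent(int page_id)
{
        (void)page_id;                  /* a real caller would pass the page */
        if (attempts_left-- > 0)
                return -EBUSY;          /* simulated race: page busy */
        return 0;                       /* moved, or already gone */
}

int main(void)
{
        int ret;

        do {
                ret = move_to_parent(42);
                if (ret == -EBUSY)
                        printf("page busy, retrying\n");
        } while (ret == -EBUSY);

        printf("done (ret=%d)\n", ret);
        return 0;
}

This is essentially what the reworked mem_cgroup_force_empty_list() below does with its do/while loop over the LRU list.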
@@ -2904,7 +3821,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2904 | return; | 3821 | return; |
2905 | if (!memcg) | 3822 | if (!memcg) |
2906 | return; | 3823 | return; |
2907 | cgroup_exclude_rmdir(&memcg->css); | ||
2908 | 3824 | ||
2909 | __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); | 3825 | __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); |
2910 | /* | 3826 | /* |
@@ -2918,12 +3834,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2918 | swp_entry_t ent = {.val = page_private(page)}; | 3834 | swp_entry_t ent = {.val = page_private(page)}; |
2919 | mem_cgroup_uncharge_swap(ent); | 3835 | mem_cgroup_uncharge_swap(ent); |
2920 | } | 3836 | } |
2921 | /* | ||
2922 | * At swapin, we may charge account against cgroup which has no tasks. | ||
2923 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
2924 | * In that case, we need to call pre_destroy() again. check it here. | ||
2925 | */ | ||
2926 | cgroup_release_and_wakeup_rmdir(&memcg->css); | ||
2927 | } | 3837 | } |
2928 | 3838 | ||
2929 | void mem_cgroup_commit_charge_swapin(struct page *page, | 3839 | void mem_cgroup_commit_charge_swapin(struct page *page, |
@@ -3288,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3288 | struct mem_cgroup **memcgp) | 4198 | struct mem_cgroup **memcgp) |
3289 | { | 4199 | { |
3290 | struct mem_cgroup *memcg = NULL; | 4200 | struct mem_cgroup *memcg = NULL; |
4201 | unsigned int nr_pages = 1; | ||
3291 | struct page_cgroup *pc; | 4202 | struct page_cgroup *pc; |
3292 | enum charge_type ctype; | 4203 | enum charge_type ctype; |
3293 | 4204 | ||
3294 | *memcgp = NULL; | 4205 | *memcgp = NULL; |
3295 | 4206 | ||
3296 | VM_BUG_ON(PageTransHuge(page)); | ||
3297 | if (mem_cgroup_disabled()) | 4207 | if (mem_cgroup_disabled()) |
3298 | return; | 4208 | return; |
3299 | 4209 | ||
4210 | if (PageTransHuge(page)) | ||
4211 | nr_pages <<= compound_order(page); | ||
4212 | |||
3300 | pc = lookup_page_cgroup(page); | 4213 | pc = lookup_page_cgroup(page); |
3301 | lock_page_cgroup(pc); | 4214 | lock_page_cgroup(pc); |
3302 | if (PageCgroupUsed(pc)) { | 4215 | if (PageCgroupUsed(pc)) { |
@@ -3358,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3358 | * charged to the res_counter since we plan on replacing the | 4271 | * charged to the res_counter since we plan on replacing the |
3359 | * old one and only one page is going to be left afterwards. | 4272 | * old one and only one page is going to be left afterwards. |
3360 | */ | 4273 | */ |
3361 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 4274 | __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); |
3362 | } | 4275 | } |
3363 | 4276 | ||
3364 | /* remove redundant charge if migration failed*/ | 4277 | /* remove redundant charge if migration failed*/ |
@@ -3371,8 +4284,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3371 | 4284 | ||
3372 | if (!memcg) | 4285 | if (!memcg) |
3373 | return; | 4286 | return; |
3374 | /* blocks rmdir() */ | 4287 | |
3375 | cgroup_exclude_rmdir(&memcg->css); | ||
3376 | if (!migration_ok) { | 4288 | if (!migration_ok) { |
3377 | used = oldpage; | 4289 | used = oldpage; |
3378 | unused = newpage; | 4290 | unused = newpage; |
@@ -3406,13 +4318,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3406 | */ | 4318 | */ |
3407 | if (anon) | 4319 | if (anon) |
3408 | mem_cgroup_uncharge_page(used); | 4320 | mem_cgroup_uncharge_page(used); |
3409 | /* | ||
3410 | * At migration, we may charge account against cgroup which has no | ||
3411 | * tasks. | ||
3412 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
3413 | * In that case, we need to call pre_destroy() again. check it here. | ||
3414 | */ | ||
3415 | cgroup_release_and_wakeup_rmdir(&memcg->css); | ||
3416 | } | 4321 | } |
3417 | 4322 | ||
3418 | /* | 4323 | /* |
@@ -3490,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
3490 | } | 4395 | } |
3491 | #endif | 4396 | #endif |
3492 | 4397 | ||
3493 | static DEFINE_MUTEX(set_limit_mutex); | ||
3494 | |||
3495 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 4398 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
3496 | unsigned long long val) | 4399 | unsigned long long val) |
3497 | { | 4400 | { |
@@ -3712,17 +4615,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3712 | return nr_reclaimed; | 4615 | return nr_reclaimed; |
3713 | } | 4616 | } |
3714 | 4617 | ||
3715 | /* | 4618 | /** |
4619 | * mem_cgroup_force_empty_list - clears LRU of a group | ||
4620 | * @memcg: group to clear | ||
4621 | * @node: NUMA node | ||
4622 | * @zid: zone id | ||
4623 | * @lru: lru to clear | ||
4624 | * | ||
3716 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't | 4625 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3717 | * reclaim the pages themselves - it just removes the page_cgroups. | 4626 | * reclaim the pages themselves - pages are moved to the parent (or root)
3718 | * Returns true if some page_cgroups were not freed, indicating that the caller | 4627 | * group. |
3719 | * must retry this operation. | ||
3720 | */ | 4628 | */ |
3721 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 4629 | static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3722 | int node, int zid, enum lru_list lru) | 4630 | int node, int zid, enum lru_list lru) |
3723 | { | 4631 | { |
3724 | struct lruvec *lruvec; | 4632 | struct lruvec *lruvec; |
3725 | unsigned long flags, loop; | 4633 | unsigned long flags; |
3726 | struct list_head *list; | 4634 | struct list_head *list; |
3727 | struct page *busy; | 4635 | struct page *busy; |
3728 | struct zone *zone; | 4636 | struct zone *zone; |
@@ -3731,11 +4639,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3731 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 4639 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
3732 | list = &lruvec->lists[lru]; | 4640 | list = &lruvec->lists[lru]; |
3733 | 4641 | ||
3734 | loop = mem_cgroup_get_lru_size(lruvec, lru); | ||
3735 | /* give some margin against EBUSY etc...*/ | ||
3736 | loop += 256; | ||
3737 | busy = NULL; | 4642 | busy = NULL; |
3738 | while (loop--) { | 4643 | do { |
3739 | struct page_cgroup *pc; | 4644 | struct page_cgroup *pc; |
3740 | struct page *page; | 4645 | struct page *page; |
3741 | 4646 | ||
@@ -3761,76 +4666,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3761 | cond_resched(); | 4666 | cond_resched(); |
3762 | } else | 4667 | } else |
3763 | busy = NULL; | 4668 | busy = NULL; |
3764 | } | 4669 | } while (!list_empty(list)); |
3765 | return !list_empty(list); | ||
3766 | } | 4670 | } |
3767 | 4671 | ||
3768 | /* | 4672 | /* |
3769 | * make mem_cgroup's charge 0 if there is no task. | 4673 | * make mem_cgroup's charge 0 if there is no task by moving
4674 | * all the charges and pages to the parent. | ||
3770 | * This enables deleting this mem_cgroup. | 4675 | * This enables deleting this mem_cgroup. |
4676 | * | ||
4677 | * Caller is responsible for holding css reference on the memcg. | ||
3771 | */ | 4678 | */ |
3772 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) | 4679 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) |
3773 | { | 4680 | { |
3774 | int ret; | 4681 | int node, zid; |
3775 | int node, zid, shrink; | 4682 | u64 usage; |
3776 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
3777 | struct cgroup *cgrp = memcg->css.cgroup; | ||
3778 | |||
3779 | css_get(&memcg->css); | ||
3780 | 4683 | ||
3781 | shrink = 0; | ||
3782 | /* should free all ? */ | ||
3783 | if (free_all) | ||
3784 | goto try_to_free; | ||
3785 | move_account: | ||
3786 | do { | 4684 | do { |
3787 | ret = -EBUSY; | ||
3788 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | ||
3789 | goto out; | ||
3790 | /* This is for making all *used* pages to be on LRU. */ | 4685 | /* This is for making all *used* pages to be on LRU. */ |
3791 | lru_add_drain_all(); | 4686 | lru_add_drain_all(); |
3792 | drain_all_stock_sync(memcg); | 4687 | drain_all_stock_sync(memcg); |
3793 | ret = 0; | ||
3794 | mem_cgroup_start_move(memcg); | 4688 | mem_cgroup_start_move(memcg); |
3795 | for_each_node_state(node, N_HIGH_MEMORY) { | 4689 | for_each_node_state(node, N_MEMORY) { |
3796 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 4690 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
3797 | enum lru_list lru; | 4691 | enum lru_list lru; |
3798 | for_each_lru(lru) { | 4692 | for_each_lru(lru) { |
3799 | ret = mem_cgroup_force_empty_list(memcg, | 4693 | mem_cgroup_force_empty_list(memcg, |
3800 | node, zid, lru); | 4694 | node, zid, lru); |
3801 | if (ret) | ||
3802 | break; | ||
3803 | } | 4695 | } |
3804 | } | 4696 | } |
3805 | if (ret) | ||
3806 | break; | ||
3807 | } | 4697 | } |
3808 | mem_cgroup_end_move(memcg); | 4698 | mem_cgroup_end_move(memcg); |
3809 | memcg_oom_recover(memcg); | 4699 | memcg_oom_recover(memcg); |
3810 | cond_resched(); | 4700 | cond_resched(); |
3811 | /* "ret" should also be checked to ensure all lists are empty. */ | ||
3812 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); | ||
3813 | out: | ||
3814 | css_put(&memcg->css); | ||
3815 | return ret; | ||
3816 | 4701 | ||
3817 | try_to_free: | 4702 | /* |
4703 | * Kernel memory may not necessarily be trackable to a specific | ||
4704 | * process. So they are not migrated, and therefore we can't | ||
4705 | * expect their value to drop to 0 here. | ||
4706 | * Having res filled up with kmem only is enough. | ||
4707 | * | ||
4708 | * This is a safety check because mem_cgroup_force_empty_list | ||
4709 | * could have raced with mem_cgroup_replace_page_cache callers | ||
4710 | * so the lru seemed empty but the page could have been added | ||
4711 | * right after the check. RES_USAGE should be safe as we always | ||
4712 | * charge before adding to the LRU. | ||
4713 | */ | ||
4714 | usage = res_counter_read_u64(&memcg->res, RES_USAGE) - | ||
4715 | res_counter_read_u64(&memcg->kmem, RES_USAGE); | ||
4716 | } while (usage > 0); | ||
4717 | } | ||
4718 | |||
4719 | /* | ||
4720 | * Reclaims as many pages from the given memcg as possible and moves | ||
4721 | * the rest to the parent. | ||
4722 | * | ||
4723 | * Caller is responsible for holding css reference for memcg. | ||
4724 | */ | ||
4725 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | ||
4726 | { | ||
4727 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
4728 | struct cgroup *cgrp = memcg->css.cgroup; | ||
4729 | |||
3818 | /* returns EBUSY if there is a task or if we come here twice. */ | 4730 | /* returns EBUSY if there is a task or if we come here twice. */ |
3819 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { | 4731 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3820 | ret = -EBUSY; | 4732 | return -EBUSY; |
3821 | goto out; | 4733 | |
3822 | } | ||
3823 | /* we call try-to-free pages for make this cgroup empty */ | 4734 | /* we call try-to-free pages for make this cgroup empty */ |
3824 | lru_add_drain_all(); | 4735 | lru_add_drain_all(); |
3825 | /* try to free all pages in this cgroup */ | 4736 | /* try to free all pages in this cgroup */ |
3826 | shrink = 1; | ||
3827 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { | 4737 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { |
3828 | int progress; | 4738 | int progress; |
3829 | 4739 | ||
3830 | if (signal_pending(current)) { | 4740 | if (signal_pending(current)) |
3831 | ret = -EINTR; | 4741 | return -EINTR; |
3832 | goto out; | 4742 | |
3833 | } | ||
3834 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, | 4743 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, |
3835 | false); | 4744 | false); |
3836 | if (!progress) { | 4745 | if (!progress) { |
@@ -3841,13 +4750,23 @@ try_to_free: | |||
3841 | 4750 | ||
3842 | } | 4751 | } |
3843 | lru_add_drain(); | 4752 | lru_add_drain(); |
3844 | /* try move_account...there may be some *locked* pages. */ | 4753 | mem_cgroup_reparent_charges(memcg); |
3845 | goto move_account; | 4754 | |
4755 | return 0; | ||
3846 | } | 4756 | } |
3847 | 4757 | ||
3848 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 4758 | static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
3849 | { | 4759 | { |
3850 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); | 4760 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4761 | int ret; | ||
4762 | |||
4763 | if (mem_cgroup_is_root(memcg)) | ||
4764 | return -EINVAL; | ||
4765 | css_get(&memcg->css); | ||
4766 | ret = mem_cgroup_force_empty(memcg); | ||
4767 | css_put(&memcg->css); | ||
4768 | |||
4769 | return ret; | ||
3851 | } | 4770 | } |
3852 | 4771 | ||
3853 | 4772 | ||
@@ -3938,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3938 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4857 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3939 | char str[64]; | 4858 | char str[64]; |
3940 | u64 val; | 4859 | u64 val; |
3941 | int type, name, len; | 4860 | int name, len; |
4861 | enum res_type type; | ||
3942 | 4862 | ||
3943 | type = MEMFILE_TYPE(cft->private); | 4863 | type = MEMFILE_TYPE(cft->private); |
3944 | name = MEMFILE_ATTR(cft->private); | 4864 | name = MEMFILE_ATTR(cft->private); |
@@ -3959,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3959 | else | 4879 | else |
3960 | val = res_counter_read_u64(&memcg->memsw, name); | 4880 | val = res_counter_read_u64(&memcg->memsw, name); |
3961 | break; | 4881 | break; |
4882 | case _KMEM: | ||
4883 | val = res_counter_read_u64(&memcg->kmem, name); | ||
4884 | break; | ||
3962 | default: | 4885 | default: |
3963 | BUG(); | 4886 | BUG(); |
3964 | } | 4887 | } |
@@ -3966,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3966 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | 4889 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); |
3967 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | 4890 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); |
3968 | } | 4891 | } |
4892 | |||
4893 | static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | ||
4894 | { | ||
4895 | int ret = -EINVAL; | ||
4896 | #ifdef CONFIG_MEMCG_KMEM | ||
4897 | bool must_inc_static_branch = false; | ||
4898 | |||
4899 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
4900 | /* | ||
4901 | * For simplicity, we won't allow this to be disabled. It also can't | ||
4902 | * be changed if the cgroup has children already, or if tasks had | ||
4903 | * already joined. | ||
4904 | * | ||
4905 | * If tasks join before we set the limit, a person looking at | ||
4906 | * kmem.usage_in_bytes will have no way to determine when it took | ||
4907 | * place, which makes the value quite meaningless. | ||
4908 | * | ||
4909 | * After it first became limited, changes in the value of the limit are | ||
4910 | * of course permitted. | ||
4911 | * | ||
4912 | * Taking the cgroup_lock is really offensive, but it is so far the only | ||
4913 | * way to guarantee that no children will appear. There are plenty of | ||
4914 | * other offenders, and they should all go away. Fine grained locking | ||
4915 | * is probably the way to go here. When we are fully hierarchical, we | ||
4916 | * can also get rid of the use_hierarchy check. | ||
4917 | */ | ||
4918 | cgroup_lock(); | ||
4919 | mutex_lock(&set_limit_mutex); | ||
4920 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | ||
4921 | if (cgroup_task_count(cont) || (memcg->use_hierarchy && | ||
4922 | !list_empty(&cont->children))) { | ||
4923 | ret = -EBUSY; | ||
4924 | goto out; | ||
4925 | } | ||
4926 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
4927 | VM_BUG_ON(ret); | ||
4928 | |||
4929 | ret = memcg_update_cache_sizes(memcg); | ||
4930 | if (ret) { | ||
4931 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | ||
4932 | goto out; | ||
4933 | } | ||
4934 | must_inc_static_branch = true; | ||
4935 | /* | ||
4936 | * kmem charges can outlive the cgroup. In the case of slab | ||
4937 | * pages, for instance, a page can contain objects from various | ||
4938 | * processes, so it is unfeasible to migrate them away. We | ||
4939 | * need to reference count the memcg because of that. | ||
4940 | */ | ||
4941 | mem_cgroup_get(memcg); | ||
4942 | } else | ||
4943 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
4944 | out: | ||
4945 | mutex_unlock(&set_limit_mutex); | ||
4946 | cgroup_unlock(); | ||
4947 | |||
4948 | /* | ||
4949 | * We are by now familiar with the fact that we can't inc the static | ||
4950 | * branch inside cgroup_lock. See disarm functions for details. A | ||
4951 | * worker here is overkill, but also wrong: After the limit is set, we | ||
4952 | * must start accounting right away. Since this operation can't fail, | ||
4953 | * we can safely defer it to here - no rollback will be needed. | ||
4954 | * | ||
4955 | * The boolean used to control this is also safe, because | ||
4956 | * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be | ||
4957 | * able to set it to true; | ||
4958 | */ | ||
4959 | if (must_inc_static_branch) { | ||
4960 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
4961 | /* | ||
4962 | * setting the active bit after the inc will guarantee no one | ||
4963 | * starts accounting before all call sites are patched | ||
4964 | */ | ||
4965 | memcg_kmem_set_active(memcg); | ||
4966 | } | ||
4967 | |||
4968 | #endif | ||
4969 | return ret; | ||
4970 | } | ||
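memcg_update_kmem_limit() turns kmem accounting on only the first time a limit other than RESOURCE_MAX is written, and it returns -EBUSY if the group already has tasks (or, with use_hierarchy, children). The required ordering is therefore: create the group, set memory.kmem.limit_in_bytes while it is still empty, then attach tasks. A hedged userspace sketch follows; the paths are assumptions about a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory.

/* Sketch of the create -> set kmem limit -> attach ordering that the
 * activation check above enforces. Paths are assumptions about the
 * local setup. */
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        char buf[32];

        mkdir("/sys/fs/cgroup/memory/kmemtest", 0755);

        /* must happen before any task joins, or the write fails with EBUSY */
        if (write_file("/sys/fs/cgroup/memory/kmemtest/memory.kmem.limit_in_bytes",
                       "67108864"))
                perror("set kmem limit");

        /* only now move the current task into the group */
        snprintf(buf, sizeof(buf), "%d", getpid());
        if (write_file("/sys/fs/cgroup/memory/kmemtest/cgroup.procs", buf))
                perror("attach task");
        return 0;
}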
4971 | |||
4972 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | ||
4973 | { | ||
4974 | int ret = 0; | ||
4975 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | ||
4976 | if (!parent) | ||
4977 | goto out; | ||
4978 | |||
4979 | memcg->kmem_account_flags = parent->kmem_account_flags; | ||
4980 | #ifdef CONFIG_MEMCG_KMEM | ||
4981 | /* | ||
4982 | * When that happens, we need to disable the static branch only on those | ||
4983 | * memcgs that enabled it. To achieve this, we would be forced to | ||
4984 | * complicate the code by keeping track of which memcgs were the ones | ||
4985 | * that actually enabled limits, and which ones got it from their | ||
4986 | * parents. | ||
4987 | * | ||
4988 | * It is a lot simpler just to do static_key_slow_inc() on every child | ||
4989 | * that is accounted. | ||
4990 | */ | ||
4991 | if (!memcg_kmem_is_active(memcg)) | ||
4992 | goto out; | ||
4993 | |||
4994 | /* | ||
4995 | * destroy(), called if we fail, will issue static_key_slow_inc() and | ||
4996 | * mem_cgroup_put() if kmem is enabled. We have to either call them | ||
4997 | * unconditionally, or clear the KMEM_ACTIVE flag. I personally find | ||
4998 | * this more consistent, since it always leads to the same destroy path | ||
4999 | */ | ||
5000 | mem_cgroup_get(memcg); | ||
5001 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
5002 | |||
5003 | mutex_lock(&set_limit_mutex); | ||
5004 | ret = memcg_update_cache_sizes(memcg); | ||
5005 | mutex_unlock(&set_limit_mutex); | ||
5006 | #endif | ||
5007 | out: | ||
5008 | return ret; | ||
5009 | } | ||
5010 | |||
3969 | /* | 5011 | /* |
3970 | * The user of this function is... | 5012 | * The user of this function is... |
3971 | * RES_LIMIT. | 5013 | * RES_LIMIT. |
@@ -3974,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
3974 | const char *buffer) | 5016 | const char *buffer) |
3975 | { | 5017 | { |
3976 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5018 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3977 | int type, name; | 5019 | enum res_type type; |
5020 | int name; | ||
3978 | unsigned long long val; | 5021 | unsigned long long val; |
3979 | int ret; | 5022 | int ret; |
3980 | 5023 | ||
@@ -3996,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
3996 | break; | 5039 | break; |
3997 | if (type == _MEM) | 5040 | if (type == _MEM) |
3998 | ret = mem_cgroup_resize_limit(memcg, val); | 5041 | ret = mem_cgroup_resize_limit(memcg, val); |
3999 | else | 5042 | else if (type == _MEMSWAP) |
4000 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 5043 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
5044 | else if (type == _KMEM) | ||
5045 | ret = memcg_update_kmem_limit(cont, val); | ||
5046 | else | ||
5047 | return -EINVAL; | ||
4001 | break; | 5048 | break; |
4002 | case RES_SOFT_LIMIT: | 5049 | case RES_SOFT_LIMIT: |
4003 | ret = res_counter_memparse_write_strategy(buffer, &val); | 5050 | ret = res_counter_memparse_write_strategy(buffer, &val); |
@@ -4050,7 +5097,8 @@ out: | |||
4050 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 5097 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
4051 | { | 5098 | { |
4052 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5099 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4053 | int type, name; | 5100 | int name; |
5101 | enum res_type type; | ||
4054 | 5102 | ||
4055 | type = MEMFILE_TYPE(event); | 5103 | type = MEMFILE_TYPE(event); |
4056 | name = MEMFILE_ATTR(event); | 5104 | name = MEMFILE_ATTR(event); |
@@ -4062,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
4062 | case RES_MAX_USAGE: | 5110 | case RES_MAX_USAGE: |
4063 | if (type == _MEM) | 5111 | if (type == _MEM) |
4064 | res_counter_reset_max(&memcg->res); | 5112 | res_counter_reset_max(&memcg->res); |
4065 | else | 5113 | else if (type == _MEMSWAP) |
4066 | res_counter_reset_max(&memcg->memsw); | 5114 | res_counter_reset_max(&memcg->memsw); |
5115 | else if (type == _KMEM) | ||
5116 | res_counter_reset_max(&memcg->kmem); | ||
5117 | else | ||
5118 | return -EINVAL; | ||
4067 | break; | 5119 | break; |
4068 | case RES_FAILCNT: | 5120 | case RES_FAILCNT: |
4069 | if (type == _MEM) | 5121 | if (type == _MEM) |
4070 | res_counter_reset_failcnt(&memcg->res); | 5122 | res_counter_reset_failcnt(&memcg->res); |
4071 | else | 5123 | else if (type == _MEMSWAP) |
4072 | res_counter_reset_failcnt(&memcg->memsw); | 5124 | res_counter_reset_failcnt(&memcg->memsw); |
5125 | else if (type == _KMEM) | ||
5126 | res_counter_reset_failcnt(&memcg->kmem); | ||
5127 | else | ||
5128 | return -EINVAL; | ||
4073 | break; | 5129 | break; |
4074 | } | 5130 | } |
4075 | 5131 | ||
@@ -4120,7 +5176,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4120 | 5176 | ||
4121 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 5177 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
4122 | seq_printf(m, "total=%lu", total_nr); | 5178 | seq_printf(m, "total=%lu", total_nr); |
4123 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5179 | for_each_node_state(nid, N_MEMORY) { |
4124 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); | 5180 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); |
4125 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5181 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4126 | } | 5182 | } |
@@ -4128,7 +5184,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4128 | 5184 | ||
4129 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); | 5185 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); |
4130 | seq_printf(m, "file=%lu", file_nr); | 5186 | seq_printf(m, "file=%lu", file_nr); |
4131 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5187 | for_each_node_state(nid, N_MEMORY) { |
4132 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5188 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4133 | LRU_ALL_FILE); | 5189 | LRU_ALL_FILE); |
4134 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5190 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4137,7 +5193,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4137 | 5193 | ||
4138 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); | 5194 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); |
4139 | seq_printf(m, "anon=%lu", anon_nr); | 5195 | seq_printf(m, "anon=%lu", anon_nr); |
4140 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5196 | for_each_node_state(nid, N_MEMORY) { |
4141 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5197 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4142 | LRU_ALL_ANON); | 5198 | LRU_ALL_ANON); |
4143 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5199 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4146,7 +5202,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4146 | 5202 | ||
4147 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | 5203 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4148 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 5204 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4149 | for_each_node_state(nid, N_HIGH_MEMORY) { | 5205 | for_each_node_state(nid, N_MEMORY) { |
4150 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 5206 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4151 | BIT(LRU_UNEVICTABLE)); | 5207 | BIT(LRU_UNEVICTABLE)); |
4152 | seq_printf(m, " N%d=%lu", nid, node_nr); | 5208 | seq_printf(m, " N%d=%lu", nid, node_nr); |
@@ -4386,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | |||
4386 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5442 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4387 | struct mem_cgroup_thresholds *thresholds; | 5443 | struct mem_cgroup_thresholds *thresholds; |
4388 | struct mem_cgroup_threshold_ary *new; | 5444 | struct mem_cgroup_threshold_ary *new; |
4389 | int type = MEMFILE_TYPE(cft->private); | 5445 | enum res_type type = MEMFILE_TYPE(cft->private); |
4390 | u64 threshold, usage; | 5446 | u64 threshold, usage; |
4391 | int i, size, ret; | 5447 | int i, size, ret; |
4392 | 5448 | ||
@@ -4469,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4469 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5525 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4470 | struct mem_cgroup_thresholds *thresholds; | 5526 | struct mem_cgroup_thresholds *thresholds; |
4471 | struct mem_cgroup_threshold_ary *new; | 5527 | struct mem_cgroup_threshold_ary *new; |
4472 | int type = MEMFILE_TYPE(cft->private); | 5528 | enum res_type type = MEMFILE_TYPE(cft->private); |
4473 | u64 usage; | 5529 | u64 usage; |
4474 | int i, j, size; | 5530 | int i, j, size; |
4475 | 5531 | ||
@@ -4547,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4547 | { | 5603 | { |
4548 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5604 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4549 | struct mem_cgroup_eventfd_list *event; | 5605 | struct mem_cgroup_eventfd_list *event; |
4550 | int type = MEMFILE_TYPE(cft->private); | 5606 | enum res_type type = MEMFILE_TYPE(cft->private); |
4551 | 5607 | ||
4552 | BUG_ON(type != _OOM_TYPE); | 5608 | BUG_ON(type != _OOM_TYPE); |
4553 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 5609 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
@@ -4572,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4572 | { | 5628 | { |
4573 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5629 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4574 | struct mem_cgroup_eventfd_list *ev, *tmp; | 5630 | struct mem_cgroup_eventfd_list *ev, *tmp; |
4575 | int type = MEMFILE_TYPE(cft->private); | 5631 | enum res_type type = MEMFILE_TYPE(cft->private); |
4576 | 5632 | ||
4577 | BUG_ON(type != _OOM_TYPE); | 5633 | BUG_ON(type != _OOM_TYPE); |
4578 | 5634 | ||
@@ -4631,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4631 | #ifdef CONFIG_MEMCG_KMEM | 5687 | #ifdef CONFIG_MEMCG_KMEM |
4632 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 5688 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4633 | { | 5689 | { |
5690 | int ret; | ||
5691 | |||
5692 | memcg->kmemcg_id = -1; | ||
5693 | ret = memcg_propagate_kmem(memcg); | ||
5694 | if (ret) | ||
5695 | return ret; | ||
5696 | |||
4634 | return mem_cgroup_sockets_init(memcg, ss); | 5697 | return mem_cgroup_sockets_init(memcg, ss); |
4635 | }; | 5698 | }; |
4636 | 5699 | ||
4637 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) | 5700 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
4638 | { | 5701 | { |
4639 | mem_cgroup_sockets_destroy(memcg); | 5702 | mem_cgroup_sockets_destroy(memcg); |
5703 | |||
5704 | memcg_kmem_mark_dead(memcg); | ||
5705 | |||
5706 | if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) | ||
5707 | return; | ||
5708 | |||
5709 | /* | ||
5710 | * Charges already down to 0, undo mem_cgroup_get() done in the charge | ||
5711 | * path here, being careful not to race with memcg_uncharge_kmem: it is | ||
5712 | * possible that the charges went down to 0 between mark_dead and the | ||
5713 | * res_counter read, so in that case, we don't need the put | ||
5714 | */ | ||
5715 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
5716 | mem_cgroup_put(memcg); | ||
4640 | } | 5717 | } |
4641 | #else | 5718 | #else |
4642 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 5719 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
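kmem_cgroup_destroy() above marks the memcg dead and performs the final put only if all kmem charges are already gone; otherwise the last memcg_uncharge_kmem() does it when usage reaches zero, and the test-and-clear guarantees that exactly one of the two paths drops the reference. Below is a single-threaded userspace model of that hand-off; the real code relies on atomic flag operations to close the race, and all names here are illustrative.

/* Sketch: mark-dead / test-and-clear-dead hand-off between the destroy
 * path and the last uncharge. Single-threaded model only. */
#include <stdio.h>
#include <stdbool.h>

struct group {
        long usage;
        long refcnt;
        bool dead;
};

static void put(struct group *g)
{
        if (--g->refcnt == 0)
                printf("group freed\n");
}

static bool test_and_clear_dead(struct group *g)
{
        bool was_dead = g->dead;

        g->dead = false;
        return was_dead;
}

static void uncharge(struct group *g, long bytes)
{
        g->usage -= bytes;
        if (g->usage == 0 && test_and_clear_dead(g))
                put(g);                 /* last charge gone after destroy */
}

static void destroy(struct group *g)
{
        g->dead = true;
        if (g->usage == 0 && test_and_clear_dead(g))
                put(g);                 /* nothing outstanding: put now */
}

int main(void)
{
        struct group g = { .usage = 4096, .refcnt = 1 };

        destroy(&g);            /* charges still outstanding, no put yet */
        uncharge(&g, 4096);     /* last uncharge drops the reference */
        return 0;
}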
@@ -4745,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = { | |||
4745 | .read = mem_cgroup_read, | 5822 | .read = mem_cgroup_read, |
4746 | }, | 5823 | }, |
4747 | #endif | 5824 | #endif |
5825 | #ifdef CONFIG_MEMCG_KMEM | ||
5826 | { | ||
5827 | .name = "kmem.limit_in_bytes", | ||
5828 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | ||
5829 | .write_string = mem_cgroup_write, | ||
5830 | .read = mem_cgroup_read, | ||
5831 | }, | ||
5832 | { | ||
5833 | .name = "kmem.usage_in_bytes", | ||
5834 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | ||
5835 | .read = mem_cgroup_read, | ||
5836 | }, | ||
5837 | { | ||
5838 | .name = "kmem.failcnt", | ||
5839 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | ||
5840 | .trigger = mem_cgroup_reset, | ||
5841 | .read = mem_cgroup_read, | ||
5842 | }, | ||
5843 | { | ||
5844 | .name = "kmem.max_usage_in_bytes", | ||
5845 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | ||
5846 | .trigger = mem_cgroup_reset, | ||
5847 | .read = mem_cgroup_read, | ||
5848 | }, | ||
5849 | #ifdef CONFIG_SLABINFO | ||
5850 | { | ||
5851 | .name = "kmem.slabinfo", | ||
5852 | .read_seq_string = mem_cgroup_slabinfo_read, | ||
5853 | }, | ||
5854 | #endif | ||
5855 | #endif | ||
4748 | { }, /* terminate */ | 5856 | { }, /* terminate */ |
4749 | }; | 5857 | }; |
4750 | 5858 | ||
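The new kmem.* entries map directly onto the memcg->kmem res_counter: kmem.limit_in_bytes is writable through mem_cgroup_write(), all four files are readable through mem_cgroup_read(), and kmem.failcnt / kmem.max_usage_in_bytes reset when written, via mem_cgroup_reset(). A small read-and-reset sketch, with the cgroup path again an assumption about the local setup:

/* Sketch: read the kmem usage counter and reset the high-water mark.
 * The path assumes a cgroup v1 memory hierarchy at /sys/fs/cgroup/memory. */
#include <stdio.h>

int main(void)
{
        const char *base = "/sys/fs/cgroup/memory/kmemtest";
        char path[256], buf[64];
        FILE *f;

        snprintf(path, sizeof(path), "%s/memory.kmem.usage_in_bytes", base);
        f = fopen(path, "r");
        if (f) {
                if (fgets(buf, sizeof(buf), f))
                        printf("kmem usage: %s", buf);
                fclose(f);
        }

        snprintf(path, sizeof(path), "%s/memory.kmem.max_usage_in_bytes", base);
        f = fopen(path, "w");
        if (f) {
                fputs("0\n", f);        /* any write resets the maximum */
                fclose(f);
        }
        return 0;
}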
@@ -4812,16 +5920,29 @@ out_free: | |||
4812 | } | 5920 | } |
4813 | 5921 | ||
4814 | /* | 5922 | /* |
4815 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, | 5923 | * At destroying mem_cgroup, references from swap_cgroup can remain. |
4816 | * but in process context. The work_freeing structure is overlaid | 5924 | * (scanning all at force_empty is too costly...) |
4817 | * on the rcu_freeing structure, which itself is overlaid on memsw. | 5925 | * |
5926 | * Instead of clearing all references at force_empty, we remember | ||
5927 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
5928 | * it goes down to 0. | ||
5929 | * | ||
5930 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
4818 | */ | 5931 | */ |
4819 | static void free_work(struct work_struct *work) | 5932 | |
5933 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | ||
4820 | { | 5934 | { |
4821 | struct mem_cgroup *memcg; | 5935 | int node; |
4822 | int size = sizeof(struct mem_cgroup); | 5936 | int size = sizeof(struct mem_cgroup); |
4823 | 5937 | ||
4824 | memcg = container_of(work, struct mem_cgroup, work_freeing); | 5938 | mem_cgroup_remove_from_trees(memcg); |
5939 | free_css_id(&mem_cgroup_subsys, &memcg->css); | ||
5940 | |||
5941 | for_each_node(node) | ||
5942 | free_mem_cgroup_per_zone_info(memcg, node); | ||
5943 | |||
5944 | free_percpu(memcg->stat); | ||
5945 | |||
4825 | /* | 5946 | /* |
4826 | * We need to make sure that (at least for now), the jump label | 5947 | * We need to make sure that (at least for now), the jump label |
4827 | * destruction code runs outside of the cgroup lock. This is because | 5948 | * destruction code runs outside of the cgroup lock. This is because |
@@ -4833,45 +5954,34 @@ static void free_work(struct work_struct *work) | |||
4833 | * to move this code around, and make sure it is outside | 5954 | * to move this code around, and make sure it is outside |
4834 | * the cgroup_lock. | 5955 | * the cgroup_lock. |
4835 | */ | 5956 | */ |
4836 | disarm_sock_keys(memcg); | 5957 | disarm_static_keys(memcg); |
4837 | if (size < PAGE_SIZE) | 5958 | if (size < PAGE_SIZE) |
4838 | kfree(memcg); | 5959 | kfree(memcg); |
4839 | else | 5960 | else |
4840 | vfree(memcg); | 5961 | vfree(memcg); |
4841 | } | 5962 | } |
4842 | 5963 | ||
4843 | static void free_rcu(struct rcu_head *rcu_head) | ||
4844 | { | ||
4845 | struct mem_cgroup *memcg; | ||
4846 | |||
4847 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); | ||
4848 | INIT_WORK(&memcg->work_freeing, free_work); | ||
4849 | schedule_work(&memcg->work_freeing); | ||
4850 | } | ||
4851 | 5964 | ||
4852 | /* | 5965 | /* |
4853 | * At destroying mem_cgroup, references from swap_cgroup can remain. | 5966 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, |
4854 | * (scanning all at force_empty is too costly...) | 5967 | * but in process context. The work_freeing structure is overlaid |
4855 | * | 5968 | * on the rcu_freeing structure, which itself is overlaid on memsw. |
4856 | * Instead of clearing all references at force_empty, we remember | ||
4857 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
4858 | * it goes down to 0. | ||
4859 | * | ||
4860 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
4861 | */ | 5969 | */ |
4862 | 5970 | static void free_work(struct work_struct *work) | |
4863 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | ||
4864 | { | 5971 | { |
4865 | int node; | 5972 | struct mem_cgroup *memcg; |
4866 | 5973 | ||
4867 | mem_cgroup_remove_from_trees(memcg); | 5974 | memcg = container_of(work, struct mem_cgroup, work_freeing); |
4868 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5975 | __mem_cgroup_free(memcg); |
5976 | } | ||
4869 | 5977 | ||
4870 | for_each_node(node) | 5978 | static void free_rcu(struct rcu_head *rcu_head) |
4871 | free_mem_cgroup_per_zone_info(memcg, node); | 5979 | { |
5980 | struct mem_cgroup *memcg; | ||
4872 | 5981 | ||
4873 | free_percpu(memcg->stat); | 5982 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); |
4874 | call_rcu(&memcg->rcu_freeing, free_rcu); | 5983 | INIT_WORK(&memcg->work_freeing, free_work); |
5984 | schedule_work(&memcg->work_freeing); | ||
4875 | } | 5985 | } |
4876 | 5986 | ||
4877 | static void mem_cgroup_get(struct mem_cgroup *memcg) | 5987 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
@@ -4883,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) | |||
4883 | { | 5993 | { |
4884 | if (atomic_sub_and_test(count, &memcg->refcnt)) { | 5994 | if (atomic_sub_and_test(count, &memcg->refcnt)) { |
4885 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | 5995 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
4886 | __mem_cgroup_free(memcg); | 5996 | call_rcu(&memcg->rcu_freeing, free_rcu); |
4887 | if (parent) | 5997 | if (parent) |
4888 | mem_cgroup_put(parent); | 5998 | mem_cgroup_put(parent); |
4889 | } | 5999 | } |
@@ -4953,7 +6063,7 @@ err_cleanup: | |||
4953 | } | 6063 | } |
4954 | 6064 | ||
4955 | static struct cgroup_subsys_state * __ref | 6065 | static struct cgroup_subsys_state * __ref |
4956 | mem_cgroup_create(struct cgroup *cont) | 6066 | mem_cgroup_css_alloc(struct cgroup *cont) |
4957 | { | 6067 | { |
4958 | struct mem_cgroup *memcg, *parent; | 6068 | struct mem_cgroup *memcg, *parent; |
4959 | long error = -ENOMEM; | 6069 | long error = -ENOMEM; |
@@ -4990,6 +6100,8 @@ mem_cgroup_create(struct cgroup *cont) | |||
4990 | if (parent && parent->use_hierarchy) { | 6100 | if (parent && parent->use_hierarchy) { |
4991 | res_counter_init(&memcg->res, &parent->res); | 6101 | res_counter_init(&memcg->res, &parent->res); |
4992 | res_counter_init(&memcg->memsw, &parent->memsw); | 6102 | res_counter_init(&memcg->memsw, &parent->memsw); |
6103 | res_counter_init(&memcg->kmem, &parent->kmem); | ||
6104 | |||
4993 | /* | 6105 | /* |
4994 | * We increment refcnt of the parent to ensure that we can | 6106 | * We increment refcnt of the parent to ensure that we can |
4995 | * safely access it on res_counter_charge/uncharge. | 6107 | * safely access it on res_counter_charge/uncharge. |
@@ -5000,6 +6112,7 @@ mem_cgroup_create(struct cgroup *cont) | |||
5000 | } else { | 6112 | } else { |
5001 | res_counter_init(&memcg->res, NULL); | 6113 | res_counter_init(&memcg->res, NULL); |
5002 | res_counter_init(&memcg->memsw, NULL); | 6114 | res_counter_init(&memcg->memsw, NULL); |
6115 | res_counter_init(&memcg->kmem, NULL); | ||
5003 | /* | 6116 | /* |
5004 | * Deeper hierarchy with use_hierarchy == false doesn't make | 6117 | * Deeper hierarchy with use_hierarchy == false doesn't make
5005 | * much sense so let cgroup subsystem know about this | 6118 | * much sense so let cgroup subsystem know about this |
@@ -5034,14 +6147,15 @@ free_out: | |||
5034 | return ERR_PTR(error); | 6147 | return ERR_PTR(error); |
5035 | } | 6148 | } |
5036 | 6149 | ||
5037 | static int mem_cgroup_pre_destroy(struct cgroup *cont) | 6150 | static void mem_cgroup_css_offline(struct cgroup *cont) |
5038 | { | 6151 | { |
5039 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6152 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5040 | 6153 | ||
5041 | return mem_cgroup_force_empty(memcg, false); | 6154 | mem_cgroup_reparent_charges(memcg); |
6155 | mem_cgroup_destroy_all_caches(memcg); | ||
5042 | } | 6156 | } |
5043 | 6157 | ||
5044 | static void mem_cgroup_destroy(struct cgroup *cont) | 6158 | static void mem_cgroup_css_free(struct cgroup *cont) |
5045 | { | 6159 | { |
5046 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6160 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5047 | 6161 | ||
@@ -5631,16 +6745,15 @@ static void mem_cgroup_move_task(struct cgroup *cont, | |||
5631 | struct cgroup_subsys mem_cgroup_subsys = { | 6745 | struct cgroup_subsys mem_cgroup_subsys = { |
5632 | .name = "memory", | 6746 | .name = "memory", |
5633 | .subsys_id = mem_cgroup_subsys_id, | 6747 | .subsys_id = mem_cgroup_subsys_id, |
5634 | .create = mem_cgroup_create, | 6748 | .css_alloc = mem_cgroup_css_alloc, |
5635 | .pre_destroy = mem_cgroup_pre_destroy, | 6749 | .css_offline = mem_cgroup_css_offline, |
5636 | .destroy = mem_cgroup_destroy, | 6750 | .css_free = mem_cgroup_css_free, |
5637 | .can_attach = mem_cgroup_can_attach, | 6751 | .can_attach = mem_cgroup_can_attach, |
5638 | .cancel_attach = mem_cgroup_cancel_attach, | 6752 | .cancel_attach = mem_cgroup_cancel_attach, |
5639 | .attach = mem_cgroup_move_task, | 6753 | .attach = mem_cgroup_move_task, |
5640 | .base_cftypes = mem_cgroup_files, | 6754 | .base_cftypes = mem_cgroup_files, |
5641 | .early_init = 0, | 6755 | .early_init = 0, |
5642 | .use_id = 1, | 6756 | .use_id = 1, |
5643 | .__DEPRECATED_clear_css_refs = true, | ||
5644 | }; | 6757 | }; |
5645 | 6758 | ||
5646 | #ifdef CONFIG_MEMCG_SWAP | 6759 | #ifdef CONFIG_MEMCG_SWAP |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8b20278be6a6..c6e4dd3e1c08 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | 403 | pgoff_t pgoff; |
404 | 404 | ||
405 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma_read(page); |
406 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
407 | return; | 407 | return; |
408 | 408 | ||
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | read_unlock(&tasklist_lock); | 425 | read_unlock(&tasklist_lock); |
426 | page_unlock_anon_vma(av); | 426 | page_unlock_anon_vma_read(av); |
427 | } | 427 | } |
428 | 428 | ||
429 | /* | 429 | /* |
@@ -781,16 +781,16 @@ static struct page_state { | |||
781 | { compound, compound, "huge", me_huge_page }, | 781 | { compound, compound, "huge", me_huge_page }, |
782 | #endif | 782 | #endif |
783 | 783 | ||
784 | { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, | 784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, |
785 | { sc|dirty, sc, "swapcache", me_swapcache_clean }, | 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
786 | 786 | ||
787 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | 787 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, |
788 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | 788 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, |
789 | 789 | ||
790 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | 790 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
791 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | 791 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, |
792 | 792 | ||
793 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
795 | 795 | ||
796 | /* | 796 | /* |
@@ -812,14 +812,14 @@ static struct page_state { | |||
812 | #undef slab | 812 | #undef slab |
813 | #undef reserved | 813 | #undef reserved |
814 | 814 | ||
815 | /* | ||
816 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | ||
817 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | ||
818 | */ | ||
815 | static void action_result(unsigned long pfn, char *msg, int result) | 819 | static void action_result(unsigned long pfn, char *msg, int result) |
816 | { | 820 | { |
817 | struct page *page = pfn_to_page(pfn); | 821 | pr_err("MCE %#lx: %s page recovery: %s\n", |
818 | 822 | pfn, msg, action_name[result]); | |
819 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | ||
820 | pfn, | ||
821 | PageDirty(page) ? "dirty " : "", | ||
822 | msg, action_name[result]); | ||
823 | } | 823 | } |
824 | 824 | ||
825 | static int page_action(struct page_state *ps, struct page *p, | 825 | static int page_action(struct page_state *ps, struct page *p, |
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1385 | * Isolate the page, so that it doesn't get reallocated if it | 1385 | * Isolate the page, so that it doesn't get reallocated if it |
1386 | * was free. | 1386 | * was free. |
1387 | */ | 1387 | */ |
1388 | set_migratetype_isolate(p); | 1388 | set_migratetype_isolate(p, true); |
1389 | /* | 1389 | /* |
1390 | * When the target page is a free hugepage, just remove it | 1390 | * When the target page is a free hugepage, just remove it |
1391 | * from free hugepage list. | 1391 | * from free hugepage list. |
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1566 | page_is_file_cache(page)); | 1566 | page_is_file_cache(page)); |
1567 | list_add(&page->lru, &pagelist); | 1567 | list_add(&page->lru, &pagelist); |
1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1569 | false, MIGRATE_SYNC); | 1569 | false, MIGRATE_SYNC, |
1570 | MR_MEMORY_FAILURE); | ||
1570 | if (ret) { | 1571 | if (ret) { |
1571 | putback_lru_pages(&pagelist); | 1572 | putback_lru_pages(&pagelist); |
1572 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
diff --git a/mm/memory.c b/mm/memory.c index 221fc9ffcab1..e0a9b0ce4f10 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -57,6 +57,8 @@ | |||
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | ||
61 | #include <linux/string.h> | ||
60 | 62 | ||
61 | #include <asm/io.h> | 63 | #include <asm/io.h> |
62 | #include <asm/pgalloc.h> | 64 | #include <asm/pgalloc.h> |
@@ -717,20 +719,6 @@ static inline bool is_cow_mapping(vm_flags_t flags) | |||
717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 719 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
718 | } | 720 | } |
719 | 721 | ||
720 | #ifndef is_zero_pfn | ||
721 | static inline int is_zero_pfn(unsigned long pfn) | ||
722 | { | ||
723 | return pfn == zero_pfn; | ||
724 | } | ||
725 | #endif | ||
726 | |||
727 | #ifndef my_zero_pfn | ||
728 | static inline unsigned long my_zero_pfn(unsigned long addr) | ||
729 | { | ||
730 | return zero_pfn; | ||
731 | } | ||
732 | #endif | ||
733 | |||
734 | /* | 722 | /* |
735 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 723 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
736 | * | 724 | * |
@@ -1250,7 +1238,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1250 | BUG(); | 1238 | BUG(); |
1251 | } | 1239 | } |
1252 | #endif | 1240 | #endif |
1253 | split_huge_page_pmd(vma->vm_mm, pmd); | 1241 | split_huge_page_pmd(vma, addr, pmd); |
1254 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1242 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1255 | goto next; | 1243 | goto next; |
1256 | /* fall through */ | 1244 | /* fall through */ |
@@ -1517,9 +1505,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1517 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1505 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1518 | goto out; | 1506 | goto out; |
1519 | } | 1507 | } |
1508 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1509 | goto no_page_table; | ||
1520 | if (pmd_trans_huge(*pmd)) { | 1510 | if (pmd_trans_huge(*pmd)) { |
1521 | if (flags & FOLL_SPLIT) { | 1511 | if (flags & FOLL_SPLIT) { |
1522 | split_huge_page_pmd(mm, pmd); | 1512 | split_huge_page_pmd(vma, address, pmd); |
1523 | goto split_fallthrough; | 1513 | goto split_fallthrough; |
1524 | } | 1514 | } |
1525 | spin_lock(&mm->page_table_lock); | 1515 | spin_lock(&mm->page_table_lock); |
@@ -1546,6 +1536,8 @@ split_fallthrough: | |||
1546 | pte = *ptep; | 1536 | pte = *ptep; |
1547 | if (!pte_present(pte)) | 1537 | if (!pte_present(pte)) |
1548 | goto no_page; | 1538 | goto no_page; |
1539 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1540 | goto no_page; | ||
1549 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1541 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
1550 | goto unlock; | 1542 | goto unlock; |
1551 | 1543 | ||
@@ -1697,6 +1689,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1697 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 1689 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
1698 | vm_flags &= (gup_flags & FOLL_FORCE) ? | 1690 | vm_flags &= (gup_flags & FOLL_FORCE) ? |
1699 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 1691 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
1692 | |||
1693 | /* | ||
1694 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1695 | * would be called on PROT_NONE ranges. We must never invoke | ||
1696 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1697 | * page faults would unprotect the PROT_NONE ranges if | ||
1698 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1699 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1700 | * FOLL_FORCE is set. | ||
1701 | */ | ||
1702 | if (!(gup_flags & FOLL_FORCE)) | ||
1703 | gup_flags |= FOLL_NUMA; | ||
1704 | |||
1700 | i = 0; | 1705 | i = 0; |
1701 | 1706 | ||
1702 | do { | 1707 | do { |
@@ -2794,13 +2799,8 @@ unlock: | |||
2794 | oom_free_new: | 2799 | oom_free_new: |
2795 | page_cache_release(new_page); | 2800 | page_cache_release(new_page); |
2796 | oom: | 2801 | oom: |
2797 | if (old_page) { | 2802 | if (old_page) |
2798 | if (page_mkwrite) { | ||
2799 | unlock_page(old_page); | ||
2800 | page_cache_release(old_page); | ||
2801 | } | ||
2802 | page_cache_release(old_page); | 2803 | page_cache_release(old_page); |
2803 | } | ||
2804 | return VM_FAULT_OOM; | 2804 | return VM_FAULT_OOM; |
2805 | 2805 | ||
2806 | unwritable_page: | 2806 | unwritable_page: |
@@ -3431,6 +3431,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3431 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3431 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3432 | } | 3432 | } |
3433 | 3433 | ||
3434 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | ||
3435 | unsigned long addr, int current_nid) | ||
3436 | { | ||
3437 | get_page(page); | ||
3438 | |||
3439 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
3440 | if (current_nid == numa_node_id()) | ||
3441 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
3442 | |||
3443 | return mpol_misplaced(page, vma, addr); | ||
3444 | } | ||
3445 | |||
3446 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3447 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3448 | { | ||
3449 | struct page *page = NULL; | ||
3450 | spinlock_t *ptl; | ||
3451 | int current_nid = -1; | ||
3452 | int target_nid; | ||
3453 | bool migrated = false; | ||
3454 | |||
3455 | /* | ||
3456 | * The "pte" at this point cannot be used safely without | ||
3457 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3458 | * the pfn may be screwed if the read is non atomic. | ||
3459 | * | ||
3460 | * ptep_modify_prot_start is not called as this is clearing | ||
3461 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3462 | * would be concurrent hardware modifications to the PTE. | ||
3463 | */ | ||
3464 | ptl = pte_lockptr(mm, pmd); | ||
3465 | spin_lock(ptl); | ||
3466 | if (unlikely(!pte_same(*ptep, pte))) { | ||
3467 | pte_unmap_unlock(ptep, ptl); | ||
3468 | goto out; | ||
3469 | } | ||
3470 | |||
3471 | pte = pte_mknonnuma(pte); | ||
3472 | set_pte_at(mm, addr, ptep, pte); | ||
3473 | update_mmu_cache(vma, addr, ptep); | ||
3474 | |||
3475 | page = vm_normal_page(vma, addr, pte); | ||
3476 | if (!page) { | ||
3477 | pte_unmap_unlock(ptep, ptl); | ||
3478 | return 0; | ||
3479 | } | ||
3480 | |||
3481 | current_nid = page_to_nid(page); | ||
3482 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | ||
3483 | pte_unmap_unlock(ptep, ptl); | ||
3484 | if (target_nid == -1) { | ||
3485 | /* | ||
3486 | * Account for the fault against the current node if it not | ||
3487 | * being replaced regardless of where the page is located. | ||
3488 | */ | ||
3489 | current_nid = numa_node_id(); | ||
3490 | put_page(page); | ||
3491 | goto out; | ||
3492 | } | ||
3493 | |||
3494 | /* Migrate to the requested node */ | ||
3495 | migrated = migrate_misplaced_page(page, target_nid); | ||
3496 | if (migrated) | ||
3497 | current_nid = target_nid; | ||
3498 | |||
3499 | out: | ||
3500 | if (current_nid != -1) | ||
3501 | task_numa_fault(current_nid, 1, migrated); | ||
3502 | return 0; | ||
3503 | } | ||
3504 | |||
3505 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3506 | #ifdef CONFIG_NUMA_BALANCING | ||
3507 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3508 | unsigned long addr, pmd_t *pmdp) | ||
3509 | { | ||
3510 | pmd_t pmd; | ||
3511 | pte_t *pte, *orig_pte; | ||
3512 | unsigned long _addr = addr & PMD_MASK; | ||
3513 | unsigned long offset; | ||
3514 | spinlock_t *ptl; | ||
3515 | bool numa = false; | ||
3516 | int local_nid = numa_node_id(); | ||
3517 | |||
3518 | spin_lock(&mm->page_table_lock); | ||
3519 | pmd = *pmdp; | ||
3520 | if (pmd_numa(pmd)) { | ||
3521 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3522 | numa = true; | ||
3523 | } | ||
3524 | spin_unlock(&mm->page_table_lock); | ||
3525 | |||
3526 | if (!numa) | ||
3527 | return 0; | ||
3528 | |||
3529 | /* we're in a page fault so some vma must be in the range */ | ||
3530 | BUG_ON(!vma); | ||
3531 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3532 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3533 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3534 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3535 | pte += offset >> PAGE_SHIFT; | ||
3536 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3537 | pte_t pteval = *pte; | ||
3538 | struct page *page; | ||
3539 | int curr_nid = local_nid; | ||
3540 | int target_nid; | ||
3541 | bool migrated; | ||
3542 | if (!pte_present(pteval)) | ||
3543 | continue; | ||
3544 | if (!pte_numa(pteval)) | ||
3545 | continue; | ||
3546 | if (addr >= vma->vm_end) { | ||
3547 | vma = find_vma(mm, addr); | ||
3548 | /* there's a pte present so there must be a vma */ | ||
3549 | BUG_ON(!vma); | ||
3550 | BUG_ON(addr < vma->vm_start); | ||
3551 | } | ||
3552 | if (pte_numa(pteval)) { | ||
3553 | pteval = pte_mknonnuma(pteval); | ||
3554 | set_pte_at(mm, addr, pte, pteval); | ||
3555 | } | ||
3556 | page = vm_normal_page(vma, addr, pteval); | ||
3557 | if (unlikely(!page)) | ||
3558 | continue; | ||
3559 | /* only check non-shared pages */ | ||
3560 | if (unlikely(page_mapcount(page) != 1)) | ||
3561 | continue; | ||
3562 | |||
3563 | /* | ||
3564 | * Note that the NUMA fault is later accounted to either | ||
3565 | * the node that is currently running or where the page is | ||
3566 | * migrated to. | ||
3567 | */ | ||
3568 | curr_nid = local_nid; | ||
3569 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3570 | page_to_nid(page)); | ||
3571 | if (target_nid == -1) { | ||
3572 | put_page(page); | ||
3573 | continue; | ||
3574 | } | ||
3575 | |||
3576 | /* Migrate to the requested node */ | ||
3577 | pte_unmap_unlock(pte, ptl); | ||
3578 | migrated = migrate_misplaced_page(page, target_nid); | ||
3579 | if (migrated) | ||
3580 | curr_nid = target_nid; | ||
3581 | task_numa_fault(curr_nid, 1, migrated); | ||
3582 | |||
3583 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3584 | } | ||
3585 | pte_unmap_unlock(orig_pte, ptl); | ||
3586 | |||
3587 | return 0; | ||
3588 | } | ||
3589 | #else | ||
3590 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3591 | unsigned long addr, pmd_t *pmdp) | ||
3592 | { | ||
3593 | BUG(); | ||
3594 | return 0; | ||
3595 | } | ||
3596 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3597 | |||
3434 | /* | 3598 | /* |
3435 | * These routines also need to handle stuff like marking pages dirty | 3599 | * These routines also need to handle stuff like marking pages dirty |
3436 | * and/or accessed for architectures that don't do it in hardware (most | 3600 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3469,6 +3633,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3469 | pte, pmd, flags, entry); | 3633 | pte, pmd, flags, entry); |
3470 | } | 3634 | } |
3471 | 3635 | ||
3636 | if (pte_numa(entry)) | ||
3637 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3638 | |||
3472 | ptl = pte_lockptr(mm, pmd); | 3639 | ptl = pte_lockptr(mm, pmd); |
3473 | spin_lock(ptl); | 3640 | spin_lock(ptl); |
3474 | if (unlikely(!pte_same(*pte, entry))) | 3641 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3537,9 +3704,13 @@ retry: | |||
3537 | 3704 | ||
3538 | barrier(); | 3705 | barrier(); |
3539 | if (pmd_trans_huge(orig_pmd)) { | 3706 | if (pmd_trans_huge(orig_pmd)) { |
3540 | if (flags & FAULT_FLAG_WRITE && | 3707 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3541 | !pmd_write(orig_pmd) && | 3708 | |
3542 | !pmd_trans_splitting(orig_pmd)) { | 3709 | if (pmd_numa(orig_pmd)) |
3710 | return do_huge_pmd_numa_page(mm, vma, address, | ||
3711 | orig_pmd, pmd); | ||
3712 | |||
3713 | if (dirty && !pmd_write(orig_pmd)) { | ||
3543 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3714 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3544 | orig_pmd); | 3715 | orig_pmd); |
3545 | /* | 3716 | /* |
@@ -3550,17 +3721,25 @@ retry: | |||
3550 | if (unlikely(ret & VM_FAULT_OOM)) | 3721 | if (unlikely(ret & VM_FAULT_OOM)) |
3551 | goto retry; | 3722 | goto retry; |
3552 | return ret; | 3723 | return ret; |
3724 | } else { | ||
3725 | huge_pmd_set_accessed(mm, vma, address, pmd, | ||
3726 | orig_pmd, dirty); | ||
3553 | } | 3727 | } |
3728 | |||
3554 | return 0; | 3729 | return 0; |
3555 | } | 3730 | } |
3556 | } | 3731 | } |
3557 | 3732 | ||
3733 | if (pmd_numa(*pmd)) | ||
3734 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3735 | |||
3558 | /* | 3736 | /* |
3559 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3737 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3560 | * run pte_offset_map on the pmd, if an huge pmd could | 3738 | * run pte_offset_map on the pmd, if an huge pmd could |
3561 | * materialize from under us from a different thread. | 3739 | * materialize from under us from a different thread. |
3562 | */ | 3740 | */ |
3563 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) | 3741 | if (unlikely(pmd_none(*pmd)) && |
3742 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3564 | return VM_FAULT_OOM; | 3743 | return VM_FAULT_OOM; |
3565 | /* if an huge pmd materialized from under us just retry later */ | 3744 | /* if an huge pmd materialized from under us just retry later */ |
3566 | if (unlikely(pmd_trans_huge(*pmd))) | 3745 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -3940,15 +4119,12 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3940 | struct file *f = vma->vm_file; | 4119 | struct file *f = vma->vm_file; |
3941 | char *buf = (char *)__get_free_page(GFP_KERNEL); | 4120 | char *buf = (char *)__get_free_page(GFP_KERNEL); |
3942 | if (buf) { | 4121 | if (buf) { |
3943 | char *p, *s; | 4122 | char *p; |
3944 | 4123 | ||
3945 | p = d_path(&f->f_path, buf, PAGE_SIZE); | 4124 | p = d_path(&f->f_path, buf, PAGE_SIZE); |
3946 | if (IS_ERR(p)) | 4125 | if (IS_ERR(p)) |
3947 | p = "?"; | 4126 | p = "?"; |
3948 | s = strrchr(p, '/'); | 4127 | printk("%s%s[%lx+%lx]", prefix, kbasename(p), |
3949 | if (s) | ||
3950 | p = s+1; | ||
3951 | printk("%s%s[%lx+%lx]", prefix, p, | ||
3952 | vma->vm_start, | 4128 | vma->vm_start, |
3953 | vma->vm_end - vma->vm_start); | 4129 | vma->vm_end - vma->vm_start); |
3954 | free_page((unsigned long)buf); | 4130 | free_page((unsigned long)buf); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4eeacae2b91..d04ed87bfacb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
107 | { | 107 | { |
108 | unsigned long type; | 108 | unsigned long type; |
109 | static DEFINE_MUTEX(ppb_lock); | ||
109 | 110 | ||
110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page) | |||
115 | ClearPagePrivate(page); | 116 | ClearPagePrivate(page); |
116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
119 | |||
120 | /* | ||
121 | * Please refer to comment for __free_pages_bootmem() | ||
122 | * for why we serialize here. | ||
123 | */ | ||
124 | mutex_lock(&ppb_lock); | ||
118 | __free_pages_bootmem(page, 0); | 125 | __free_pages_bootmem(page, 0); |
126 | mutex_unlock(&ppb_lock); | ||
119 | } | 127 | } |
120 | 128 | ||
121 | } | 129 | } |
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
205 | zone_span_writelock(zone); | 213 | zone_span_writelock(zone); |
206 | 214 | ||
207 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 215 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; |
208 | if (start_pfn < zone->zone_start_pfn) | 216 | if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) |
209 | zone->zone_start_pfn = start_pfn; | 217 | zone->zone_start_pfn = start_pfn; |
210 | 218 | ||
211 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - | 219 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - |
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
214 | zone_span_writeunlock(zone); | 222 | zone_span_writeunlock(zone); |
215 | } | 223 | } |
216 | 224 | ||
225 | static void resize_zone(struct zone *zone, unsigned long start_pfn, | ||
226 | unsigned long end_pfn) | ||
227 | { | ||
228 | zone_span_writelock(zone); | ||
229 | |||
230 | if (end_pfn - start_pfn) { | ||
231 | zone->zone_start_pfn = start_pfn; | ||
232 | zone->spanned_pages = end_pfn - start_pfn; | ||
233 | } else { | ||
234 | /* | ||
235 | * make it consistent with free_area_init_core(), | ||
236 | * if spanned_pages = 0, then keep start_pfn = 0 | ||
237 | */ | ||
238 | zone->zone_start_pfn = 0; | ||
239 | zone->spanned_pages = 0; | ||
240 | } | ||
241 | |||
242 | zone_span_writeunlock(zone); | ||
243 | } | ||
244 | |||
245 | static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | ||
246 | unsigned long end_pfn) | ||
247 | { | ||
248 | enum zone_type zid = zone_idx(zone); | ||
249 | int nid = zone->zone_pgdat->node_id; | ||
250 | unsigned long pfn; | ||
251 | |||
252 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
253 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); | ||
254 | } | ||
255 | |||
256 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | ||
257 | unsigned long start_pfn, unsigned long end_pfn) | ||
258 | { | ||
259 | int ret; | ||
260 | unsigned long flags; | ||
261 | unsigned long z1_start_pfn; | ||
262 | |||
263 | if (!z1->wait_table) { | ||
264 | ret = init_currently_empty_zone(z1, start_pfn, | ||
265 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | ||
266 | if (ret) | ||
267 | return ret; | ||
268 | } | ||
269 | |||
270 | pgdat_resize_lock(z1->zone_pgdat, &flags); | ||
271 | |||
272 | /* can't move pfns which are higher than @z2 */ | ||
273 | if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | ||
274 | goto out_fail; | ||
275 | /* the moved-out part must be at the leftmost of @z2 */ | ||
276 | if (start_pfn > z2->zone_start_pfn) | ||
277 | goto out_fail; | ||
278 | /* the moved range must overlap @z2 */ | ||
279 | if (end_pfn <= z2->zone_start_pfn) | ||
280 | goto out_fail; | ||
281 | |||
282 | /* use start_pfn for z1's start_pfn if z1 is empty */ | ||
283 | if (z1->spanned_pages) | ||
284 | z1_start_pfn = z1->zone_start_pfn; | ||
285 | else | ||
286 | z1_start_pfn = start_pfn; | ||
287 | |||
288 | resize_zone(z1, z1_start_pfn, end_pfn); | ||
289 | resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | ||
290 | |||
291 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
292 | |||
293 | fix_zone_id(z1, start_pfn, end_pfn); | ||
294 | |||
295 | return 0; | ||
296 | out_fail: | ||
297 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
298 | return -1; | ||
299 | } | ||
300 | |||
301 | static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | ||
302 | unsigned long start_pfn, unsigned long end_pfn) | ||
303 | { | ||
304 | int ret; | ||
305 | unsigned long flags; | ||
306 | unsigned long z2_end_pfn; | ||
307 | |||
308 | if (!z2->wait_table) { | ||
309 | ret = init_currently_empty_zone(z2, start_pfn, | ||
310 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | ||
311 | if (ret) | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | pgdat_resize_lock(z1->zone_pgdat, &flags); | ||
316 | |||
317 | /* can't move pfns which are lower than @z1 */ | ||
318 | if (z1->zone_start_pfn > start_pfn) | ||
319 | goto out_fail; | ||
320 | /* the moved-out part must be at the rightmost of @z1 */ | ||
321 | if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) | ||
322 | goto out_fail; | ||
323 | /* the moved range must overlap @z1 */ | ||
324 | if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | ||
325 | goto out_fail; | ||
326 | |||
327 | /* use end_pfn for z2's end_pfn if z2 is empty */ | ||
328 | if (z2->spanned_pages) | ||
329 | z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; | ||
330 | else | ||
331 | z2_end_pfn = end_pfn; | ||
332 | |||
333 | resize_zone(z1, z1->zone_start_pfn, start_pfn); | ||
334 | resize_zone(z2, start_pfn, z2_end_pfn); | ||
335 | |||
336 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
337 | |||
338 | fix_zone_id(z2, start_pfn, end_pfn); | ||
339 | |||
340 | return 0; | ||
341 | out_fail: | ||
342 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | ||
343 | return -1; | ||
344 | } | ||
345 | |||
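For illustration only, with made-up numbers: suppose ZONE_NORMAL spans pfns [0x100000, 0x140000) and ZONE_MOVABLE spans [0x140000, 0x180000) on the same node. Onlining the block [0x140000, 0x148000) as kernel memory ends up in move_pfn_range_left(normal, movable, 0x140000, 0x148000): the guards pass because the range starts at ZONE_MOVABLE's first pfn and ends inside it, resize_zone() grows ZONE_NORMAL to [0x100000, 0x148000) and shrinks ZONE_MOVABLE to [0x148000, 0x180000), and fix_zone_id() relinks the moved struct pages to their new zone. move_pfn_range_right() is the mirror operation, used when the rightmost part of the lower zone is onlined as movable memory.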
217 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | 346 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
218 | unsigned long end_pfn) | 347 | unsigned long end_pfn) |
219 | { | 348 | { |
220 | unsigned long old_pgdat_end_pfn = | 349 | unsigned long old_pgdat_end_pfn = |
221 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | 350 | pgdat->node_start_pfn + pgdat->node_spanned_pages; |
222 | 351 | ||
223 | if (start_pfn < pgdat->node_start_pfn) | 352 | if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) |
224 | pgdat->node_start_pfn = start_pfn; | 353 | pgdat->node_start_pfn = start_pfn; |
225 | 354 | ||
226 | pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - | 355 | pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - |
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
460 | return 0; | 589 | return 0; |
461 | } | 590 | } |
462 | 591 | ||
592 | #ifdef CONFIG_MOVABLE_NODE | ||
593 | /* | ||
594 | * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have | ||
595 | * normal memory. | ||
596 | */ | ||
597 | static bool can_online_high_movable(struct zone *zone) | ||
598 | { | ||
599 | return true; | ||
600 | } | ||
601 | #else /* CONFIG_MOVABLE_NODE */ | ||
602 | /* ensure every online node has NORMAL memory */ | ||
603 | static bool can_online_high_movable(struct zone *zone) | ||
604 | { | ||
605 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
606 | } | ||
607 | #endif /* CONFIG_MOVABLE_NODE */ | ||
463 | 608 | ||
464 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | 609 | /* check which state of node_states will be changed when online memory */ |
610 | /* check which states of node_states will be changed when onlining memory */ | ||
611 | struct zone *zone, struct memory_notify *arg) | ||
612 | { | ||
613 | int nid = zone_to_nid(zone); | ||
614 | enum zone_type zone_last = ZONE_NORMAL; | ||
615 | |||
616 | /* | ||
617 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] | ||
618 | * contains nodes which have zones of 0...ZONE_NORMAL, | ||
619 | * set zone_last to ZONE_NORMAL. | ||
620 | * | ||
621 | * If we have neither HIGHMEM nor a movable node, | ||
622 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of | ||
623 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
624 | */ | ||
625 | if (N_MEMORY == N_NORMAL_MEMORY) | ||
626 | zone_last = ZONE_MOVABLE; | ||
627 | |||
628 | /* | ||
629 | * If the memory to be onlined is in a zone of 0...zone_last, and the | ||
630 | * zones 0...zone_last have no memory before onlining, we will need to | ||
631 | * set the node in node_states[N_NORMAL_MEMORY] after the memory is | ||
632 | * onlined. | ||
633 | */ | ||
634 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) | ||
635 | arg->status_change_nid_normal = nid; | ||
636 | else | ||
637 | arg->status_change_nid_normal = -1; | ||
638 | |||
639 | #ifdef CONFIG_HIGHMEM | ||
640 | /* | ||
641 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
642 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
643 | * set zone_last to ZONE_HIGHMEM. | ||
644 | * | ||
645 | * If we don't have movable node, node_states[N_HIGH_MEMORY] | ||
646 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
647 | * set zone_last to ZONE_MOVABLE. | ||
648 | */ | ||
649 | zone_last = ZONE_HIGHMEM; | ||
650 | if (N_MEMORY == N_HIGH_MEMORY) | ||
651 | zone_last = ZONE_MOVABLE; | ||
652 | |||
653 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) | ||
654 | arg->status_change_nid_high = nid; | ||
655 | else | ||
656 | arg->status_change_nid_high = -1; | ||
657 | #else | ||
658 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
659 | #endif | ||
660 | |||
661 | /* | ||
662 | * If the node doesn't have memory before onlining, we will need to | ||
663 | * set the node in node_states[N_MEMORY] after the memory | ||
664 | * is onlined. | ||
665 | */ | ||
666 | if (!node_state(nid, N_MEMORY)) | ||
667 | arg->status_change_nid = nid; | ||
668 | else | ||
669 | arg->status_change_nid = -1; | ||
670 | } | ||
671 | |||
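A worked example of the checks above, assuming CONFIG_MOVABLE_NODE=y and CONFIG_HIGHMEM=n: onlining the first memory of a previously memoryless node into ZONE_MOVABLE. With that config N_MEMORY is distinct from N_NORMAL_MEMORY, so zone_last stays ZONE_NORMAL; zone_idx(ZONE_MOVABLE) > ZONE_NORMAL makes status_change_nid_normal = -1, the !CONFIG_HIGHMEM branch copies that into status_change_nid_high, and because the node had no memory at all, status_change_nid is set to the node id. node_states_set_node() therefore puts the node in N_MEMORY only, which is exactly the movable-only node state this series introduces.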
672 | static void node_states_set_node(int node, struct memory_notify *arg) | ||
673 | { | ||
674 | if (arg->status_change_nid_normal >= 0) | ||
675 | node_set_state(node, N_NORMAL_MEMORY); | ||
676 | |||
677 | if (arg->status_change_nid_high >= 0) | ||
678 | node_set_state(node, N_HIGH_MEMORY); | ||
679 | |||
680 | node_set_state(node, N_MEMORY); | ||
681 | } | ||
682 | |||
683 | |||
684 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | ||
465 | { | 685 | { |
466 | unsigned long onlined_pages = 0; | 686 | unsigned long onlined_pages = 0; |
467 | struct zone *zone; | 687 | struct zone *zone; |
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
471 | struct memory_notify arg; | 691 | struct memory_notify arg; |
472 | 692 | ||
473 | lock_memory_hotplug(); | 693 | lock_memory_hotplug(); |
694 | /* | ||
695 | * This doesn't need a lock to do pfn_to_page(). | ||
696 | * The section can't be removed here because of the | ||
697 | * memory_block->state_mutex. | ||
698 | */ | ||
699 | zone = page_zone(pfn_to_page(pfn)); | ||
700 | |||
701 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | ||
702 | !can_online_high_movable(zone)) { | ||
703 | unlock_memory_hotplug(); | ||
704 | return -1; | ||
705 | } | ||
706 | |||
707 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | ||
708 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | ||
709 | unlock_memory_hotplug(); | ||
710 | return -1; | ||
711 | } | ||
712 | } | ||
713 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | ||
714 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | ||
715 | unlock_memory_hotplug(); | ||
716 | return -1; | ||
717 | } | ||
718 | } | ||
719 | |||
720 | /* The code above may have changed the zone of the pfn range */ | ||
721 | zone = page_zone(pfn_to_page(pfn)); | ||
722 | |||
474 | arg.start_pfn = pfn; | 723 | arg.start_pfn = pfn; |
475 | arg.nr_pages = nr_pages; | 724 | arg.nr_pages = nr_pages; |
476 | arg.status_change_nid = -1; | 725 | node_states_check_changes_online(nr_pages, zone, &arg); |
477 | 726 | ||
478 | nid = page_to_nid(pfn_to_page(pfn)); | 727 | nid = page_to_nid(pfn_to_page(pfn)); |
479 | if (node_present_pages(nid) == 0) | ||
480 | arg.status_change_nid = nid; | ||
481 | 728 | ||
482 | ret = memory_notify(MEM_GOING_ONLINE, &arg); | 729 | ret = memory_notify(MEM_GOING_ONLINE, &arg); |
483 | ret = notifier_to_errno(ret); | 730 | ret = notifier_to_errno(ret); |
@@ -487,23 +734,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
487 | return ret; | 734 | return ret; |
488 | } | 735 | } |
489 | /* | 736 | /* |
490 | * This doesn't need a lock to do pfn_to_page(). | ||
491 | * The section can't be removed here because of the | ||
492 | * memory_block->state_mutex. | ||
493 | */ | ||
494 | zone = page_zone(pfn_to_page(pfn)); | ||
495 | /* | ||
496 | * If this zone is not populated, then it is not in zonelist. | 737 | * If this zone is not populated, then it is not in zonelist. |
497 | * This means the page allocator ignores this zone. | 738 | * This means the page allocator ignores this zone. |
498 | * So, zonelist must be updated after online. | 739 | * So, zonelist must be updated after online. |
499 | */ | 740 | */ |
500 | mutex_lock(&zonelists_mutex); | 741 | mutex_lock(&zonelists_mutex); |
501 | if (!populated_zone(zone)) | 742 | if (!populated_zone(zone)) { |
502 | need_zonelists_rebuild = 1; | 743 | need_zonelists_rebuild = 1; |
744 | build_all_zonelists(NULL, zone); | ||
745 | } | ||
503 | 746 | ||
504 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 747 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
505 | online_pages_range); | 748 | online_pages_range); |
506 | if (ret) { | 749 | if (ret) { |
750 | if (need_zonelists_rebuild) | ||
751 | zone_pcp_reset(zone); | ||
507 | mutex_unlock(&zonelists_mutex); | 752 | mutex_unlock(&zonelists_mutex); |
508 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", | 753 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", |
509 | (unsigned long long) pfn << PAGE_SHIFT, | 754 | (unsigned long long) pfn << PAGE_SHIFT, |
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
514 | return ret; | 759 | return ret; |
515 | } | 760 | } |
516 | 761 | ||
762 | zone->managed_pages += onlined_pages; | ||
517 | zone->present_pages += onlined_pages; | 763 | zone->present_pages += onlined_pages; |
518 | zone->zone_pgdat->node_present_pages += onlined_pages; | 764 | zone->zone_pgdat->node_present_pages += onlined_pages; |
519 | if (onlined_pages) { | 765 | if (onlined_pages) { |
520 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 766 | node_states_set_node(zone_to_nid(zone), &arg); |
521 | if (need_zonelists_rebuild) | 767 | if (need_zonelists_rebuild) |
522 | build_all_zonelists(NULL, zone); | 768 | build_all_zonelists(NULL, NULL); |
523 | else | 769 | else |
524 | zone_pcp_update(zone); | 770 | zone_pcp_update(zone); |
525 | } | 771 | } |
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
812 | * migrate_pages returns # of failed pages. | 1058 | * migrate_pages returns # of failed pages. |
813 | */ | 1059 | */ |
814 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1060 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
815 | true, MIGRATE_SYNC); | 1061 | true, MIGRATE_SYNC, |
1062 | MR_MEMORY_HOTPLUG); | ||
816 | if (ret) | 1063 | if (ret) |
817 | putback_lru_pages(&source); | 1064 | putback_lru_pages(&source); |
818 | } | 1065 | } |
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, | |||
847 | { | 1094 | { |
848 | int ret; | 1095 | int ret; |
849 | long offlined = *(long *)data; | 1096 | long offlined = *(long *)data; |
850 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); | 1097 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); |
851 | offlined = nr_pages; | 1098 | offlined = nr_pages; |
852 | if (!ret) | 1099 | if (!ret) |
853 | *(long *)data += offlined; | 1100 | *(long *)data += offlined; |
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
867 | return offlined; | 1114 | return offlined; |
868 | } | 1115 | } |
869 | 1116 | ||
1117 | #ifdef CONFIG_MOVABLE_NODE | ||
1118 | /* | ||
1119 | * When CONFIG_MOVABLE_NODE is set, we permit offlining of a node which | ||
1120 | * doesn't have normal memory. | ||
1121 | */ | ||
1122 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | ||
1123 | { | ||
1124 | return true; | ||
1125 | } | ||
1126 | #else /* CONFIG_MOVABLE_NODE */ | ||
1127 | /* ensure the node has NORMAL memory if it is still online */ | ||
1128 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | ||
1129 | { | ||
1130 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1131 | unsigned long present_pages = 0; | ||
1132 | enum zone_type zt; | ||
1133 | |||
1134 | for (zt = 0; zt <= ZONE_NORMAL; zt++) | ||
1135 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1136 | |||
1137 | if (present_pages > nr_pages) | ||
1138 | return true; | ||
1139 | |||
1140 | present_pages = 0; | ||
1141 | for (; zt <= ZONE_MOVABLE; zt++) | ||
1142 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1143 | |||
1144 | /* | ||
1145 | * we can't offline the last normal memory until all | ||
1146 | * higher memory is offlined. | ||
1147 | */ | ||
1148 | return present_pages == 0; | ||
1149 | } | ||
1150 | #endif /* CONFIG_MOVABLE_NODE */ | ||
1151 | |||
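A worked example with made-up sizes, on a kernel without CONFIG_MOVABLE_NODE: a node has 512 MB present in ZONE_NORMAL and 1.5 GB present in ZONE_MOVABLE. Offlining a 128 MB block of the normal memory is allowed, since the pages present in ZONE_DMA..ZONE_NORMAL (512 MB) exceed nr_pages. Offlining the full 512 MB of normal memory is refused: the first test fails and the higher zones still have pages present, so the movable memory must be offlined first, otherwise the node would stay online with only highmem/movable memory.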
1152 | /* check which state of node_states will be changed when offline memory */ | ||
1153 | static void node_states_check_changes_offline(unsigned long nr_pages, | ||
1154 | struct zone *zone, struct memory_notify *arg) | ||
1155 | { | ||
1156 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
1157 | unsigned long present_pages = 0; | ||
1158 | enum zone_type zt, zone_last = ZONE_NORMAL; | ||
1159 | |||
1160 | /* | ||
1161 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] | ||
1162 | * contains nodes which have zones of 0...ZONE_NORMAL, | ||
1163 | * set zone_last to ZONE_NORMAL. | ||
1164 | * | ||
1165 | * If we have neither HIGHMEM nor a movable node, | ||
1166 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of | ||
1167 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
1168 | */ | ||
1169 | if (N_MEMORY == N_NORMAL_MEMORY) | ||
1170 | zone_last = ZONE_MOVABLE; | ||
1171 | |||
1172 | /* | ||
1173 | * check whether node_states[N_NORMAL_MEMORY] will be changed. | ||
1174 | * If the memory to be offlined is in a zone of 0...zone_last, | ||
1175 | * and it is the last present memory, 0...zone_last will | ||
1176 | * become empty after offlining, thus we can determine we will | ||
1177 | * need to clear the node from node_states[N_NORMAL_MEMORY]. | ||
1178 | */ | ||
1179 | for (zt = 0; zt <= zone_last; zt++) | ||
1180 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1181 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
1182 | arg->status_change_nid_normal = zone_to_nid(zone); | ||
1183 | else | ||
1184 | arg->status_change_nid_normal = -1; | ||
1185 | |||
1186 | #ifdef CONFIG_HIGHMEM | ||
1187 | /* | ||
1188 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
1189 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
1190 | * set zone_last to ZONE_HIGHMEM. | ||
1191 | * | ||
1192 | * If we don't have movable node, node_states[N_HIGH_MEMORY] | ||
1193 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
1194 | * set zone_last to ZONE_MOVABLE. | ||
1195 | */ | ||
1196 | zone_last = ZONE_HIGHMEM; | ||
1197 | if (N_MEMORY == N_HIGH_MEMORY) | ||
1198 | zone_last = ZONE_MOVABLE; | ||
1199 | |||
1200 | for (; zt <= zone_last; zt++) | ||
1201 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1202 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
1203 | arg->status_change_nid_high = zone_to_nid(zone); | ||
1204 | else | ||
1205 | arg->status_change_nid_high = -1; | ||
1206 | #else | ||
1207 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
1208 | #endif | ||
1209 | |||
1210 | /* | ||
1211 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE | ||
1212 | */ | ||
1213 | zone_last = ZONE_MOVABLE; | ||
1214 | |||
1215 | /* | ||
1216 | * check whether node_states[N_HIGH_MEMORY] will be changed. | ||
1217 | * If we try to offline the last present @nr_pages from the node, | ||
1218 | * we can determine we will need to clear the node from | ||
1219 | * node_states[N_HIGH_MEMORY]. | ||
1220 | */ | ||
1221 | for (; zt <= zone_last; zt++) | ||
1222 | present_pages += pgdat->node_zones[zt].present_pages; | ||
1223 | if (nr_pages >= present_pages) | ||
1224 | arg->status_change_nid = zone_to_nid(zone); | ||
1225 | else | ||
1226 | arg->status_change_nid = -1; | ||
1227 | } | ||
1228 | |||
1229 | static void node_states_clear_node(int node, struct memory_notify *arg) | ||
1230 | { | ||
1231 | if (arg->status_change_nid_normal >= 0) | ||
1232 | node_clear_state(node, N_NORMAL_MEMORY); | ||
1233 | |||
1234 | if ((N_MEMORY != N_NORMAL_MEMORY) && | ||
1235 | (arg->status_change_nid_high >= 0)) | ||
1236 | node_clear_state(node, N_HIGH_MEMORY); | ||
1237 | |||
1238 | if ((N_MEMORY != N_HIGH_MEMORY) && | ||
1239 | (arg->status_change_nid >= 0)) | ||
1240 | node_clear_state(node, N_MEMORY); | ||
1241 | } | ||
1242 | |||
870 | static int __ref __offline_pages(unsigned long start_pfn, | 1243 | static int __ref __offline_pages(unsigned long start_pfn, |
871 | unsigned long end_pfn, unsigned long timeout) | 1244 | unsigned long end_pfn, unsigned long timeout) |
872 | { | 1245 | { |
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
893 | node = zone_to_nid(zone); | 1266 | node = zone_to_nid(zone); |
894 | nr_pages = end_pfn - start_pfn; | 1267 | nr_pages = end_pfn - start_pfn; |
895 | 1268 | ||
1269 | ret = -EINVAL; | ||
1270 | if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) | ||
1271 | goto out; | ||
1272 | |||
896 | /* set above range as isolated */ | 1273 | /* set above range as isolated */ |
897 | ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1274 | ret = start_isolate_page_range(start_pfn, end_pfn, |
1275 | MIGRATE_MOVABLE, true); | ||
898 | if (ret) | 1276 | if (ret) |
899 | goto out; | 1277 | goto out; |
900 | 1278 | ||
901 | arg.start_pfn = start_pfn; | 1279 | arg.start_pfn = start_pfn; |
902 | arg.nr_pages = nr_pages; | 1280 | arg.nr_pages = nr_pages; |
903 | arg.status_change_nid = -1; | 1281 | node_states_check_changes_offline(nr_pages, zone, &arg); |
904 | if (nr_pages >= node_present_pages(node)) | ||
905 | arg.status_change_nid = node; | ||
906 | 1282 | ||
907 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); | 1283 | ret = memory_notify(MEM_GOING_OFFLINE, &arg); |
908 | ret = notifier_to_errno(ret); | 1284 | ret = notifier_to_errno(ret); |
@@ -943,10 +1319,10 @@ repeat: | |||
943 | goto repeat; | 1319 | goto repeat; |
944 | } | 1320 | } |
945 | } | 1321 | } |
946 | /* drain all zone's lru pagevec, this is asyncronous... */ | 1322 | /* drain all zone's lru pagevec, this is asynchronous... */ |
947 | lru_add_drain_all(); | 1323 | lru_add_drain_all(); |
948 | yield(); | 1324 | yield(); |
949 | /* drain pcp pages , this is synchrouns. */ | 1325 | /* drain pcp pages, this is synchronous. */ |
950 | drain_all_pages(); | 1326 | drain_all_pages(); |
951 | /* check again */ | 1327 | /* check again */ |
952 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | 1328 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); |
@@ -955,12 +1331,13 @@ repeat: | |||
955 | goto failed_removal; | 1331 | goto failed_removal; |
956 | } | 1332 | } |
957 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); | 1333 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); |
958 | /* Ok, all of our target is islaoted. | 1334 | /* Ok, all of our target is isolated. |
959 | We cannot do rollback at this point. */ | 1335 | We cannot do rollback at this point. */ |
960 | offline_isolated_pages(start_pfn, end_pfn); | 1336 | offline_isolated_pages(start_pfn, end_pfn); |
961 | /* reset pagetype flags and makes migrate type to be MOVABLE */ | 1337 | /* reset pagetype flags and makes migrate type to be MOVABLE */ |
962 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1338 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
963 | /* removal success */ | 1339 | /* removal success */ |
1340 | zone->managed_pages -= offlined_pages; | ||
964 | zone->present_pages -= offlined_pages; | 1341 | zone->present_pages -= offlined_pages; |
965 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 1342 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
966 | totalram_pages -= offlined_pages; | 1343 | totalram_pages -= offlined_pages; |
@@ -975,10 +1352,9 @@ repeat: | |||
975 | } else | 1352 | } else |
976 | zone_pcp_update(zone); | 1353 | zone_pcp_update(zone); |
977 | 1354 | ||
978 | if (!node_present_pages(node)) { | 1355 | node_states_clear_node(node, &arg); |
979 | node_clear_state(node, N_HIGH_MEMORY); | 1356 | if (arg.status_change_nid >= 0) |
980 | kswapd_stop(node); | 1357 | kswapd_stop(node); |
981 | } | ||
982 | 1358 | ||
983 | vm_total_pages = nr_free_pagecache_pages(); | 1359 | vm_total_pages = nr_free_pagecache_pages(); |
984 | writeback_set_ratelimit(); | 1360 | writeback_set_ratelimit(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ea600da8940..d1b315e98627 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -90,6 +90,7 @@ | |||
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | ||
93 | 94 | ||
94 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = { | |||
117 | .flags = MPOL_F_LOCAL, | 118 | .flags = MPOL_F_LOCAL, |
118 | }; | 119 | }; |
119 | 120 | ||
121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | ||
122 | |||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | ||
124 | { | ||
125 | struct mempolicy *pol = p->mempolicy; | ||
126 | int node; | ||
127 | |||
128 | if (!pol) { | ||
129 | node = numa_node_id(); | ||
130 | if (node != -1) | ||
131 | pol = &preferred_node_policy[node]; | ||
132 | |||
133 | /* preferred_node_policy is not initialised early in boot */ | ||
134 | if (!pol->mode) | ||
135 | pol = NULL; | ||
136 | } | ||
137 | |||
138 | return pol; | ||
139 | } | ||
140 | |||
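The preferred_node_policy[] array used as the fallback here is initialised by numa_policy_init() later in this patch: one MPOL_PREFERRED entry per node with MPOL_F_MOF | MPOL_F_MORON set. In practice (as far as this series shows) a task that never called set_mempolicy() and is running on, say, node 3 resolves to "prefer node 3, may migrate on fault", which is what later lets mpol_misplaced() pull its pages toward the CPU it actually faults from.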
120 | static const struct mempolicy_operations { | 141 | static const struct mempolicy_operations { |
121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 142 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
122 | /* | 143 | /* |
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, | |||
212 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | 233 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ |
213 | if (pol == NULL) | 234 | if (pol == NULL) |
214 | return 0; | 235 | return 0; |
215 | /* Check N_HIGH_MEMORY */ | 236 | /* Check N_MEMORY */ |
216 | nodes_and(nsc->mask1, | 237 | nodes_and(nsc->mask1, |
217 | cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); | 238 | cpuset_current_mems_allowed, node_states[N_MEMORY]); |
218 | 239 | ||
219 | VM_BUG_ON(!nodes); | 240 | VM_BUG_ON(!nodes); |
220 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | 241 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) |
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
254 | if (mode == MPOL_DEFAULT) { | 275 | if (mode == MPOL_DEFAULT) { |
255 | if (nodes && !nodes_empty(*nodes)) | 276 | if (nodes && !nodes_empty(*nodes)) |
256 | return ERR_PTR(-EINVAL); | 277 | return ERR_PTR(-EINVAL); |
257 | return NULL; /* simply delete any existing policy */ | 278 | return NULL; |
258 | } | 279 | } |
259 | VM_BUG_ON(!nodes); | 280 | VM_BUG_ON(!nodes); |
260 | 281 | ||
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
269 | (flags & MPOL_F_RELATIVE_NODES))) | 290 | (flags & MPOL_F_RELATIVE_NODES))) |
270 | return ERR_PTR(-EINVAL); | 291 | return ERR_PTR(-EINVAL); |
271 | } | 292 | } |
293 | } else if (mode == MPOL_LOCAL) { | ||
294 | if (!nodes_empty(*nodes)) | ||
295 | return ERR_PTR(-EINVAL); | ||
296 | mode = MPOL_PREFERRED; | ||
272 | } else if (nodes_empty(*nodes)) | 297 | } else if (nodes_empty(*nodes)) |
273 | return ERR_PTR(-EINVAL); | 298 | return ERR_PTR(-EINVAL); |
274 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 299 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
511 | pmd = pmd_offset(pud, addr); | 536 | pmd = pmd_offset(pud, addr); |
512 | do { | 537 | do { |
513 | next = pmd_addr_end(addr, end); | 538 | next = pmd_addr_end(addr, end); |
514 | split_huge_page_pmd(vma->vm_mm, pmd); | 539 | split_huge_page_pmd(vma, addr, pmd); |
515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 540 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
516 | continue; | 541 | continue; |
517 | if (check_pte_range(vma, pmd, addr, next, nodes, | 542 | if (check_pte_range(vma, pmd, addr, next, nodes, |
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
561 | return 0; | 586 | return 0; |
562 | } | 587 | } |
563 | 588 | ||
589 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
590 | /* | ||
591 | * This is used to mark a range of virtual addresses to be inaccessible. | ||
592 | * These are later cleared by a NUMA hinting fault. Depending on these | ||
593 | * faults, pages may be migrated for better NUMA placement. | ||
594 | * | ||
595 | * This is assuming that NUMA faults are handled using PROT_NONE. If | ||
596 | * an architecture makes a different choice, it will need further | ||
597 | * changes to the core. | ||
598 | */ | ||
599 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
600 | unsigned long addr, unsigned long end) | ||
601 | { | ||
602 | int nr_updated; | ||
603 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
604 | |||
605 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | ||
606 | if (nr_updated) | ||
607 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | ||
608 | |||
609 | return nr_updated; | ||
610 | } | ||
611 | #else | ||
612 | static unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
613 | unsigned long addr, unsigned long end) | ||
614 | { | ||
615 | return 0; | ||
616 | } | ||
617 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
618 | |||
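As a rough worked example (assuming 4 KB pages and a fully populated range): lazily rebinding a 2 MB VMA with MPOL_MF_LAZY reaches change_prot_numa() for 512 ptes, change_protection() is invoked with its last argument set (presumably the new prot_numa switch added elsewhere in this series), and NUMA_PTE_UPDATES grows by 512. No page is migrated at mbind() time; each page is only reconsidered when the task later takes the corresponding NUMA hinting fault on it.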
564 | /* | 619 | /* |
565 | * Check if all pages in a range are on a set of nodes. | 620 | * Check if all pages in a range are on a set of nodes. |
566 | * If pagelist != NULL then isolate pages from the LRU and | 621 | * If pagelist != NULL then isolate pages from the LRU and |
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
579 | return ERR_PTR(-EFAULT); | 634 | return ERR_PTR(-EFAULT); |
580 | prev = NULL; | 635 | prev = NULL; |
581 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 636 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
637 | unsigned long endvma = vma->vm_end; | ||
638 | |||
639 | if (endvma > end) | ||
640 | endvma = end; | ||
641 | if (vma->vm_start > start) | ||
642 | start = vma->vm_start; | ||
643 | |||
582 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 644 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
583 | if (!vma->vm_next && vma->vm_end < end) | 645 | if (!vma->vm_next && vma->vm_end < end) |
584 | return ERR_PTR(-EFAULT); | 646 | return ERR_PTR(-EFAULT); |
585 | if (prev && prev->vm_end < vma->vm_start) | 647 | if (prev && prev->vm_end < vma->vm_start) |
586 | return ERR_PTR(-EFAULT); | 648 | return ERR_PTR(-EFAULT); |
587 | } | 649 | } |
588 | if (!is_vm_hugetlb_page(vma) && | 650 | |
589 | ((flags & MPOL_MF_STRICT) || | 651 | if (is_vm_hugetlb_page(vma)) |
652 | goto next; | ||
653 | |||
654 | if (flags & MPOL_MF_LAZY) { | ||
655 | change_prot_numa(vma, start, endvma); | ||
656 | goto next; | ||
657 | } | ||
658 | |||
659 | if ((flags & MPOL_MF_STRICT) || | ||
590 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 660 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
591 | vma_migratable(vma)))) { | 661 | vma_migratable(vma))) { |
592 | unsigned long endvma = vma->vm_end; | ||
593 | 662 | ||
594 | if (endvma > end) | ||
595 | endvma = end; | ||
596 | if (vma->vm_start > start) | ||
597 | start = vma->vm_start; | ||
598 | err = check_pgd_range(vma, start, endvma, nodes, | 663 | err = check_pgd_range(vma, start, endvma, nodes, |
599 | flags, private); | 664 | flags, private); |
600 | if (err) { | 665 | if (err) { |
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
602 | break; | 667 | break; |
603 | } | 668 | } |
604 | } | 669 | } |
670 | next: | ||
605 | prev = vma; | 671 | prev = vma; |
606 | } | 672 | } |
607 | return first; | 673 | return first; |
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
961 | 1027 | ||
962 | if (!list_empty(&pagelist)) { | 1028 | if (!list_empty(&pagelist)) { |
963 | err = migrate_pages(&pagelist, new_node_page, dest, | 1029 | err = migrate_pages(&pagelist, new_node_page, dest, |
964 | false, MIGRATE_SYNC); | 1030 | false, MIGRATE_SYNC, |
1031 | MR_SYSCALL); | ||
965 | if (err) | 1032 | if (err) |
966 | putback_lru_pages(&pagelist); | 1033 | putback_lru_pages(&pagelist); |
967 | } | 1034 | } |
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1133 | int err; | 1200 | int err; |
1134 | LIST_HEAD(pagelist); | 1201 | LIST_HEAD(pagelist); |
1135 | 1202 | ||
1136 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | | 1203 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1137 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
1138 | return -EINVAL; | 1204 | return -EINVAL; |
1139 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1205 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1140 | return -EPERM; | 1206 | return -EPERM; |
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1157 | if (IS_ERR(new)) | 1223 | if (IS_ERR(new)) |
1158 | return PTR_ERR(new); | 1224 | return PTR_ERR(new); |
1159 | 1225 | ||
1226 | if (flags & MPOL_MF_LAZY) | ||
1227 | new->flags |= MPOL_F_MOF; | ||
1228 | |||
1160 | /* | 1229 | /* |
1161 | * If we are using the default policy then operation | 1230 | * If we are using the default policy then operation |
1162 | * on discontinuous address spaces is okay after all | 1231 | * on discontinuous address spaces is okay after all |
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1193 | vma = check_range(mm, start, end, nmask, | 1262 | vma = check_range(mm, start, end, nmask, |
1194 | flags | MPOL_MF_INVERT, &pagelist); | 1263 | flags | MPOL_MF_INVERT, &pagelist); |
1195 | 1264 | ||
1196 | err = PTR_ERR(vma); | 1265 | err = PTR_ERR(vma); /* maybe ... */ |
1197 | if (!IS_ERR(vma)) { | 1266 | if (!IS_ERR(vma)) |
1198 | int nr_failed = 0; | ||
1199 | |||
1200 | err = mbind_range(mm, start, end, new); | 1267 | err = mbind_range(mm, start, end, new); |
1201 | 1268 | ||
1269 | if (!err) { | ||
1270 | int nr_failed = 0; | ||
1271 | |||
1202 | if (!list_empty(&pagelist)) { | 1272 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | ||
1203 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1204 | (unsigned long)vma, | 1275 | (unsigned long)vma, |
1205 | false, MIGRATE_SYNC); | 1276 | false, MIGRATE_SYNC, |
1277 | MR_MEMPOLICY_MBIND); | ||
1206 | if (nr_failed) | 1278 | if (nr_failed) |
1207 | putback_lru_pages(&pagelist); | 1279 | putback_lru_pages(&pagelist); |
1208 | } | 1280 | } |
1209 | 1281 | ||
1210 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1282 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1211 | err = -EIO; | 1283 | err = -EIO; |
1212 | } else | 1284 | } else |
1213 | putback_lru_pages(&pagelist); | 1285 | putback_lru_pages(&pagelist); |
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1388 | goto out_put; | 1460 | goto out_put; |
1389 | } | 1461 | } |
1390 | 1462 | ||
1391 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { | 1463 | if (!nodes_subset(*new, node_states[N_MEMORY])) { |
1392 | err = -EINVAL; | 1464 | err = -EINVAL; |
1393 | goto out_put; | 1465 | goto out_put; |
1394 | } | 1466 | } |
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1546 | struct mempolicy *get_vma_policy(struct task_struct *task, | 1618 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1547 | struct vm_area_struct *vma, unsigned long addr) | 1619 | struct vm_area_struct *vma, unsigned long addr) |
1548 | { | 1620 | { |
1549 | struct mempolicy *pol = task->mempolicy; | 1621 | struct mempolicy *pol = get_task_policy(task); |
1550 | 1622 | ||
1551 | if (vma) { | 1623 | if (vma) { |
1552 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1624 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1907 | unsigned long addr, int node) | 1979 | unsigned long addr, int node) |
1908 | { | 1980 | { |
1909 | struct mempolicy *pol; | 1981 | struct mempolicy *pol; |
1910 | struct zonelist *zl; | ||
1911 | struct page *page; | 1982 | struct page *page; |
1912 | unsigned int cpuset_mems_cookie; | 1983 | unsigned int cpuset_mems_cookie; |
1913 | 1984 | ||
@@ -1926,23 +1997,11 @@ retry_cpuset: | |||
1926 | 1997 | ||
1927 | return page; | 1998 | return page; |
1928 | } | 1999 | } |
1929 | zl = policy_zonelist(gfp, pol, node); | 2000 | page = __alloc_pages_nodemask(gfp, order, |
1930 | if (unlikely(mpol_needs_cond_ref(pol))) { | 2001 | policy_zonelist(gfp, pol, node), |
1931 | /* | ||
1932 | * slow path: ref counted shared policy | ||
1933 | */ | ||
1934 | struct page *page = __alloc_pages_nodemask(gfp, order, | ||
1935 | zl, policy_nodemask(gfp, pol)); | ||
1936 | __mpol_put(pol); | ||
1937 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1938 | goto retry_cpuset; | ||
1939 | return page; | ||
1940 | } | ||
1941 | /* | ||
1942 | * fast path: default or task policy | ||
1943 | */ | ||
1944 | page = __alloc_pages_nodemask(gfp, order, zl, | ||
1945 | policy_nodemask(gfp, pol)); | 2002 | policy_nodemask(gfp, pol)); |
2003 | if (unlikely(mpol_needs_cond_ref(pol))) | ||
2004 | __mpol_put(pol); | ||
1946 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2005 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1947 | goto retry_cpuset; | 2006 | goto retry_cpuset; |
1948 | return page; | 2007 | return page; |
@@ -1969,7 +2028,7 @@ retry_cpuset: | |||
1969 | */ | 2028 | */ |
1970 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2029 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
1971 | { | 2030 | { |
1972 | struct mempolicy *pol = current->mempolicy; | 2031 | struct mempolicy *pol = get_task_policy(current); |
1973 | struct page *page; | 2032 | struct page *page; |
1974 | unsigned int cpuset_mems_cookie; | 2033 | unsigned int cpuset_mems_cookie; |
1975 | 2034 | ||
@@ -2153,6 +2212,115 @@ static void sp_free(struct sp_node *n) | |||
2153 | kmem_cache_free(sn_cache, n); | 2212 | kmem_cache_free(sn_cache, n); |
2154 | } | 2213 | } |
2155 | 2214 | ||
2215 | /** | ||
2216 | * mpol_misplaced - check whether current page node is valid in policy | ||
2217 | * | ||
2218 | * @page - page to be checked | ||
2219 | * @vma - vm area where page mapped | ||
2220 | * @addr - virtual address where page mapped | ||
2221 | * | ||
2222 | * Lookup current policy node id for vma,addr and "compare to" page's | ||
2223 | * node id. | ||
2224 | * | ||
2225 | * Returns: | ||
2226 | * -1 - not misplaced, page is in the right node | ||
2227 | * node - node id where the page should be | ||
2228 | * | ||
2229 | * Policy determination "mimics" alloc_page_vma(). | ||
2230 | * Called from fault path where we know the vma and faulting address. | ||
2231 | */ | ||
2232 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) | ||
2233 | { | ||
2234 | struct mempolicy *pol; | ||
2235 | struct zone *zone; | ||
2236 | int curnid = page_to_nid(page); | ||
2237 | unsigned long pgoff; | ||
2238 | int polnid = -1; | ||
2239 | int ret = -1; | ||
2240 | |||
2241 | BUG_ON(!vma); | ||
2242 | |||
2243 | pol = get_vma_policy(current, vma, addr); | ||
2244 | if (!(pol->flags & MPOL_F_MOF)) | ||
2245 | goto out; | ||
2246 | |||
2247 | switch (pol->mode) { | ||
2248 | case MPOL_INTERLEAVE: | ||
2249 | BUG_ON(addr >= vma->vm_end); | ||
2250 | BUG_ON(addr < vma->vm_start); | ||
2251 | |||
2252 | pgoff = vma->vm_pgoff; | ||
2253 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
2254 | polnid = offset_il_node(pol, vma, pgoff); | ||
2255 | break; | ||
2256 | |||
2257 | case MPOL_PREFERRED: | ||
2258 | if (pol->flags & MPOL_F_LOCAL) | ||
2259 | polnid = numa_node_id(); | ||
2260 | else | ||
2261 | polnid = pol->v.preferred_node; | ||
2262 | break; | ||
2263 | |||
2264 | case MPOL_BIND: | ||
2265 | /* | ||
2266 | * allows binding to multiple nodes. | ||
2267 | * use current page if in policy nodemask, | ||
2268 | * else select nearest allowed node, if any. | ||
2269 | * If no allowed nodes, use current [!misplaced]. | ||
2270 | */ | ||
2271 | if (node_isset(curnid, pol->v.nodes)) | ||
2272 | goto out; | ||
2273 | (void)first_zones_zonelist( | ||
2274 | node_zonelist(numa_node_id(), GFP_HIGHUSER), | ||
2275 | gfp_zone(GFP_HIGHUSER), | ||
2276 | &pol->v.nodes, &zone); | ||
2277 | polnid = zone->node; | ||
2278 | break; | ||
2279 | |||
2280 | default: | ||
2281 | BUG(); | ||
2282 | } | ||
2283 | |||
2284 | /* Migrate the page towards the node whose CPU is referencing it */ | ||
2285 | if (pol->flags & MPOL_F_MORON) { | ||
2286 | int last_nid; | ||
2287 | |||
2288 | polnid = numa_node_id(); | ||
2289 | |||
2290 | /* | ||
2291 | * Multi-stage node selection is used in conjunction | ||
2292 | * with a periodic migration fault to build a temporal | ||
2293 | * task<->page relation. By using a two-stage filter we | ||
2294 | * remove short/unlikely relations. | ||
2295 | * | ||
2296 | * Using P(p) ~ n_p / n_t as per frequentist | ||
2297 | * probability, we can equate a task's usage of a | ||
2298 | * particular page (n_p) per total usage of this | ||
2299 | * page (n_t) (in a given time-span) to a probability. | ||
2300 | * | ||
2301 | * Our periodic faults will sample this probability and | ||
2302 | * getting the same result twice in a row, given these | ||
2303 | * samples are fully independent, is then given by | ||
2304 | * P(n)^2, provided our sample period is sufficiently | ||
2305 | * short compared to the usage pattern. | ||
2306 | * | ||
2307 | * This quadratic squishes small probabilities, making | ||
2308 | * it less likely we act on an unlikely task<->page | ||
2309 | * relation. | ||
2310 | */ | ||
2311 | last_nid = page_xchg_last_nid(page, polnid); | ||
2312 | if (last_nid != polnid) | ||
2313 | goto out; | ||
2314 | } | ||
2315 | |||
2316 | if (curnid != polnid) | ||
2317 | ret = polnid; | ||
2318 | out: | ||
2319 | mpol_cond_put(pol); | ||
2320 | |||
2321 | return ret; | ||
2322 | } | ||
2323 | |||
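The two-stage filter in the MPOL_F_MORON branch is easier to see with numbers: if a task accounts for a fraction p of the hinting faults on a page and the samples are treated as independent, two consecutive faults naming that task's node occur with probability about p^2 — p = 0.25 gives roughly 6%, while p near 1 almost always passes. A minimal sketch of the filter itself (illustrative only; it reuses page_xchg_last_nid() from this series):

/* Act only when two consecutive hinting faults name the same node. */
static bool two_stage_filter_hit(struct page *page, int this_nid)
{
	/* record this sample and fetch the previous one */
	int last_nid = page_xchg_last_nid(page, this_nid);

	return last_nid == this_nid;
}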
2156 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2324 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2157 | { | 2325 | { |
2158 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2326 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
@@ -2318,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2318 | mutex_unlock(&p->mutex); | 2486 | mutex_unlock(&p->mutex); |
2319 | } | 2487 | } |
2320 | 2488 | ||
2489 | #ifdef CONFIG_NUMA_BALANCING | ||
2490 | static bool __initdata numabalancing_override; | ||
2491 | |||
2492 | static void __init check_numabalancing_enable(void) | ||
2493 | { | ||
2494 | bool numabalancing_default = false; | ||
2495 | |||
2496 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | ||
2497 | numabalancing_default = true; | ||
2498 | |||
2499 | if (nr_node_ids > 1 && !numabalancing_override) { | ||
2500 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | ||
2501 | "Configure with numa_balancing= or sysctl"); | ||
2502 | set_numabalancing_state(numabalancing_default); | ||
2503 | } | ||
2504 | } | ||
2505 | |||
2506 | static int __init setup_numabalancing(char *str) | ||
2507 | { | ||
2508 | int ret = 0; | ||
2509 | if (!str) | ||
2510 | goto out; | ||
2511 | numabalancing_override = true; | ||
2512 | |||
2513 | if (!strcmp(str, "enable")) { | ||
2514 | set_numabalancing_state(true); | ||
2515 | ret = 1; | ||
2516 | } else if (!strcmp(str, "disable")) { | ||
2517 | set_numabalancing_state(false); | ||
2518 | ret = 1; | ||
2519 | } | ||
2520 | out: | ||
2521 | if (!ret) | ||
2522 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | ||
2523 | |||
2524 | return ret; | ||
2525 | } | ||
2526 | __setup("numa_balancing=", setup_numabalancing); | ||
2527 | #else | ||
2528 | static inline void __init check_numabalancing_enable(void) | ||
2529 | { | ||
2530 | } | ||
2531 | #endif /* CONFIG_NUMA_BALANCING */ | ||
2532 | |||
2321 | /* assumes fs == KERNEL_DS */ | 2533 | /* assumes fs == KERNEL_DS */ |
2322 | void __init numa_policy_init(void) | 2534 | void __init numa_policy_init(void) |
2323 | { | 2535 | { |
@@ -2333,13 +2545,22 @@ void __init numa_policy_init(void) | |||
2333 | sizeof(struct sp_node), | 2545 | sizeof(struct sp_node), |
2334 | 0, SLAB_PANIC, NULL); | 2546 | 0, SLAB_PANIC, NULL); |
2335 | 2547 | ||
2548 | for_each_node(nid) { | ||
2549 | preferred_node_policy[nid] = (struct mempolicy) { | ||
2550 | .refcnt = ATOMIC_INIT(1), | ||
2551 | .mode = MPOL_PREFERRED, | ||
2552 | .flags = MPOL_F_MOF | MPOL_F_MORON, | ||
2553 | .v = { .preferred_node = nid, }, | ||
2554 | }; | ||
2555 | } | ||
2556 | |||
2336 | /* | 2557 | /* |
2337 | * Set interleaving policy for system init. Interleaving is only | 2558 | * Set interleaving policy for system init. Interleaving is only |
2338 | * enabled across suitably sized nodes (default is >= 16MB), or | 2559 | * enabled across suitably sized nodes (default is >= 16MB), or |
2339 | * fall back to the largest node if they're all smaller. | 2560 | * fall back to the largest node if they're all smaller. |
2340 | */ | 2561 | */ |
2341 | nodes_clear(interleave_nodes); | 2562 | nodes_clear(interleave_nodes); |
2342 | for_each_node_state(nid, N_HIGH_MEMORY) { | 2563 | for_each_node_state(nid, N_MEMORY) { |
2343 | unsigned long total_pages = node_present_pages(nid); | 2564 | unsigned long total_pages = node_present_pages(nid); |
2344 | 2565 | ||
2345 | /* Preserve the largest node */ | 2566 | /* Preserve the largest node */ |
@@ -2359,6 +2580,8 @@ void __init numa_policy_init(void) | |||
2359 | 2580 | ||
2360 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) | 2581 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
2361 | printk("numa_policy_init: interleaving failed\n"); | 2582 | printk("numa_policy_init: interleaving failed\n"); |
2583 | |||
2584 | check_numabalancing_enable(); | ||
2362 | } | 2585 | } |
2363 | 2586 | ||
2364 | /* Reset policy of current process to default */ | 2587 | /* Reset policy of current process to default */ |
@@ -2375,14 +2598,13 @@ void numa_default_policy(void) | |||
2375 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag | 2598 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag |
2376 | * Used only for mpol_parse_str() and mpol_to_str() | 2599 | * Used only for mpol_parse_str() and mpol_to_str() |
2377 | */ | 2600 | */ |
2378 | #define MPOL_LOCAL MPOL_MAX | ||
2379 | static const char * const policy_modes[] = | 2601 | static const char * const policy_modes[] = |
2380 | { | 2602 | { |
2381 | [MPOL_DEFAULT] = "default", | 2603 | [MPOL_DEFAULT] = "default", |
2382 | [MPOL_PREFERRED] = "prefer", | 2604 | [MPOL_PREFERRED] = "prefer", |
2383 | [MPOL_BIND] = "bind", | 2605 | [MPOL_BIND] = "bind", |
2384 | [MPOL_INTERLEAVE] = "interleave", | 2606 | [MPOL_INTERLEAVE] = "interleave", |
2385 | [MPOL_LOCAL] = "local" | 2607 | [MPOL_LOCAL] = "local", |
2386 | }; | 2608 | }; |
2387 | 2609 | ||
2388 | 2610 | ||
@@ -2420,7 +2642,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2420 | *nodelist++ = '\0'; | 2642 | *nodelist++ = '\0'; |
2421 | if (nodelist_parse(nodelist, nodes)) | 2643 | if (nodelist_parse(nodelist, nodes)) |
2422 | goto out; | 2644 | goto out; |
2423 | if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) | 2645 | if (!nodes_subset(nodes, node_states[N_MEMORY])) |
2424 | goto out; | 2646 | goto out; |
2425 | } else | 2647 | } else |
2426 | nodes_clear(nodes); | 2648 | nodes_clear(nodes); |
@@ -2428,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2428 | if (flags) | 2650 | if (flags) |
2429 | *flags++ = '\0'; /* terminate mode string */ | 2651 | *flags++ = '\0'; /* terminate mode string */ |
2430 | 2652 | ||
2431 | for (mode = 0; mode <= MPOL_LOCAL; mode++) { | 2653 | for (mode = 0; mode < MPOL_MAX; mode++) { |
2432 | if (!strcmp(str, policy_modes[mode])) { | 2654 | if (!strcmp(str, policy_modes[mode])) { |
2433 | break; | 2655 | break; |
2434 | } | 2656 | } |
2435 | } | 2657 | } |
2436 | if (mode > MPOL_LOCAL) | 2658 | if (mode >= MPOL_MAX) |
2437 | goto out; | 2659 | goto out; |
2438 | 2660 | ||
2439 | switch (mode) { | 2661 | switch (mode) { |
@@ -2454,7 +2676,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2454 | * Default to online nodes with memory if no nodelist | 2676 | * Default to online nodes with memory if no nodelist |
2455 | */ | 2677 | */ |
2456 | if (!nodelist) | 2678 | if (!nodelist) |
2457 | nodes = node_states[N_HIGH_MEMORY]; | 2679 | nodes = node_states[N_MEMORY]; |
2458 | break; | 2680 | break; |
2459 | case MPOL_LOCAL: | 2681 | case MPOL_LOCAL: |
2460 | /* | 2682 | /* |
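With MPOL_LOCAL now a real entry in policy_modes[], the mode lookup above reduces to a plain table scan bounded by MPOL_MAX. A minimal userspace-style sketch of the same lookup, assuming the enum ordering mirrors the kernel uapi rather than including it:

#include <string.h>

enum { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND,
       MPOL_INTERLEAVE, MPOL_LOCAL, MPOL_MAX };

static const char * const policy_modes[] = {
        "default", "prefer", "bind", "interleave", "local",
};

/* Return the parsed mode, or -1 if str names no known policy. */
static int parse_mode(const char *str)
{
        int mode;

        for (mode = 0; mode < MPOL_MAX; mode++)
                if (!strcmp(str, policy_modes[mode]))
                        return mode;
        return -1;
}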
diff --git a/mm/migrate.c b/mm/migrate.c index 77ed2d773705..3b676b0c5c3e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -35,9 +35,13 @@ | |||
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | 36 | #include <linux/hugetlb_cgroup.h> |
37 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
38 | #include <linux/balloon_compaction.h> | ||
38 | 39 | ||
39 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
40 | 41 | ||
42 | #define CREATE_TRACE_POINTS | ||
43 | #include <trace/events/migrate.h> | ||
44 | |||
41 | #include "internal.h" | 45 | #include "internal.h" |
42 | 46 | ||
43 | /* | 47 | /* |
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l) | |||
79 | list_del(&page->lru); | 83 | list_del(&page->lru); |
80 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 84 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
81 | page_is_file_cache(page)); | 85 | page_is_file_cache(page)); |
82 | putback_lru_page(page); | 86 | putback_lru_page(page); |
87 | } | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * Put previously isolated pages back onto the appropriate lists | ||
92 | * from where they were once taken off for compaction/migration. | ||
93 | * | ||
94 | * This function shall be used instead of putback_lru_pages(), | ||
95 | * whenever the isolated pageset has been built by isolate_migratepages_range() | ||
96 | */ | ||
97 | void putback_movable_pages(struct list_head *l) | ||
98 | { | ||
99 | struct page *page; | ||
100 | struct page *page2; | ||
101 | |||
102 | list_for_each_entry_safe(page, page2, l, lru) { | ||
103 | list_del(&page->lru); | ||
104 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
105 | page_is_file_cache(page)); | ||
106 | if (unlikely(balloon_page_movable(page))) | ||
107 | balloon_page_putback(page); | ||
108 | else | ||
109 | putback_lru_page(page); | ||
83 | } | 110 | } |
84 | } | 111 | } |
85 | 112 | ||
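putback_movable_pages() is meant to drain pagesets built by isolate_migratepages_range(), which may contain balloon pages. A hedged sketch of the intended caller pairing; the wrapper name and the MR_SYSCALL reason are illustrative, while the signatures follow those visible in this diff:

#include <linux/migrate.h>

/* Illustrative pairing only (not from this patch): leftovers from a
 * failed or partial migration go back through putback_movable_pages()
 * so that balloon pages are returned to their balloon device list. */
static void migrate_or_putback(struct list_head *pages,
                               new_page_t get_new_page, unsigned long private)
{
        int nr_failed;

        nr_failed = migrate_pages(pages, get_new_page, private,
                                  false, MIGRATE_SYNC, MR_SYSCALL);
        if (nr_failed)
                putback_movable_pages(pages);
}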
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
91 | { | 118 | { |
92 | struct mm_struct *mm = vma->vm_mm; | 119 | struct mm_struct *mm = vma->vm_mm; |
93 | swp_entry_t entry; | 120 | swp_entry_t entry; |
94 | pgd_t *pgd; | ||
95 | pud_t *pud; | ||
96 | pmd_t *pmd; | 121 | pmd_t *pmd; |
97 | pte_t *ptep, pte; | 122 | pte_t *ptep, pte; |
98 | spinlock_t *ptl; | 123 | spinlock_t *ptl; |
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
103 | goto out; | 128 | goto out; |
104 | ptl = &mm->page_table_lock; | 129 | ptl = &mm->page_table_lock; |
105 | } else { | 130 | } else { |
106 | pgd = pgd_offset(mm, addr); | 131 | pmd = mm_find_pmd(mm, addr); |
107 | if (!pgd_present(*pgd)) | 132 | if (!pmd) |
108 | goto out; | ||
109 | |||
110 | pud = pud_offset(pgd, addr); | ||
111 | if (!pud_present(*pud)) | ||
112 | goto out; | 133 | goto out; |
113 | |||
114 | pmd = pmd_offset(pud, addr); | ||
115 | if (pmd_trans_huge(*pmd)) | 134 | if (pmd_trans_huge(*pmd)) |
116 | goto out; | 135 | goto out; |
117 | if (!pmd_present(*pmd)) | ||
118 | goto out; | ||
119 | 136 | ||
120 | ptep = pte_offset_map(pmd, addr); | 137 | ptep = pte_offset_map(pmd, addr); |
121 | 138 | ||
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
279 | struct page *newpage, struct page *page, | 296 | struct page *newpage, struct page *page, |
280 | struct buffer_head *head, enum migrate_mode mode) | 297 | struct buffer_head *head, enum migrate_mode mode) |
281 | { | 298 | { |
282 | int expected_count; | 299 | int expected_count = 0; |
283 | void **pslot; | 300 | void **pslot; |
284 | 301 | ||
285 | if (!mapping) { | 302 | if (!mapping) { |
286 | /* Anonymous page without mapping */ | 303 | /* Anonymous page without mapping */ |
287 | if (page_count(page) != 1) | 304 | if (page_count(page) != 1) |
288 | return -EAGAIN; | 305 | return -EAGAIN; |
289 | return 0; | 306 | return MIGRATEPAGE_SUCCESS; |
290 | } | 307 | } |
291 | 308 | ||
292 | spin_lock_irq(&mapping->tree_lock); | 309 | spin_lock_irq(&mapping->tree_lock); |
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
356 | } | 373 | } |
357 | spin_unlock_irq(&mapping->tree_lock); | 374 | spin_unlock_irq(&mapping->tree_lock); |
358 | 375 | ||
359 | return 0; | 376 | return MIGRATEPAGE_SUCCESS; |
360 | } | 377 | } |
361 | 378 | ||
362 | /* | 379 | /* |
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
372 | if (!mapping) { | 389 | if (!mapping) { |
373 | if (page_count(page) != 1) | 390 | if (page_count(page) != 1) |
374 | return -EAGAIN; | 391 | return -EAGAIN; |
375 | return 0; | 392 | return MIGRATEPAGE_SUCCESS; |
376 | } | 393 | } |
377 | 394 | ||
378 | spin_lock_irq(&mapping->tree_lock); | 395 | spin_lock_irq(&mapping->tree_lock); |
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
399 | page_unfreeze_refs(page, expected_count - 1); | 416 | page_unfreeze_refs(page, expected_count - 1); |
400 | 417 | ||
401 | spin_unlock_irq(&mapping->tree_lock); | 418 | spin_unlock_irq(&mapping->tree_lock); |
402 | return 0; | 419 | return MIGRATEPAGE_SUCCESS; |
403 | } | 420 | } |
404 | 421 | ||
405 | /* | 422 | /* |
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
407 | */ | 424 | */ |
408 | void migrate_page_copy(struct page *newpage, struct page *page) | 425 | void migrate_page_copy(struct page *newpage, struct page *page) |
409 | { | 426 | { |
410 | if (PageHuge(page)) | 427 | if (PageHuge(page) || PageTransHuge(page)) |
411 | copy_huge_page(newpage, page); | 428 | copy_huge_page(newpage, page); |
412 | else | 429 | else |
413 | copy_highpage(newpage, page); | 430 | copy_highpage(newpage, page); |
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping, | |||
486 | 503 | ||
487 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); | 504 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); |
488 | 505 | ||
489 | if (rc) | 506 | if (rc != MIGRATEPAGE_SUCCESS) |
490 | return rc; | 507 | return rc; |
491 | 508 | ||
492 | migrate_page_copy(newpage, page); | 509 | migrate_page_copy(newpage, page); |
493 | return 0; | 510 | return MIGRATEPAGE_SUCCESS; |
494 | } | 511 | } |
495 | EXPORT_SYMBOL(migrate_page); | 512 | EXPORT_SYMBOL(migrate_page); |
496 | 513 | ||
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping, | |||
513 | 530 | ||
514 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); | 531 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); |
515 | 532 | ||
516 | if (rc) | 533 | if (rc != MIGRATEPAGE_SUCCESS) |
517 | return rc; | 534 | return rc; |
518 | 535 | ||
519 | /* | 536 | /* |
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping, | |||
549 | 566 | ||
550 | } while (bh != head); | 567 | } while (bh != head); |
551 | 568 | ||
552 | return 0; | 569 | return MIGRATEPAGE_SUCCESS; |
553 | } | 570 | } |
554 | EXPORT_SYMBOL(buffer_migrate_page); | 571 | EXPORT_SYMBOL(buffer_migrate_page); |
555 | #endif | 572 | #endif |
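Return values now use the MIGRATEPAGE_SUCCESS constant rather than a bare 0, so migration callbacks and their callers compare against it explicitly. A hypothetical ->migratepage method, not part of this patch, showing the convention:

#include <linux/fs.h>
#include <linux/migrate.h>

/* Hypothetical example: reuse the generic helper and propagate its
 * status; callers such as move_to_new_page() test the result against
 * MIGRATEPAGE_SUCCESS instead of zero. */
static int example_migratepage(struct address_space *mapping,
                               struct page *newpage, struct page *page,
                               enum migrate_mode mode)
{
        int rc;

        rc = migrate_page(mapping, newpage, page, mode);
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;              /* -EAGAIN or another error code */

        /* driver/fs specific post-copy fixups would go here */
        return MIGRATEPAGE_SUCCESS;
}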
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
628 | * | 645 | * |
629 | * Return value: | 646 | * Return value: |
630 | * < 0 - error code | 647 | * < 0 - error code |
631 | * == 0 - success | 648 | * MIGRATEPAGE_SUCCESS - success |
632 | */ | 649 | */ |
633 | static int move_to_new_page(struct page *newpage, struct page *page, | 650 | static int move_to_new_page(struct page *newpage, struct page *page, |
634 | int remap_swapcache, enum migrate_mode mode) | 651 | int remap_swapcache, enum migrate_mode mode) |
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
665 | else | 682 | else |
666 | rc = fallback_migrate_page(mapping, newpage, page, mode); | 683 | rc = fallback_migrate_page(mapping, newpage, page, mode); |
667 | 684 | ||
668 | if (rc) { | 685 | if (rc != MIGRATEPAGE_SUCCESS) { |
669 | newpage->mapping = NULL; | 686 | newpage->mapping = NULL; |
670 | } else { | 687 | } else { |
671 | if (remap_swapcache) | 688 | if (remap_swapcache) |
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
751 | */ | 768 | */ |
752 | if (PageAnon(page)) { | 769 | if (PageAnon(page)) { |
753 | /* | 770 | /* |
754 | * Only page_lock_anon_vma() understands the subtleties of | 771 | * Only page_lock_anon_vma_read() understands the subtleties of |
755 | * getting a hold on an anon_vma from outside one of its mms. | 772 | * getting a hold on an anon_vma from outside one of its mms. |
756 | */ | 773 | */ |
757 | anon_vma = page_get_anon_vma(page); | 774 | anon_vma = page_get_anon_vma(page); |
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
778 | } | 795 | } |
779 | } | 796 | } |
780 | 797 | ||
798 | if (unlikely(balloon_page_movable(page))) { | ||
799 | /* | ||
800 | * A ballooned page does not need any special attention from | ||
801 | * physical to virtual reverse mapping procedures. | ||
802 | * Skip any attempt to unmap PTEs or to remap swap cache, | ||
803 | * in order to avoid burning cycles at rmap level, and perform | ||
804 | * the page migration right away (protected by page lock). | ||
805 | */ | ||
806 | rc = balloon_page_migrate(newpage, page, mode); | ||
807 | goto uncharge; | ||
808 | } | ||
809 | |||
781 | /* | 810 | /* |
782 | * Corner case handling: | 811 | * Corner case handling: |
783 | * 1. When a new swap-cache page is read into, it is added to the LRU | 812 | * 1. When a new swap-cache page is read into, it is added to the LRU |
@@ -814,7 +843,9 @@ skip_unmap: | |||
814 | put_anon_vma(anon_vma); | 843 | put_anon_vma(anon_vma); |
815 | 844 | ||
816 | uncharge: | 845 | uncharge: |
817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | 846 | mem_cgroup_end_migration(mem, page, newpage, |
847 | (rc == MIGRATEPAGE_SUCCESS || | ||
848 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); | ||
818 | unlock: | 849 | unlock: |
819 | unlock_page(page); | 850 | unlock_page(page); |
820 | out: | 851 | out: |
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
846 | goto out; | 877 | goto out; |
847 | 878 | ||
848 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | 879 | rc = __unmap_and_move(page, newpage, force, offlining, mode); |
880 | |||
881 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | ||
882 | /* | ||
883 | * A ballooned page has been migrated already. | ||
884 | * Now it is time to wrap up the counters, | ||
885 | * hand the page back to the buddy allocator and return. | ||
886 | */ | ||
887 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
888 | page_is_file_cache(page)); | ||
889 | balloon_page_free(page); | ||
890 | return MIGRATEPAGE_SUCCESS; | ||
891 | } | ||
849 | out: | 892 | out: |
850 | if (rc != -EAGAIN) { | 893 | if (rc != -EAGAIN) { |
851 | /* | 894 | /* |
@@ -958,10 +1001,11 @@ out: | |||
958 | */ | 1001 | */ |
959 | int migrate_pages(struct list_head *from, | 1002 | int migrate_pages(struct list_head *from, |
960 | new_page_t get_new_page, unsigned long private, bool offlining, | 1003 | new_page_t get_new_page, unsigned long private, bool offlining, |
961 | enum migrate_mode mode) | 1004 | enum migrate_mode mode, int reason) |
962 | { | 1005 | { |
963 | int retry = 1; | 1006 | int retry = 1; |
964 | int nr_failed = 0; | 1007 | int nr_failed = 0; |
1008 | int nr_succeeded = 0; | ||
965 | int pass = 0; | 1009 | int pass = 0; |
966 | struct page *page; | 1010 | struct page *page; |
967 | struct page *page2; | 1011 | struct page *page2; |
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from, | |||
987 | case -EAGAIN: | 1031 | case -EAGAIN: |
988 | retry++; | 1032 | retry++; |
989 | break; | 1033 | break; |
990 | case 0: | 1034 | case MIGRATEPAGE_SUCCESS: |
1035 | nr_succeeded++; | ||
991 | break; | 1036 | break; |
992 | default: | 1037 | default: |
993 | /* Permanent failure */ | 1038 | /* Permanent failure */ |
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from, | |||
996 | } | 1041 | } |
997 | } | 1042 | } |
998 | } | 1043 | } |
999 | rc = 0; | 1044 | rc = nr_failed + retry; |
1000 | out: | 1045 | out: |
1046 | if (nr_succeeded) | ||
1047 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); | ||
1048 | if (nr_failed) | ||
1049 | count_vm_events(PGMIGRATE_FAIL, nr_failed); | ||
1050 | trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); | ||
1051 | |||
1001 | if (!swapwrite) | 1052 | if (!swapwrite) |
1002 | current->flags &= ~PF_SWAPWRITE; | 1053 | current->flags &= ~PF_SWAPWRITE; |
1003 | 1054 | ||
1004 | if (rc) | 1055 | return rc; |
1005 | return rc; | ||
1006 | |||
1007 | return nr_failed + retry; | ||
1008 | } | 1056 | } |
1009 | 1057 | ||
1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | 1058 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | |||
1024 | /* try again */ | 1072 | /* try again */ |
1025 | cond_resched(); | 1073 | cond_resched(); |
1026 | break; | 1074 | break; |
1027 | case 0: | 1075 | case MIGRATEPAGE_SUCCESS: |
1028 | goto out; | 1076 | goto out; |
1029 | default: | 1077 | default: |
1030 | rc = -EIO; | 1078 | rc = -EIO; |
@@ -1139,7 +1187,8 @@ set_status: | |||
1139 | err = 0; | 1187 | err = 0; |
1140 | if (!list_empty(&pagelist)) { | 1188 | if (!list_empty(&pagelist)) { |
1141 | err = migrate_pages(&pagelist, new_page_node, | 1189 | err = migrate_pages(&pagelist, new_page_node, |
1142 | (unsigned long)pm, 0, MIGRATE_SYNC); | 1190 | (unsigned long)pm, 0, MIGRATE_SYNC, |
1191 | MR_SYSCALL); | ||
1143 | if (err) | 1192 | if (err) |
1144 | putback_lru_pages(&pagelist); | 1193 | putback_lru_pages(&pagelist); |
1145 | } | 1194 | } |
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, | |||
1201 | if (node < 0 || node >= MAX_NUMNODES) | 1250 | if (node < 0 || node >= MAX_NUMNODES) |
1202 | goto out_pm; | 1251 | goto out_pm; |
1203 | 1252 | ||
1204 | if (!node_state(node, N_HIGH_MEMORY)) | 1253 | if (!node_state(node, N_MEMORY)) |
1205 | goto out_pm; | 1254 | goto out_pm; |
1206 | 1255 | ||
1207 | err = -EACCES; | 1256 | err = -EACCES; |
@@ -1403,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1403 | } | 1452 | } |
1404 | return err; | 1453 | return err; |
1405 | } | 1454 | } |
1406 | #endif | 1455 | |
1456 | #ifdef CONFIG_NUMA_BALANCING | ||
1457 | /* | ||
1458 | * Returns true if this is a safe migration target node for misplaced NUMA | ||
1459 | * pages. Currently it only checks the watermarks, which is crude. | ||
1460 | */ | ||
1461 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | ||
1462 | int nr_migrate_pages) | ||
1463 | { | ||
1464 | int z; | ||
1465 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | ||
1466 | struct zone *zone = pgdat->node_zones + z; | ||
1467 | |||
1468 | if (!populated_zone(zone)) | ||
1469 | continue; | ||
1470 | |||
1471 | if (zone->all_unreclaimable) | ||
1472 | continue; | ||
1473 | |||
1474 | /* Avoid waking kswapd by allocating pages_to_migrate pages. */ | ||
1475 | if (!zone_watermark_ok(zone, 0, | ||
1476 | high_wmark_pages(zone) + | ||
1477 | nr_migrate_pages, | ||
1478 | 0, 0)) | ||
1479 | continue; | ||
1480 | return true; | ||
1481 | } | ||
1482 | return false; | ||
1483 | } | ||
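A simplified reading of the acceptance test above, with made-up numbers; the real zone_watermark_ok() also accounts for lowmem reserves and per-order free lists:

/* Toy version of the check: with a high watermark of 12800 pages,
 * 12900 free pages and 128 pages to migrate, 12900 <= 12800 + 128,
 * so this zone would be skipped as a NUMA-balancing target. */
static int toy_zone_can_take(unsigned long free_pages,
                             unsigned long high_wmark,
                             unsigned long nr_migrate_pages)
{
        return free_pages > high_wmark + nr_migrate_pages;
}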
1484 | |||
1485 | static struct page *alloc_misplaced_dst_page(struct page *page, | ||
1486 | unsigned long data, | ||
1487 | int **result) | ||
1488 | { | ||
1489 | int nid = (int) data; | ||
1490 | struct page *newpage; | ||
1491 | |||
1492 | newpage = alloc_pages_exact_node(nid, | ||
1493 | (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | | ||
1494 | __GFP_NOMEMALLOC | __GFP_NORETRY | | ||
1495 | __GFP_NOWARN) & | ||
1496 | ~GFP_IOFS, 0); | ||
1497 | if (newpage) | ||
1498 | page_xchg_last_nid(newpage, page_last_nid(page)); | ||
1499 | |||
1500 | return newpage; | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * page migration rate limiting control. | ||
1505 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs | ||
1506 | * window of time. Default here says do not migrate more than 1280M per second. | ||
1507 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However | ||
1508 | * as it is faults that reset the window, pte updates will happen unconditionally | ||
1509 | * if there has not been a fault since @pteupdate_interval_millisecs after the | ||
1510 | * throttle window closed. | ||
1511 | */ | ||
1512 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | ||
1513 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
1514 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | ||
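A standalone sanity check of the documented default, assuming 4 KB pages (PAGE_SHIFT == 12): 128 MB per 100 ms window works out to the 1280 MB/s quoted in the comment above:

#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;                   /* assumed 4 KB pages */
        unsigned long pages = 128UL << (20 - page_shift);
        unsigned long mb_per_window = pages >> (20 - page_shift);
        unsigned long window_ms = 100;

        /* prints: 32768 pages -> 128 MB per window -> 1280 MB/s */
        printf("%lu pages -> %lu MB per window -> %lu MB/s\n",
               pages, mb_per_window, mb_per_window * (1000 / window_ms));
        return 0;
}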
1515 | |||
1516 | /* Returns true if NUMA migration is currently rate limited */ | ||
1517 | bool migrate_ratelimited(int node) | ||
1518 | { | ||
1519 | pg_data_t *pgdat = NODE_DATA(node); | ||
1520 | |||
1521 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
1522 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
1523 | return false; | ||
1524 | |||
1525 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
1526 | return false; | ||
1527 | |||
1528 | return true; | ||
1529 | } | ||
1530 | |||
1531 | /* Returns true if the node is migrate rate-limited after the update */ | ||
1532 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | ||
1533 | { | ||
1534 | bool rate_limited = false; | ||
1535 | |||
1536 | /* | ||
1537 | * Rate-limit the amount of data that is being migrated to a node. | ||
1538 | * Optimal placement is no good if the memory bus is saturated and | ||
1539 | * all the time is being spent migrating! | ||
1540 | */ | ||
1541 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1542 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | ||
1543 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
1544 | pgdat->numabalancing_migrate_next_window = jiffies + | ||
1545 | msecs_to_jiffies(migrate_interval_millisecs); | ||
1546 | } | ||
1547 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | ||
1548 | rate_limited = true; | ||
1549 | else | ||
1550 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1551 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1552 | |||
1553 | return rate_limited; | ||
1554 | } | ||
1555 | |||
1556 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | ||
1557 | { | ||
1558 | int ret = 0; | ||
1559 | |||
1560 | /* Avoid migrating to a node that is nearly full */ | ||
1561 | if (migrate_balanced_pgdat(pgdat, 1)) { | ||
1562 | int page_lru; | ||
1563 | |||
1564 | if (isolate_lru_page(page)) { | ||
1565 | put_page(page); | ||
1566 | return 0; | ||
1567 | } | ||
1568 | |||
1569 | /* Page is isolated */ | ||
1570 | ret = 1; | ||
1571 | page_lru = page_is_file_cache(page); | ||
1572 | if (!PageTransHuge(page)) | ||
1573 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | ||
1574 | else | ||
1575 | mod_zone_page_state(page_zone(page), | ||
1576 | NR_ISOLATED_ANON + page_lru, | ||
1577 | HPAGE_PMD_NR); | ||
1578 | } | ||
1579 | |||
1580 | /* | ||
1581 | * Page is either isolated or there is not enough space on the target | ||
1582 | * node. If isolated, then it has taken a reference count and the | ||
1583 | * caller's reference can be safely dropped without the page | ||
1584 | * disappearing underneath us during migration. Otherwise the page is | ||
1585 | * not to be migrated but the caller's reference should still be | ||
1586 | * dropped so it does not leak. | ||
1587 | */ | ||
1588 | put_page(page); | ||
1589 | |||
1590 | return ret; | ||
1591 | } | ||
1592 | |||
1593 | /* | ||
1594 | * Attempt to migrate a misplaced page to the specified destination | ||
1595 | * node. Caller is expected to have an elevated reference count on | ||
1596 | * the page that will be dropped by this function before returning. | ||
1597 | */ | ||
1598 | int migrate_misplaced_page(struct page *page, int node) | ||
1599 | { | ||
1600 | pg_data_t *pgdat = NODE_DATA(node); | ||
1601 | int isolated = 0; | ||
1602 | int nr_remaining; | ||
1603 | LIST_HEAD(migratepages); | ||
1604 | |||
1605 | /* | ||
1606 | * Don't migrate pages that are mapped in multiple processes. | ||
1607 | * TODO: Handle false sharing detection instead of this hammer | ||
1608 | */ | ||
1609 | if (page_mapcount(page) != 1) { | ||
1610 | put_page(page); | ||
1611 | goto out; | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * Rate-limit the amount of data that is being migrated to a node. | ||
1616 | * Optimal placement is no good if the memory bus is saturated and | ||
1617 | * all the time is being spent migrating! | ||
1618 | */ | ||
1619 | if (numamigrate_update_ratelimit(pgdat, 1)) { | ||
1620 | put_page(page); | ||
1621 | goto out; | ||
1622 | } | ||
1623 | |||
1624 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1625 | if (!isolated) | ||
1626 | goto out; | ||
1627 | |||
1628 | list_add(&page->lru, &migratepages); | ||
1629 | nr_remaining = migrate_pages(&migratepages, | ||
1630 | alloc_misplaced_dst_page, | ||
1631 | node, false, MIGRATE_ASYNC, | ||
1632 | MR_NUMA_MISPLACED); | ||
1633 | if (nr_remaining) { | ||
1634 | putback_lru_pages(&migratepages); | ||
1635 | isolated = 0; | ||
1636 | } else | ||
1637 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | ||
1638 | BUG_ON(!list_empty(&migratepages)); | ||
1639 | out: | ||
1640 | return isolated; | ||
1641 | } | ||
1642 | #endif /* CONFIG_NUMA_BALANCING */ | ||
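A hedged sketch of a hinting-fault caller; the real users are the do_numa_page()-style fault paths added elsewhere in this series. It highlights the reference contract documented above: the caller takes a reference and migrate_misplaced_page() always drops it, migrated or not:

#include <linux/migrate.h>
#include <linux/mm.h>

/* Simplified, illustrative caller only. */
static bool try_numa_migrate(struct page *page, int target_nid)
{
        if (target_nid == page_to_nid(page))
                return false;           /* already on the preferred node */

        get_page(page);                 /* reference consumed by the callee */
        return migrate_misplaced_page(page, target_nid) != 0;
}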
1643 | |||
1644 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
1645 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
1646 | struct vm_area_struct *vma, | ||
1647 | pmd_t *pmd, pmd_t entry, | ||
1648 | unsigned long address, | ||
1649 | struct page *page, int node) | ||
1650 | { | ||
1651 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
1652 | pg_data_t *pgdat = NODE_DATA(node); | ||
1653 | int isolated = 0; | ||
1654 | struct page *new_page = NULL; | ||
1655 | struct mem_cgroup *memcg = NULL; | ||
1656 | int page_lru = page_is_file_cache(page); | ||
1657 | |||
1658 | /* | ||
1659 | * Don't migrate pages that are mapped in multiple processes. | ||
1660 | * TODO: Handle false sharing detection instead of this hammer | ||
1661 | */ | ||
1662 | if (page_mapcount(page) != 1) | ||
1663 | goto out_dropref; | ||
1664 | |||
1665 | /* | ||
1666 | * Rate-limit the amount of data that is being migrated to a node. | ||
1667 | * Optimal placement is no good if the memory bus is saturated and | ||
1668 | * all the time is being spent migrating! | ||
1669 | */ | ||
1670 | if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) | ||
1671 | goto out_dropref; | ||
1672 | |||
1673 | new_page = alloc_pages_node(node, | ||
1674 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | ||
1675 | if (!new_page) { | ||
1676 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1677 | goto out_dropref; | ||
1678 | } | ||
1679 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1680 | |||
1681 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1682 | if (!isolated) { | ||
1683 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1684 | put_page(new_page); | ||
1685 | goto out_keep_locked; | ||
1686 | } | ||
1687 | |||
1688 | /* Prepare a page as a migration target */ | ||
1689 | __set_page_locked(new_page); | ||
1690 | SetPageSwapBacked(new_page); | ||
1691 | |||
1692 | /* anon mapping, we can simply copy page->mapping to the new page: */ | ||
1693 | new_page->mapping = page->mapping; | ||
1694 | new_page->index = page->index; | ||
1695 | migrate_page_copy(new_page, page); | ||
1696 | WARN_ON(PageLRU(new_page)); | ||
1697 | |||
1698 | /* Recheck the target PMD */ | ||
1699 | spin_lock(&mm->page_table_lock); | ||
1700 | if (unlikely(!pmd_same(*pmd, entry))) { | ||
1701 | spin_unlock(&mm->page_table_lock); | ||
1702 | |||
1703 | /* Reverse changes made by migrate_page_copy() */ | ||
1704 | if (TestClearPageActive(new_page)) | ||
1705 | SetPageActive(page); | ||
1706 | if (TestClearPageUnevictable(new_page)) | ||
1707 | SetPageUnevictable(page); | ||
1708 | mlock_migrate_page(page, new_page); | ||
1709 | |||
1710 | unlock_page(new_page); | ||
1711 | put_page(new_page); /* Free it */ | ||
1712 | |||
1713 | unlock_page(page); | ||
1714 | putback_lru_page(page); | ||
1715 | |||
1716 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1717 | goto out; | ||
1718 | } | ||
1719 | |||
1720 | /* | ||
1721 | * Traditional migration needs to prepare the memcg charge | ||
1722 | * transaction early to prevent the old page from being | ||
1723 | * uncharged when installing migration entries. Here we can | ||
1724 | * save the potential rollback and start the charge transfer | ||
1725 | * only when migration is already known to end successfully. | ||
1726 | */ | ||
1727 | mem_cgroup_prepare_migration(page, new_page, &memcg); | ||
1728 | |||
1729 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
1730 | entry = pmd_mknonnuma(entry); | ||
1731 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1732 | entry = pmd_mkhuge(entry); | ||
1733 | |||
1734 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1735 | |||
1736 | set_pmd_at(mm, haddr, pmd, entry); | ||
1737 | update_mmu_cache_pmd(vma, address, &entry); | ||
1738 | page_remove_rmap(page); | ||
1739 | /* | ||
1740 | * Finish the charge transaction under the page table lock to | ||
1741 | * prevent split_huge_page() from dividing up the charge | ||
1742 | * before it's fully transferred to the new page. | ||
1743 | */ | ||
1744 | mem_cgroup_end_migration(memcg, page, new_page, true); | ||
1745 | spin_unlock(&mm->page_table_lock); | ||
1746 | |||
1747 | unlock_page(new_page); | ||
1748 | unlock_page(page); | ||
1749 | put_page(page); /* Drop the rmap reference */ | ||
1750 | put_page(page); /* Drop the LRU isolation reference */ | ||
1751 | |||
1752 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | ||
1753 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | ||
1754 | |||
1755 | out: | ||
1756 | mod_zone_page_state(page_zone(page), | ||
1757 | NR_ISOLATED_ANON + page_lru, | ||
1758 | -HPAGE_PMD_NR); | ||
1759 | return isolated; | ||
1760 | |||
1761 | out_dropref: | ||
1762 | put_page(page); | ||
1763 | out_keep_locked: | ||
1764 | return 0; | ||
1765 | } | ||
1766 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1767 | |||
1768 | #endif /* CONFIG_NUMA */ | ||
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/audit.h> | 31 | #include <linux/audit.h> |
32 | #include <linux/khugepaged.h> | 32 | #include <linux/khugepaged.h> |
33 | #include <linux/uprobes.h> | 33 | #include <linux/uprobes.h> |
34 | #include <linux/rbtree_augmented.h> | ||
34 | 35 | ||
35 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
36 | #include <asm/cacheflush.h> | 37 | #include <asm/cacheflush.h> |
@@ -89,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | |||
89 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; | 90 | struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; |
90 | 91 | ||
91 | /* | 92 | /* |
93 | * The global memory commitment made in the system can be a metric | ||
94 | * that can be used to drive ballooning decisions when Linux is hosted | ||
95 | * as a guest. On Hyper-V, the host implements a policy engine for dynamically | ||
96 | * balancing memory across competing virtual machines that are hosted. | ||
97 | * Several metrics drive this policy engine including the guest reported | ||
98 | * memory commitment. | ||
99 | */ | ||
100 | unsigned long vm_memory_committed(void) | ||
101 | { | ||
102 | return percpu_counter_read_positive(&vm_committed_as); | ||
103 | } | ||
104 | EXPORT_SYMBOL_GPL(vm_memory_committed); | ||
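A hedged sketch of a consumer, not the actual Hyper-V code; the header providing the declaration is assumed here, and vm_committed_as is maintained in pages:

#include <linux/mm.h>   /* declaration assumed to be reachable from mm headers */

/* Hypothetical balloon-driver helper reporting guest commitment in KiB. */
static unsigned long guest_committed_kb(void)
{
        return vm_memory_committed() << (PAGE_SHIFT - 10);
}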
105 | |||
106 | /* | ||
92 | * Check that a process has enough memory to allocate a new virtual | 107 | * Check that a process has enough memory to allocate a new virtual |
93 | * mapping. 0 means there is enough memory for the allocation to | 108 | * mapping. 0 means there is enough memory for the allocation to |
94 | * succeed and -ENOMEM implies there is not. | 109 | * succeed and -ENOMEM implies there is not. |
@@ -297,40 +312,88 @@ out: | |||
297 | return retval; | 312 | return retval; |
298 | } | 313 | } |
299 | 314 | ||
315 | static long vma_compute_subtree_gap(struct vm_area_struct *vma) | ||
316 | { | ||
317 | unsigned long max, subtree_gap; | ||
318 | max = vma->vm_start; | ||
319 | if (vma->vm_prev) | ||
320 | max -= vma->vm_prev->vm_end; | ||
321 | if (vma->vm_rb.rb_left) { | ||
322 | subtree_gap = rb_entry(vma->vm_rb.rb_left, | ||
323 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | ||
324 | if (subtree_gap > max) | ||
325 | max = subtree_gap; | ||
326 | } | ||
327 | if (vma->vm_rb.rb_right) { | ||
328 | subtree_gap = rb_entry(vma->vm_rb.rb_right, | ||
329 | struct vm_area_struct, vm_rb)->rb_subtree_gap; | ||
330 | if (subtree_gap > max) | ||
331 | max = subtree_gap; | ||
332 | } | ||
333 | return max; | ||
334 | } | ||
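The cached rb_subtree_gap lets a search prune whole subtrees whose largest gap is too small. A standalone illustration of the same recurrence over a toy node type; the names are invented and only the arithmetic mirrors the function above:

struct toy_vma {
        unsigned long vm_start, prev_end;       /* this vma and its predecessor's end */
        unsigned long left_gap, right_gap;      /* children's cached subtree gaps */
};

static unsigned long toy_subtree_gap(const struct toy_vma *v)
{
        unsigned long max = v->vm_start - v->prev_end;  /* gap just below this vma */

        if (v->left_gap > max)
                max = v->left_gap;
        if (v->right_gap > max)
                max = v->right_gap;
        return max;                     /* largest free gap anywhere in the subtree */
}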
335 | |||
300 | #ifdef CONFIG_DEBUG_VM_RB | 336 | #ifdef CONFIG_DEBUG_VM_RB |
301 | static int browse_rb(struct rb_root *root) | 337 | static int browse_rb(struct rb_root *root) |
302 | { | 338 | { |
303 | int i = 0, j; | 339 | int i = 0, j, bug = 0; |
304 | struct rb_node *nd, *pn = NULL; | 340 | struct rb_node *nd, *pn = NULL; |
305 | unsigned long prev = 0, pend = 0; | 341 | unsigned long prev = 0, pend = 0; |
306 | 342 | ||
307 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 343 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
308 | struct vm_area_struct *vma; | 344 | struct vm_area_struct *vma; |
309 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 345 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
310 | if (vma->vm_start < prev) | 346 | if (vma->vm_start < prev) { |
311 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; | 347 | printk("vm_start %lx prev %lx\n", vma->vm_start, prev); |
312 | if (vma->vm_start < pend) | 348 | bug = 1; |
349 | } | ||
350 | if (vma->vm_start < pend) { | ||
313 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); | 351 | printk("vm_start %lx pend %lx\n", vma->vm_start, pend); |
314 | if (vma->vm_start > vma->vm_end) | 352 | bug = 1; |
315 | printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); | 353 | } |
354 | if (vma->vm_start > vma->vm_end) { | ||
355 | printk("vm_end %lx < vm_start %lx\n", | ||
356 | vma->vm_end, vma->vm_start); | ||
357 | bug = 1; | ||
358 | } | ||
359 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { | ||
360 | printk("free gap %lx, correct %lx\n", | ||
361 | vma->rb_subtree_gap, | ||
362 | vma_compute_subtree_gap(vma)); | ||
363 | bug = 1; | ||
364 | } | ||
316 | i++; | 365 | i++; |
317 | pn = nd; | 366 | pn = nd; |
318 | prev = vma->vm_start; | 367 | prev = vma->vm_start; |
319 | pend = vma->vm_end; | 368 | pend = vma->vm_end; |
320 | } | 369 | } |
321 | j = 0; | 370 | j = 0; |
322 | for (nd = pn; nd; nd = rb_prev(nd)) { | 371 | for (nd = pn; nd; nd = rb_prev(nd)) |
323 | j++; | 372 | j++; |
373 | if (i != j) { | ||
374 | printk("backwards %d, forwards %d\n", j, i); | ||
375 | bug = 1; | ||
376 | } | ||
377 | return bug ? -1 : i; | ||
378 | } | ||
379 | |||
380 | static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) | ||
381 | { | ||
382 | struct rb_node *nd; | ||
383 | |||
384 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | ||
385 | struct vm_area_struct *vma; | ||
386 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | ||
387 | BUG_ON(vma != ignore && | ||
388 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); | ||
324 | } | 389 | } |
325 | if (i != j) | ||
326 | printk("backwards %d, forwards %d\n", j, i), i = 0; | ||
327 | return i; | ||
328 | } | 390 | } |
329 | 391 | ||
330 | void validate_mm(struct mm_struct *mm) | 392 | void validate_mm(struct mm_struct *mm) |
331 | { | 393 | { |
332 | int bug = 0; | 394 | int bug = 0; |
333 | int i = 0; | 395 | int i = 0; |
396 | unsigned long highest_address = 0; | ||
334 | struct vm_area_struct *vma = mm->mmap; | 397 | struct vm_area_struct *vma = mm->mmap; |
335 | while (vma) { | 398 | while (vma) { |
336 | struct anon_vma_chain *avc; | 399 | struct anon_vma_chain *avc; |
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm) | |||
338 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 401 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
339 | anon_vma_interval_tree_verify(avc); | 402 | anon_vma_interval_tree_verify(avc); |
340 | vma_unlock_anon_vma(vma); | 403 | vma_unlock_anon_vma(vma); |
404 | highest_address = vma->vm_end; | ||
341 | vma = vma->vm_next; | 405 | vma = vma->vm_next; |
342 | i++; | 406 | i++; |
343 | } | 407 | } |
344 | if (i != mm->map_count) | 408 | if (i != mm->map_count) { |
345 | printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; | 409 | printk("map_count %d vm_next %d\n", mm->map_count, i); |
410 | bug = 1; | ||
411 | } | ||
412 | if (highest_address != mm->highest_vm_end) { | ||
413 | printk("mm->highest_vm_end %lx, found %lx\n", | ||
414 | mm->highest_vm_end, highest_address); | ||
415 | bug = 1; | ||
416 | } | ||
346 | i = browse_rb(&mm->mm_rb); | 417 | i = browse_rb(&mm->mm_rb); |
347 | if (i != mm->map_count) | 418 | if (i != mm->map_count) { |
348 | printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; | 419 | printk("map_count %d rb %d\n", mm->map_count, i); |
420 | bug = 1; | ||
421 | } | ||
349 | BUG_ON(bug); | 422 | BUG_ON(bug); |
350 | } | 423 | } |
351 | #else | 424 | #else |
425 | #define validate_mm_rb(root, ignore) do { } while (0) | ||
352 | #define validate_mm(mm) do { } while (0) | 426 | #define validate_mm(mm) do { } while (0) |
353 | #endif | 427 | #endif |
354 | 428 | ||
429 | RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, | ||
430 | unsigned long, rb_subtree_gap, vma_compute_subtree_gap) | ||
431 | |||
432 | /* | ||
433 | * Update augmented rbtree rb_subtree_gap values after vma->vm_start or | ||
434 | * vma->vm_prev->vm_end values changed, without modifying the vma's position | ||
435 | * in the rbtree. | ||
436 | */ | ||
437 | static void vma_gap_update(struct vm_area_struct *vma) | ||
438 | { | ||
439 | /* | ||
440 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback | ||
441 | * function that does exactly what we want. | ||
442 | */ | ||
443 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); | ||
444 | } | ||
445 | |||
446 | static inline void vma_rb_insert(struct vm_area_struct *vma, | ||
447 | struct rb_root *root) | ||
448 | { | ||
449 | /* All rb_subtree_gap values must be consistent prior to insertion */ | ||
450 | validate_mm_rb(root, NULL); | ||
451 | |||
452 | rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | ||
453 | } | ||
454 | |||
455 | static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) | ||
456 | { | ||
457 | /* | ||
458 | * All rb_subtree_gap values must be consistent prior to erase, | ||
459 | * with the possible exception of the vma being erased. | ||
460 | */ | ||
461 | validate_mm_rb(root, vma); | ||
462 | |||
463 | /* | ||
464 | * Note rb_erase_augmented is a fairly large inline function, | ||
465 | * so make sure we instantiate it only once with our desired | ||
466 | * augmented rbtree callbacks. | ||
467 | */ | ||
468 | rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); | ||
469 | } | ||
470 | |||
355 | /* | 471 | /* |
356 | * vma has some anon_vma assigned, and is already inserted on that | 472 | * vma has some anon_vma assigned, and is already inserted on that |
357 | * anon_vma's interval trees. | 473 | * anon_vma's interval trees. |
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, | |||
421 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 537 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
422 | struct rb_node **rb_link, struct rb_node *rb_parent) | 538 | struct rb_node **rb_link, struct rb_node *rb_parent) |
423 | { | 539 | { |
540 | /* Update tracking information for the gap following the new vma. */ | ||
541 | if (vma->vm_next) | ||
542 | vma_gap_update(vma->vm_next); | ||
543 | else | ||
544 | mm->highest_vm_end = vma->vm_end; | ||
545 | |||
546 | /* | ||
547 | * vma->vm_prev wasn't known when we followed the rbtree to find the | ||
548 | * correct insertion point for that vma. As a result, we could not | ||
549 | * update the rb_subtree_gap values of the vma's rbtree parents on the way down. | ||
550 | * So, we first insert the vma with a zero rb_subtree_gap value | ||
551 | * (to be consistent with what we did on the way down), and then | ||
552 | * immediately update the gap to the correct value. Finally we | ||
553 | * rebalance the rbtree after all augmented values have been set. | ||
554 | */ | ||
424 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); | 555 | rb_link_node(&vma->vm_rb, rb_parent, rb_link); |
425 | rb_insert_color(&vma->vm_rb, &mm->mm_rb); | 556 | vma->rb_subtree_gap = 0; |
557 | vma_gap_update(vma); | ||
558 | vma_rb_insert(vma, &mm->mm_rb); | ||
426 | } | 559 | } |
427 | 560 | ||
428 | static void __vma_link_file(struct vm_area_struct *vma) | 561 | static void __vma_link_file(struct vm_area_struct *vma) |
@@ -498,12 +631,12 @@ static inline void | |||
498 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | 631 | __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, |
499 | struct vm_area_struct *prev) | 632 | struct vm_area_struct *prev) |
500 | { | 633 | { |
501 | struct vm_area_struct *next = vma->vm_next; | 634 | struct vm_area_struct *next; |
502 | 635 | ||
503 | prev->vm_next = next; | 636 | vma_rb_erase(vma, &mm->mm_rb); |
637 | prev->vm_next = next = vma->vm_next; | ||
504 | if (next) | 638 | if (next) |
505 | next->vm_prev = prev; | 639 | next->vm_prev = prev; |
506 | rb_erase(&vma->vm_rb, &mm->mm_rb); | ||
507 | if (mm->mmap_cache == vma) | 640 | if (mm->mmap_cache == vma) |
508 | mm->mmap_cache = prev; | 641 | mm->mmap_cache = prev; |
509 | } | 642 | } |
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
525 | struct rb_root *root = NULL; | 658 | struct rb_root *root = NULL; |
526 | struct anon_vma *anon_vma = NULL; | 659 | struct anon_vma *anon_vma = NULL; |
527 | struct file *file = vma->vm_file; | 660 | struct file *file = vma->vm_file; |
661 | bool start_changed = false, end_changed = false; | ||
528 | long adjust_next = 0; | 662 | long adjust_next = 0; |
529 | int remove_next = 0; | 663 | int remove_next = 0; |
530 | 664 | ||
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
602 | if (anon_vma) { | 736 | if (anon_vma) { |
603 | VM_BUG_ON(adjust_next && next->anon_vma && | 737 | VM_BUG_ON(adjust_next && next->anon_vma && |
604 | anon_vma != next->anon_vma); | 738 | anon_vma != next->anon_vma); |
605 | anon_vma_lock(anon_vma); | 739 | anon_vma_lock_write(anon_vma); |
606 | anon_vma_interval_tree_pre_update_vma(vma); | 740 | anon_vma_interval_tree_pre_update_vma(vma); |
607 | if (adjust_next) | 741 | if (adjust_next) |
608 | anon_vma_interval_tree_pre_update_vma(next); | 742 | anon_vma_interval_tree_pre_update_vma(next); |
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end); | |||
615 | vma_interval_tree_remove(next, root); | 749 | vma_interval_tree_remove(next, root); |
616 | } | 750 | } |
617 | 751 | ||
618 | vma->vm_start = start; | 752 | if (start != vma->vm_start) { |
619 | vma->vm_end = end; | 753 | vma->vm_start = start; |
754 | start_changed = true; | ||
755 | } | ||
756 | if (end != vma->vm_end) { | ||
757 | vma->vm_end = end; | ||
758 | end_changed = true; | ||
759 | } | ||
620 | vma->vm_pgoff = pgoff; | 760 | vma->vm_pgoff = pgoff; |
621 | if (adjust_next) { | 761 | if (adjust_next) { |
622 | next->vm_start += adjust_next << PAGE_SHIFT; | 762 | next->vm_start += adjust_next << PAGE_SHIFT; |
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end); | |||
645 | * (it may either follow vma or precede it). | 785 | * (it may either follow vma or precede it). |
646 | */ | 786 | */ |
647 | __insert_vm_struct(mm, insert); | 787 | __insert_vm_struct(mm, insert); |
788 | } else { | ||
789 | if (start_changed) | ||
790 | vma_gap_update(vma); | ||
791 | if (end_changed) { | ||
792 | if (!next) | ||
793 | mm->highest_vm_end = end; | ||
794 | else if (!adjust_next) | ||
795 | vma_gap_update(next); | ||
796 | } | ||
648 | } | 797 | } |
649 | 798 | ||
650 | if (anon_vma) { | 799 | if (anon_vma) { |
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end); | |||
678 | * we must remove another next too. It would clutter | 827 | * we must remove another next too. It would clutter |
679 | * up the code too much to do both in one go. | 828 | * up the code too much to do both in one go. |
680 | */ | 829 | */ |
681 | if (remove_next == 2) { | 830 | next = vma->vm_next; |
682 | next = vma->vm_next; | 831 | if (remove_next == 2) |
683 | goto again; | 832 | goto again; |
684 | } | 833 | else if (next) |
834 | vma_gap_update(next); | ||
835 | else | ||
836 | mm->highest_vm_end = end; | ||
685 | } | 837 | } |
686 | if (insert && file) | 838 | if (insert && file) |
687 | uprobe_mmap(insert); | 839 | uprobe_mmap(insert); |
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1153 | * memory so no accounting is necessary | 1305 | * memory so no accounting is necessary |
1154 | */ | 1306 | */ |
1155 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, | 1307 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, |
1156 | VM_NORESERVE, &user, | 1308 | VM_NORESERVE, |
1157 | HUGETLB_ANONHUGE_INODE); | 1309 | &user, HUGETLB_ANONHUGE_INODE, |
1310 | (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); | ||
1158 | if (IS_ERR(file)) | 1311 | if (IS_ERR(file)) |
1159 | return PTR_ERR(file); | 1312 | return PTR_ERR(file); |
1160 | } | 1313 | } |
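The new argument decodes a huge page size encoded in the high mmap() flag bits. A hedged userspace sketch requesting 1 GB pages; MAP_HUGE_SHIFT and the log2 encoding are assumed to match the new uapi definitions, and MAP_HUGETLB availability depends on the libc headers:

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdlib.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26               /* assumed uapi value */
#endif

int main(void)
{
        /* log2(1 GB) = 30, encoded in the bits decoded by the hunk above */
        size_t len = 1UL << 30;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                       (30 << MAP_HUGE_SHIFT), -1, 0);

        if (p == MAP_FAILED)
                return EXIT_FAILURE;    /* e.g. no 1 GB pages reserved */
        munmap(p, len);
        return EXIT_SUCCESS;
}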
@@ -1335,7 +1488,11 @@ munmap_back: | |||
1335 | * | 1488 | * |
1336 | * Answer: Yes, several device drivers can do it in their | 1489 | * Answer: Yes, several device drivers can do it in their |
1337 | * f_op->mmap method. -DaveM | 1490 | * f_op->mmap method. -DaveM |
1491 | * Bug: If addr is changed, prev, rb_link, rb_parent should | ||
1492 | * be updated for vma_link() | ||
1338 | */ | 1493 | */ |
1494 | WARN_ON_ONCE(addr != vma->vm_start); | ||
1495 | |||
1339 | addr = vma->vm_start; | 1496 | addr = vma->vm_start; |
1340 | pgoff = vma->vm_pgoff; | 1497 | pgoff = vma->vm_pgoff; |
1341 | vm_flags = vma->vm_flags; | 1498 | vm_flags = vma->vm_flags; |
@@ -1400,6 +1557,206 @@ unacct_error: | |||
1400 | return error; | 1557 | return error; |
1401 | } | 1558 | } |
1402 | 1559 | ||
1560 | unsigned long unmapped_area(struct vm_unmapped_area_info *info) | ||
1561 | { | ||
1562 | /* | ||
1563 | * We implement the search by looking for an rbtree node that | ||
1564 | * immediately follows a suitable gap. That is, | ||
1565 | * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; | ||
1566 | * - gap_end = vma->vm_start >= info->low_limit + length; | ||
1567 | * - gap_end - gap_start >= length | ||
1568 | */ | ||
1569 | |||
1570 | struct mm_struct *mm = current->mm; | ||
1571 | struct vm_area_struct *vma; | ||
1572 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | ||
1573 | |||
1574 | /* Adjust search length to account for worst case alignment overhead */ | ||
1575 | length = info->length + info->align_mask; | ||
1576 | if (length < info->length) | ||
1577 | return -ENOMEM; | ||
1578 | |||
1579 | /* Adjust search limits by the desired length */ | ||
1580 | if (info->high_limit < length) | ||
1581 | return -ENOMEM; | ||
1582 | high_limit = info->high_limit - length; | ||
1583 | |||
1584 | if (info->low_limit > high_limit) | ||
1585 | return -ENOMEM; | ||
1586 | low_limit = info->low_limit + length; | ||
1587 | |||
1588 | /* Check if rbtree root looks promising */ | ||
1589 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | ||
1590 | goto check_highest; | ||
1591 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | ||
1592 | if (vma->rb_subtree_gap < length) | ||
1593 | goto check_highest; | ||
1594 | |||
1595 | while (true) { | ||
1596 | /* Visit left subtree if it looks promising */ | ||
1597 | gap_end = vma->vm_start; | ||
1598 | if (gap_end >= low_limit && vma->vm_rb.rb_left) { | ||
1599 | struct vm_area_struct *left = | ||
1600 | rb_entry(vma->vm_rb.rb_left, | ||
1601 | struct vm_area_struct, vm_rb); | ||
1602 | if (left->rb_subtree_gap >= length) { | ||
1603 | vma = left; | ||
1604 | continue; | ||
1605 | } | ||
1606 | } | ||
1607 | |||
1608 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | ||
1609 | check_current: | ||
1610 | /* Check if current node has a suitable gap */ | ||
1611 | if (gap_start > high_limit) | ||
1612 | return -ENOMEM; | ||
1613 | if (gap_end >= low_limit && gap_end - gap_start >= length) | ||
1614 | goto found; | ||
1615 | |||
1616 | /* Visit right subtree if it looks promising */ | ||
1617 | if (vma->vm_rb.rb_right) { | ||
1618 | struct vm_area_struct *right = | ||
1619 | rb_entry(vma->vm_rb.rb_right, | ||
1620 | struct vm_area_struct, vm_rb); | ||
1621 | if (right->rb_subtree_gap >= length) { | ||
1622 | vma = right; | ||
1623 | continue; | ||
1624 | } | ||
1625 | } | ||
1626 | |||
1627 | /* Go back up the rbtree to find next candidate node */ | ||
1628 | while (true) { | ||
1629 | struct rb_node *prev = &vma->vm_rb; | ||
1630 | if (!rb_parent(prev)) | ||
1631 | goto check_highest; | ||
1632 | vma = rb_entry(rb_parent(prev), | ||
1633 | struct vm_area_struct, vm_rb); | ||
1634 | if (prev == vma->vm_rb.rb_left) { | ||
1635 | gap_start = vma->vm_prev->vm_end; | ||
1636 | gap_end = vma->vm_start; | ||
1637 | goto check_current; | ||
1638 | } | ||
1639 | } | ||
1640 | } | ||
1641 | |||
1642 | check_highest: | ||
1643 | /* Check highest gap, which does not precede any rbtree node */ | ||
1644 | gap_start = mm->highest_vm_end; | ||
1645 | gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ | ||
1646 | if (gap_start > high_limit) | ||
1647 | return -ENOMEM; | ||
1648 | |||
1649 | found: | ||
1650 | /* We found a suitable gap. Clip it with the original low_limit. */ | ||
1651 | if (gap_start < info->low_limit) | ||
1652 | gap_start = info->low_limit; | ||
1653 | |||
1654 | /* Adjust gap address to the desired alignment */ | ||
1655 | gap_start += (info->align_offset - gap_start) & info->align_mask; | ||
1656 | |||
1657 | VM_BUG_ON(gap_start + info->length > info->high_limit); | ||
1658 | VM_BUG_ON(gap_start + info->length > gap_end); | ||
1659 | return gap_start; | ||
1660 | } | ||
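The final fix-up rounds gap_start up to the next address congruent to align_offset modulo the alignment. A hedged numerical check with illustrative values:

/* With align_mask = 0xffff (64 KB granules) and align_offset = 0, a
 * candidate gap starting at 0x12345 is bumped to 0x20000, the next
 * 64 KB boundary, before the VM_BUG_ON sanity checks run. */
static unsigned long toy_align(unsigned long gap_start,
                               unsigned long align_offset,
                               unsigned long align_mask)
{
        gap_start += (align_offset - gap_start) & align_mask;
        return gap_start;       /* toy_align(0x12345, 0, 0xffff) == 0x20000 */
}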
1661 | |||
1662 | unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) | ||
1663 | { | ||
1664 | struct mm_struct *mm = current->mm; | ||
1665 | struct vm_area_struct *vma; | ||
1666 | unsigned long length, low_limit, high_limit, gap_start, gap_end; | ||
1667 | |||
1668 | /* Adjust search length to account for worst case alignment overhead */ | ||
1669 | length = info->length + info->align_mask; | ||
1670 | if (length < info->length) | ||
1671 | return -ENOMEM; | ||
1672 | |||
1673 | /* | ||
1674 | * Adjust search limits by the desired length. | ||
1675 | * See implementation comment at top of unmapped_area(). | ||
1676 | */ | ||
1677 | gap_end = info->high_limit; | ||
1678 | if (gap_end < length) | ||
1679 | return -ENOMEM; | ||
1680 | high_limit = gap_end - length; | ||
1681 | |||
1682 | if (info->low_limit > high_limit) | ||
1683 | return -ENOMEM; | ||
1684 | low_limit = info->low_limit + length; | ||
1685 | |||
1686 | /* Check highest gap, which does not precede any rbtree node */ | ||
1687 | gap_start = mm->highest_vm_end; | ||
1688 | if (gap_start <= high_limit) | ||
1689 | goto found_highest; | ||
1690 | |||
1691 | /* Check if rbtree root looks promising */ | ||
1692 | if (RB_EMPTY_ROOT(&mm->mm_rb)) | ||
1693 | return -ENOMEM; | ||
1694 | vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); | ||
1695 | if (vma->rb_subtree_gap < length) | ||
1696 | return -ENOMEM; | ||
1697 | |||
1698 | while (true) { | ||
1699 | /* Visit right subtree if it looks promising */ | ||
1700 | gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; | ||
1701 | if (gap_start <= high_limit && vma->vm_rb.rb_right) { | ||
1702 | struct vm_area_struct *right = | ||
1703 | rb_entry(vma->vm_rb.rb_right, | ||
1704 | struct vm_area_struct, vm_rb); | ||
1705 | if (right->rb_subtree_gap >= length) { | ||
1706 | vma = right; | ||
1707 | continue; | ||
1708 | } | ||
1709 | } | ||
1710 | |||
1711 | check_current: | ||
1712 | /* Check if current node has a suitable gap */ | ||
1713 | gap_end = vma->vm_start; | ||
1714 | if (gap_end < low_limit) | ||
1715 | return -ENOMEM; | ||
1716 | if (gap_start <= high_limit && gap_end - gap_start >= length) | ||
1717 | goto found; | ||
1718 | |||
1719 | /* Visit left subtree if it looks promising */ | ||
1720 | if (vma->vm_rb.rb_left) { | ||
1721 | struct vm_area_struct *left = | ||
1722 | rb_entry(vma->vm_rb.rb_left, | ||
1723 | struct vm_area_struct, vm_rb); | ||
1724 | if (left->rb_subtree_gap >= length) { | ||
1725 | vma = left; | ||
1726 | continue; | ||
1727 | } | ||
1728 | } | ||
1729 | |||
1730 | /* Go back up the rbtree to find next candidate node */ | ||
1731 | while (true) { | ||
1732 | struct rb_node *prev = &vma->vm_rb; | ||
1733 | if (!rb_parent(prev)) | ||
1734 | return -ENOMEM; | ||
1735 | vma = rb_entry(rb_parent(prev), | ||
1736 | struct vm_area_struct, vm_rb); | ||
1737 | if (prev == vma->vm_rb.rb_right) { | ||
1738 | gap_start = vma->vm_prev ? | ||
1739 | vma->vm_prev->vm_end : 0; | ||
1740 | goto check_current; | ||
1741 | } | ||
1742 | } | ||
1743 | } | ||
1744 | |||
1745 | found: | ||
1746 | /* We found a suitable gap. Clip it with the original high_limit. */ | ||
1747 | if (gap_end > info->high_limit) | ||
1748 | gap_end = info->high_limit; | ||
1749 | |||
1750 | found_highest: | ||
1751 | /* Compute highest gap address at the desired alignment */ | ||
1752 | gap_end -= info->length; | ||
1753 | gap_end -= (gap_end - info->align_offset) & info->align_mask; | ||
1754 | |||
1755 | VM_BUG_ON(gap_end < info->low_limit); | ||
1756 | VM_BUG_ON(gap_end < gap_start); | ||
1757 | return gap_end; | ||
1758 | } | ||
1759 | |||
1403 | /* Get an address range which is currently unmapped. | 1760 | /* Get an address range which is currently unmapped. |
1404 | * For shmat() with addr=0. | 1761 | * For shmat() with addr=0. |
1405 | * | 1762 | * |
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1418 | { | 1775 | { |
1419 | struct mm_struct *mm = current->mm; | 1776 | struct mm_struct *mm = current->mm; |
1420 | struct vm_area_struct *vma; | 1777 | struct vm_area_struct *vma; |
1421 | unsigned long start_addr; | 1778 | struct vm_unmapped_area_info info; |
1422 | 1779 | ||
1423 | if (len > TASK_SIZE) | 1780 | if (len > TASK_SIZE) |
1424 | return -ENOMEM; | 1781 | return -ENOMEM; |
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1433 | (!vma || addr + len <= vma->vm_start)) | 1790 | (!vma || addr + len <= vma->vm_start)) |
1434 | return addr; | 1791 | return addr; |
1435 | } | 1792 | } |
1436 | if (len > mm->cached_hole_size) { | ||
1437 | start_addr = addr = mm->free_area_cache; | ||
1438 | } else { | ||
1439 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
1440 | mm->cached_hole_size = 0; | ||
1441 | } | ||
1442 | 1793 | ||
1443 | full_search: | 1794 | info.flags = 0; |
1444 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | 1795 | info.length = len; |
1445 | /* At this point: (!vma || addr < vma->vm_end). */ | 1796 | info.low_limit = TASK_UNMAPPED_BASE; |
1446 | if (TASK_SIZE - len < addr) { | 1797 | info.high_limit = TASK_SIZE; |
1447 | /* | 1798 | info.align_mask = 0; |
1448 | * Start a new search - just in case we missed | 1799 | return vm_unmapped_area(&info); |
1449 | * some holes. | ||
1450 | */ | ||
1451 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
1452 | addr = TASK_UNMAPPED_BASE; | ||
1453 | start_addr = addr; | ||
1454 | mm->cached_hole_size = 0; | ||
1455 | goto full_search; | ||
1456 | } | ||
1457 | return -ENOMEM; | ||
1458 | } | ||
1459 | if (!vma || addr + len <= vma->vm_start) { | ||
1460 | /* | ||
1461 | * Remember the place where we stopped the search: | ||
1462 | */ | ||
1463 | mm->free_area_cache = addr + len; | ||
1464 | return addr; | ||
1465 | } | ||
1466 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1467 | mm->cached_hole_size = vma->vm_start - addr; | ||
1468 | addr = vma->vm_end; | ||
1469 | } | ||
1470 | } | 1800 | } |
1471 | #endif | 1801 | #endif |
1472 | 1802 | ||
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1491 | { | 1821 | { |
1492 | struct vm_area_struct *vma; | 1822 | struct vm_area_struct *vma; |
1493 | struct mm_struct *mm = current->mm; | 1823 | struct mm_struct *mm = current->mm; |
1494 | unsigned long addr = addr0, start_addr; | 1824 | unsigned long addr = addr0; |
1825 | struct vm_unmapped_area_info info; | ||
1495 | 1826 | ||
1496 | /* requested length too big for entire address space */ | 1827 | /* requested length too big for entire address space */ |
1497 | if (len > TASK_SIZE) | 1828 | if (len > TASK_SIZE) |
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1509 | return addr; | 1840 | return addr; |
1510 | } | 1841 | } |
1511 | 1842 | ||
1512 | /* check if free_area_cache is useful for us */ | 1843 | info.flags = VM_UNMAPPED_AREA_TOPDOWN; |
1513 | if (len <= mm->cached_hole_size) { | 1844 | info.length = len; |
1514 | mm->cached_hole_size = 0; | 1845 | info.low_limit = PAGE_SIZE; |
1515 | mm->free_area_cache = mm->mmap_base; | 1846 | info.high_limit = mm->mmap_base; |
1516 | } | 1847 | info.align_mask = 0; |
1517 | 1848 | addr = vm_unmapped_area(&info); | |
1518 | try_again: | ||
1519 | /* either no address requested or can't fit in requested address hole */ | ||
1520 | start_addr = addr = mm->free_area_cache; | ||
1521 | |||
1522 | if (addr < len) | ||
1523 | goto fail; | ||
1524 | |||
1525 | addr -= len; | ||
1526 | do { | ||
1527 | /* | ||
1528 | * Lookup failure means no vma is above this address, | ||
1529 | * else if new region fits below vma->vm_start, | ||
1530 | * return with success: | ||
1531 | */ | ||
1532 | vma = find_vma(mm, addr); | ||
1533 | if (!vma || addr+len <= vma->vm_start) | ||
1534 | /* remember the address as a hint for next time */ | ||
1535 | return (mm->free_area_cache = addr); | ||
1536 | |||
1537 | /* remember the largest hole we saw so far */ | ||
1538 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1539 | mm->cached_hole_size = vma->vm_start - addr; | ||
1540 | |||
1541 | /* try just below the current vma->vm_start */ | ||
1542 | addr = vma->vm_start-len; | ||
1543 | } while (len < vma->vm_start); | ||
1544 | |||
1545 | fail: | ||
1546 | /* | ||
1547 | * if hint left us with no space for the requested | ||
1548 | * mapping then try again: | ||
1549 | * | ||
1548 |  * Note: this differs from the bottom-up case, which | ||
1549 |  * does a full linear search; here we use find_vma, which | ||
1550 |  * can cause some holes to be skipped. | ||
1553 | */ | ||
1554 | if (start_addr != mm->mmap_base) { | ||
1555 | mm->free_area_cache = mm->mmap_base; | ||
1556 | mm->cached_hole_size = 0; | ||
1557 | goto try_again; | ||
1558 | } | ||
1559 | 1849 | ||
1560 | /* | 1850 | /* |
1561 | * A failed mmap() very likely causes application failure, | 1851 | * A failed mmap() very likely causes application failure, |
@@ -1563,14 +1853,13 @@ fail: | |||
1563 | * can happen with large stack limits and large mmap() | 1853 | * can happen with large stack limits and large mmap() |
1564 | * allocations. | 1854 | * allocations. |
1565 | */ | 1855 | */ |
1566 | mm->cached_hole_size = ~0UL; | 1856 | if (addr & ~PAGE_MASK) { |
1567 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 1857 | VM_BUG_ON(addr != -ENOMEM); |
1568 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 1858 | info.flags = 0; |
1569 | /* | 1859 | info.low_limit = TASK_UNMAPPED_BASE; |
1570 | * Restore the topdown base: | 1860 | info.high_limit = TASK_SIZE; |
1571 | */ | 1861 | addr = vm_unmapped_area(&info); |
1572 | mm->free_area_cache = mm->mmap_base; | 1862 | } |
1573 | mm->cached_hole_size = ~0UL; | ||
1574 | 1863 | ||
1575 | return addr; | 1864 | return addr; |
1576 | } | 1865 | } |
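A minimal sketch of the descriptor-driven interface used in the hunks above, assuming the in-kernel mm/mmap.c context; the function name is illustrative and the bottom-up retry mirrors the fallback shown above:

static unsigned long sketch_get_unmapped_area_topdown(unsigned long len)
{
	struct mm_struct *mm = current->mm;
	struct vm_unmapped_area_info info;
	unsigned long addr;

	/* search top-down, below mmap_base, for a hole of 'len' bytes */
	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = mm->mmap_base;
	info.align_mask = 0;		/* no extra alignment constraint */
	info.align_offset = 0;
	addr = vm_unmapped_area(&info);

	/* a non page-aligned result means -ENOMEM; retry bottom-up */
	if (addr & ~PAGE_MASK) {
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		addr = vm_unmapped_area(&info);
	}
	return addr;
}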
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1780 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 2069 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
1781 | error = acct_stack_growth(vma, size, grow); | 2070 | error = acct_stack_growth(vma, size, grow); |
1782 | if (!error) { | 2071 | if (!error) { |
2072 | /* | ||
2073 | * vma_gap_update() doesn't support concurrent | ||
2074 | * updates, but we only hold a shared mmap_sem | ||
2075 | * lock here, so we need to protect against | ||
2076 | * concurrent vma expansions. | ||
2077 | * vma_lock_anon_vma() doesn't help here, as | ||
2078 | * we don't guarantee that all growable vmas | ||
2079 | * in a mm share the same root anon vma. | ||
2080 | * So, we reuse mm->page_table_lock to guard | ||
2081 | * against concurrent vma expansions. | ||
2082 | */ | ||
2083 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1783 | anon_vma_interval_tree_pre_update_vma(vma); | 2084 | anon_vma_interval_tree_pre_update_vma(vma); |
1784 | vma->vm_end = address; | 2085 | vma->vm_end = address; |
1785 | anon_vma_interval_tree_post_update_vma(vma); | 2086 | anon_vma_interval_tree_post_update_vma(vma); |
2087 | if (vma->vm_next) | ||
2088 | vma_gap_update(vma->vm_next); | ||
2089 | else | ||
2090 | vma->vm_mm->highest_vm_end = address; | ||
2091 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
2092 | |||
1786 | perf_event_mmap(vma); | 2093 | perf_event_mmap(vma); |
1787 | } | 2094 | } |
1788 | } | 2095 | } |
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1833 | if (grow <= vma->vm_pgoff) { | 2140 | if (grow <= vma->vm_pgoff) { |
1834 | error = acct_stack_growth(vma, size, grow); | 2141 | error = acct_stack_growth(vma, size, grow); |
1835 | if (!error) { | 2142 | if (!error) { |
2143 | /* | ||
2144 | * vma_gap_update() doesn't support concurrent | ||
2145 | * updates, but we only hold a shared mmap_sem | ||
2146 | * lock here, so we need to protect against | ||
2147 | * concurrent vma expansions. | ||
2148 | * vma_lock_anon_vma() doesn't help here, as | ||
2149 | * we don't guarantee that all growable vmas | ||
2150 | * in a mm share the same root anon vma. | ||
2151 | * So, we reuse mm->page_table_lock to guard | ||
2152 | * against concurrent vma expansions. | ||
2153 | */ | ||
2154 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1836 | anon_vma_interval_tree_pre_update_vma(vma); | 2155 | anon_vma_interval_tree_pre_update_vma(vma); |
1837 | vma->vm_start = address; | 2156 | vma->vm_start = address; |
1838 | vma->vm_pgoff -= grow; | 2157 | vma->vm_pgoff -= grow; |
1839 | anon_vma_interval_tree_post_update_vma(vma); | 2158 | anon_vma_interval_tree_post_update_vma(vma); |
2159 | vma_gap_update(vma); | ||
2160 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
2161 | |||
1840 | perf_event_mmap(vma); | 2162 | perf_event_mmap(vma); |
1841 | } | 2163 | } |
1842 | } | 2164 | } |
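The vma_gap_update() calls above maintain an augmented rbtree so get_unmapped_area() can binary-search for a free hole instead of walking the vma list. A rough sketch of the per-vma quantity being cached, inferred from context (the real helper in this series also folds in the gaps of child subtrees):

static unsigned long sketch_vma_gap(struct vm_area_struct *vma)
{
	unsigned long gap;

	/* free space between the previous vma (or address 0) and this one */
	gap = vma->vm_start;
	if (vma->vm_prev)
		gap -= vma->vm_prev->vm_end;
	return gap;
}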
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1959 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); | 2281 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); |
1960 | vma->vm_prev = NULL; | 2282 | vma->vm_prev = NULL; |
1961 | do { | 2283 | do { |
1962 | rb_erase(&vma->vm_rb, &mm->mm_rb); | 2284 | vma_rb_erase(vma, &mm->mm_rb); |
1963 | mm->map_count--; | 2285 | mm->map_count--; |
1964 | tail_vma = vma; | 2286 | tail_vma = vma; |
1965 | vma = vma->vm_next; | 2287 | vma = vma->vm_next; |
1966 | } while (vma && vma->vm_start < end); | 2288 | } while (vma && vma->vm_start < end); |
1967 | *insertion_point = vma; | 2289 | *insertion_point = vma; |
1968 | if (vma) | 2290 | if (vma) { |
1969 | vma->vm_prev = prev; | 2291 | vma->vm_prev = prev; |
2292 | vma_gap_update(vma); | ||
2293 | } else | ||
2294 | mm->highest_vm_end = prev ? prev->vm_end : 0; | ||
1970 | tail_vma->vm_next = NULL; | 2295 | tail_vma->vm_next = NULL; |
1971 | if (mm->unmap_area == arch_unmap_area) | 2296 | if (mm->unmap_area == arch_unmap_area) |
1972 | addr = prev ? prev->vm_end : mm->mmap_base; | 2297 | addr = prev ? prev->vm_end : mm->mmap_base; |
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2561 | * The LSB of head.next can't change from under us | 2886 | * The LSB of head.next can't change from under us |
2562 | * because we hold the mm_all_locks_mutex. | 2887 | * because we hold the mm_all_locks_mutex. |
2563 | */ | 2888 | */ |
2564 | mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); | 2889 | down_write(&anon_vma->root->rwsem); |
2565 | /* | 2890 | /* |
2566 | * We can safely modify head.next after taking the | 2891 | * We can safely modify head.next after taking the |
2567 | * anon_vma->root->mutex. If some other vma in this mm shares | 2892 | * anon_vma->root->rwsem. If some other vma in this mm shares |
2568 | * the same anon_vma we won't take it again. | 2893 | * the same anon_vma we won't take it again. |
2569 | * | 2894 | * |
2570 | * No need of atomic instructions here, head.next | 2895 | * No need of atomic instructions here, head.next |
2571 | * can't change from under us thanks to the | 2896 | * can't change from under us thanks to the |
2572 | * anon_vma->root->mutex. | 2897 | * anon_vma->root->rwsem. |
2573 | */ | 2898 | */ |
2574 | if (__test_and_set_bit(0, (unsigned long *) | 2899 | if (__test_and_set_bit(0, (unsigned long *) |
2575 | &anon_vma->root->rb_root.rb_node)) | 2900 | &anon_vma->root->rb_root.rb_node)) |
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2671 | * | 2996 | * |
2672 | * No need of atomic instructions here, head.next | 2997 | * No need of atomic instructions here, head.next |
2673 | * can't change from under us until we release the | 2998 | * can't change from under us until we release the |
2674 | * anon_vma->root->mutex. | 2999 | * anon_vma->root->rwsem. |
2675 | */ | 3000 | */ |
2676 | if (!__test_and_clear_bit(0, (unsigned long *) | 3001 | if (!__test_and_clear_bit(0, (unsigned long *) |
2677 | &anon_vma->root->rb_root.rb_node)) | 3002 | &anon_vma->root->rb_root.rb_node)) |
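With anon_vma->root->mutex replaced by an rwsem as above, the write-side lock helpers presumably reduce to plain rwsem operations; a hedged sketch (the real wrappers live in include/linux/rmap.h):

static inline void sketch_anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline void sketch_anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}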
diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab6..94722a4d6b43 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
35 | } | 35 | } |
36 | #endif | 36 | #endif |
37 | 37 | ||
38 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable) | 40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | ||
42 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
43 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | ||
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
44 | 48 | ||
45 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
46 | arch_enter_lazy_mmu_mode(); | 50 | arch_enter_lazy_mmu_mode(); |
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
48 | oldpte = *pte; | 52 | oldpte = *pte; |
49 | if (pte_present(oldpte)) { | 53 | if (pte_present(oldpte)) { |
50 | pte_t ptent; | 54 | pte_t ptent; |
55 | bool updated = false; | ||
51 | 56 | ||
52 | ptent = ptep_modify_prot_start(mm, addr, pte); | 57 | ptent = ptep_modify_prot_start(mm, addr, pte); |
53 | ptent = pte_modify(ptent, newprot); | 58 | if (!prot_numa) { |
59 | ptent = pte_modify(ptent, newprot); | ||
60 | updated = true; | ||
61 | } else { | ||
62 | struct page *page; | ||
63 | |||
64 | page = vm_normal_page(vma, addr, oldpte); | ||
65 | if (page) { | ||
66 | int this_nid = page_to_nid(page); | ||
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | ||
76 | updated = true; | ||
77 | } | ||
78 | } | ||
79 | } | ||
54 | 80 | ||
55 | /* | 81 | /* |
56 | * Avoid taking write faults for pages we know to be | 82 | * Avoid taking write faults for pages we know to be |
57 | * dirty. | 83 | * dirty. |
58 | */ | 84 | */ |
59 | if (dirty_accountable && pte_dirty(ptent)) | 85 | if (dirty_accountable && pte_dirty(ptent)) { |
60 | ptent = pte_mkwrite(ptent); | 86 | ptent = pte_mkwrite(ptent); |
87 | updated = true; | ||
88 | } | ||
61 | 89 | ||
90 | if (updated) | ||
91 | pages++; | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 92 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 93 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 102 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 103 | swp_entry_to_pte(entry)); |
74 | } | 104 | } |
105 | pages++; | ||
75 | } | 106 | } |
76 | } while (pte++, addr += PAGE_SIZE, addr != end); | 107 | } while (pte++, addr += PAGE_SIZE, addr != end); |
77 | arch_leave_lazy_mmu_mode(); | 108 | arch_leave_lazy_mmu_mode(); |
78 | pte_unmap_unlock(pte - 1, ptl); | 109 | pte_unmap_unlock(pte - 1, ptl); |
110 | |||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | ||
79 | } | 113 | } |
80 | 114 | ||
81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 115 | #ifdef CONFIG_NUMA_BALANCING |
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, |
83 | int dirty_accountable) | 117 | pmd_t *pmd) |
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | ||
132 | pud_t *pud, unsigned long addr, unsigned long end, | ||
133 | pgprot_t newprot, int dirty_accountable, int prot_numa) | ||
84 | { | 134 | { |
85 | pmd_t *pmd; | 135 | pmd_t *pmd; |
86 | unsigned long next; | 136 | unsigned long next; |
137 | unsigned long pages = 0; | ||
138 | bool all_same_node; | ||
87 | 139 | ||
88 | pmd = pmd_offset(pud, addr); | 140 | pmd = pmd_offset(pud, addr); |
89 | do { | 141 | do { |
90 | next = pmd_addr_end(addr, end); | 142 | next = pmd_addr_end(addr, end); |
91 | if (pmd_trans_huge(*pmd)) { | 143 | if (pmd_trans_huge(*pmd)) { |
92 | if (next - addr != HPAGE_PMD_SIZE) | 144 | if (next - addr != HPAGE_PMD_SIZE) |
93 | split_huge_page_pmd(vma->vm_mm, pmd); | 145 | split_huge_page_pmd(vma, addr, pmd); |
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 146 | else if (change_huge_pmd(vma, pmd, addr, newprot, |
147 | prot_numa)) { | ||
148 | pages += HPAGE_PMD_NR; | ||
95 | continue; | 149 | continue; |
150 | } | ||
96 | /* fall through */ | 151 | /* fall through */ |
97 | } | 152 | } |
98 | if (pmd_none_or_clear_bad(pmd)) | 153 | if (pmd_none_or_clear_bad(pmd)) |
99 | continue; | 154 | continue; |
100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, | 155 | pages += change_pte_range(vma, pmd, addr, next, newprot, |
101 | dirty_accountable); | 156 | dirty_accountable, prot_numa, &all_same_node); |
157 | |||
158 | /* | ||
159 | * If we are changing protections for NUMA hinting faults then | ||
160 | * set pmd_numa if the examined pages were all on the same | ||
161 | * node. This allows a regular PMD to be handled as one fault | ||
162 | 		 * and effectively batches the taking of the PTL. | ||
163 | */ | ||
164 | if (prot_numa && all_same_node) | ||
165 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
102 | } while (pmd++, addr = next, addr != end); | 166 | } while (pmd++, addr = next, addr != end); |
167 | |||
168 | return pages; | ||
103 | } | 169 | } |
104 | 170 | ||
105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 171 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, |
106 | unsigned long addr, unsigned long end, pgprot_t newprot, | 172 | pgd_t *pgd, unsigned long addr, unsigned long end, |
107 | int dirty_accountable) | 173 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
108 | { | 174 | { |
109 | pud_t *pud; | 175 | pud_t *pud; |
110 | unsigned long next; | 176 | unsigned long next; |
177 | unsigned long pages = 0; | ||
111 | 178 | ||
112 | pud = pud_offset(pgd, addr); | 179 | pud = pud_offset(pgd, addr); |
113 | do { | 180 | do { |
114 | next = pud_addr_end(addr, end); | 181 | next = pud_addr_end(addr, end); |
115 | if (pud_none_or_clear_bad(pud)) | 182 | if (pud_none_or_clear_bad(pud)) |
116 | continue; | 183 | continue; |
117 | change_pmd_range(vma, pud, addr, next, newprot, | 184 | pages += change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | 185 | dirty_accountable, prot_numa); |
119 | } while (pud++, addr = next, addr != end); | 186 | } while (pud++, addr = next, addr != end); |
187 | |||
188 | return pages; | ||
120 | } | 189 | } |
121 | 190 | ||
122 | static void change_protection(struct vm_area_struct *vma, | 191 | static unsigned long change_protection_range(struct vm_area_struct *vma, |
123 | unsigned long addr, unsigned long end, pgprot_t newprot, | 192 | unsigned long addr, unsigned long end, pgprot_t newprot, |
124 | int dirty_accountable) | 193 | int dirty_accountable, int prot_numa) |
125 | { | 194 | { |
126 | struct mm_struct *mm = vma->vm_mm; | 195 | struct mm_struct *mm = vma->vm_mm; |
127 | pgd_t *pgd; | 196 | pgd_t *pgd; |
128 | unsigned long next; | 197 | unsigned long next; |
129 | unsigned long start = addr; | 198 | unsigned long start = addr; |
199 | unsigned long pages = 0; | ||
130 | 200 | ||
131 | BUG_ON(addr >= end); | 201 | BUG_ON(addr >= end); |
132 | pgd = pgd_offset(mm, addr); | 202 | pgd = pgd_offset(mm, addr); |
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma, | |||
135 | next = pgd_addr_end(addr, end); | 205 | next = pgd_addr_end(addr, end); |
136 | if (pgd_none_or_clear_bad(pgd)) | 206 | if (pgd_none_or_clear_bad(pgd)) |
137 | continue; | 207 | continue; |
138 | change_pud_range(vma, pgd, addr, next, newprot, | 208 | pages += change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | 209 | dirty_accountable, prot_numa); |
140 | } while (pgd++, addr = next, addr != end); | 210 | } while (pgd++, addr = next, addr != end); |
141 | flush_tlb_range(vma, start, end); | 211 | |
212 | /* Only flush the TLB if we actually modified any entries: */ | ||
213 | if (pages) | ||
214 | flush_tlb_range(vma, start, end); | ||
215 | |||
216 | return pages; | ||
217 | } | ||
218 | |||
219 | unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
220 | unsigned long end, pgprot_t newprot, | ||
221 | int dirty_accountable, int prot_numa) | ||
222 | { | ||
223 | struct mm_struct *mm = vma->vm_mm; | ||
224 | unsigned long pages; | ||
225 | |||
226 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
227 | if (is_vm_hugetlb_page(vma)) | ||
228 | pages = hugetlb_change_protection(vma, start, end, newprot); | ||
229 | else | ||
230 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | ||
231 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
232 | |||
233 | return pages; | ||
142 | } | 234 | } |
143 | 235 | ||
144 | int | 236 | int |
@@ -213,12 +305,9 @@ success: | |||
213 | dirty_accountable = 1; | 305 | dirty_accountable = 1; |
214 | } | 306 | } |
215 | 307 | ||
216 | mmu_notifier_invalidate_range_start(mm, start, end); | 308 | change_protection(vma, start, end, vma->vm_page_prot, |
217 | if (is_vm_hugetlb_page(vma)) | 309 | dirty_accountable, 0); |
218 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | 310 | |
219 | else | ||
220 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | ||
221 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
222 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 311 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
223 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 312 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
224 | perf_event_mmap(vma); | 313 | perf_event_mmap(vma); |
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
274 | error = -EINVAL; | 363 | error = -EINVAL; |
275 | if (!(vma->vm_flags & VM_GROWSDOWN)) | 364 | if (!(vma->vm_flags & VM_GROWSDOWN)) |
276 | goto out; | 365 | goto out; |
277 | } | 366 | } else { |
278 | else { | ||
279 | if (vma->vm_start > start) | 367 | if (vma->vm_start > start) |
280 | goto out; | 368 | goto out; |
281 | if (unlikely(grows & PROT_GROWSUP)) { | 369 | if (unlikely(grows & PROT_GROWSUP)) { |
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
291 | for (nstart = start ; ; ) { | 379 | for (nstart = start ; ; ) { |
292 | unsigned long newflags; | 380 | unsigned long newflags; |
293 | 381 | ||
294 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 382 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
295 | 383 | ||
296 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | 384 | newflags = vm_flags; |
385 | newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | ||
297 | 386 | ||
298 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ | 387 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ |
299 | if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { | 388 | if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { |
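Caller-side view of the reworked helper above: change_protection() now takes a prot_numa flag and returns how many entries it touched, which is what lets the TLB flush be skipped when nothing changed. A hedged sketch, with illustrative names:

static void sketch_change_protection_caller(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long end,
					    int dirty_accountable)
{
	unsigned long pages;

	/* 0 == regular mprotect; 1 would mark PTEs for NUMA hinting faults */
	pages = change_protection(vma, start, end, vma->vm_page_prot,
				  dirty_accountable, 0);
	if (!pages)
		return;	/* nothing was modified, so no TLB flush was issued */
}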
diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307a..e1031e1f6a61 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
104 | } | 104 | } |
105 | if (vma->anon_vma) { | 105 | if (vma->anon_vma) { |
106 | anon_vma = vma->anon_vma; | 106 | anon_vma = vma->anon_vma; |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | } | 108 | } |
109 | } | 109 | } |
110 | 110 | ||
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
182 | need_flush = true; | 182 | need_flush = true; |
183 | continue; | 183 | continue; |
184 | } else if (!err) { | 184 | } else if (!err) { |
185 | split_huge_page_pmd(vma->vm_mm, old_pmd); | 185 | split_huge_page_pmd(vma, old_addr, old_pmd); |
186 | } | 186 | } |
187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | 187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); |
188 | } | 188 | } |
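split_huge_page_pmd() now takes the vma and faulting address instead of the mm. For callers that only have an mm, this series presumably provides an mm-based wrapper along these lines (a sketch; the real helper lives in mm/huge_memory.c):

static void sketch_split_huge_page_pmd_mm(struct mm_struct *mm,
					  unsigned long address, pmd_t *pmd)
{
	struct vm_area_struct *vma = find_vma(mm, address);

	BUG_ON(vma == NULL || address < vma->vm_start);
	split_huge_page_pmd(vma, address, pmd);
}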
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b31411..b8294fc03df8 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
137 | return count; | 137 | return count; |
138 | } | 138 | } |
139 | 139 | ||
140 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | ||
141 | { | ||
142 | struct zone *z; | ||
143 | |||
144 | /* | ||
145 | * In free_area_init_core(), highmem zone's managed_pages is set to | ||
146 | * present_pages, and bootmem allocator doesn't allocate from highmem | ||
147 | * zones. So there's no need to recalculate managed_pages because all | ||
148 | * highmem pages will be managed by the buddy system. Here highmem | ||
149 | * zone also includes highmem movable zone. | ||
150 | */ | ||
151 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | ||
152 | if (!is_highmem(z)) | ||
153 | z->managed_pages = 0; | ||
154 | } | ||
155 | |||
140 | /** | 156 | /** |
141 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 157 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
142 | * @pgdat: node to be released | 158 | * @pgdat: node to be released |
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
146 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 162 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
147 | { | 163 | { |
148 | register_page_bootmem_info_node(pgdat); | 164 | register_page_bootmem_info_node(pgdat); |
165 | reset_node_lowmem_managed_pages(pgdat); | ||
149 | 166 | ||
150 | /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ | 167 | /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ |
151 | return 0; | 168 | return 0; |
@@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
158 | */ | 175 | */ |
159 | unsigned long __init free_all_bootmem(void) | 176 | unsigned long __init free_all_bootmem(void) |
160 | { | 177 | { |
178 | struct pglist_data *pgdat; | ||
179 | |||
180 | for_each_online_pgdat(pgdat) | ||
181 | reset_node_lowmem_managed_pages(pgdat); | ||
182 | |||
161 | /* | 183 | /* |
162 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 184 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
163 | 	 * because in some cases, e.g. node 0, a node may have no RAM installed | 185 | 	 * because in some cases, e.g. node 0, a node may have no RAM installed |
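reset_node_lowmem_managed_pages() above zeroes managed_pages so that __free_pages_bootmem() can rebuild it as pages are handed to the buddy allocator. A small sketch of the resulting bookkeeping, assuming the invariant spanned_pages >= present_pages >= managed_pages introduced by this accounting:

static unsigned long sketch_zone_bootmem_held_pages(struct zone *z)
{
	/* pages present in the zone but not (yet) released to the buddy
	 * allocator, i.e. still owned by bootmem/memblock or reserved */
	return z->present_pages - z->managed_pages;
}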
diff --git a/mm/nommu.c b/mm/nommu.c index 45131b41bcdb..79c3cac87afa 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -66,6 +66,21 @@ int heap_stack_gap = 0; | |||
66 | 66 | ||
67 | atomic_long_t mmap_pages_allocated; | 67 | atomic_long_t mmap_pages_allocated; |
68 | 68 | ||
69 | /* | ||
70 | * The system's global memory commitment is a metric that can be used | ||
71 | * to drive ballooning decisions when Linux is hosted as a guest. On | ||
72 | * Hyper-V, the host implements a policy engine for dynamically | ||
73 | * balancing memory across the competing virtual machines it hosts. | ||
74 | * Several metrics drive this policy engine, including the guest- | ||
75 | * reported memory commitment. | ||
76 | */ | ||
77 | unsigned long vm_memory_committed(void) | ||
78 | { | ||
79 | return percpu_counter_read_positive(&vm_committed_as); | ||
80 | } | ||
81 | |||
82 | EXPORT_SYMBOL_GPL(vm_memory_committed); | ||
83 | |||
69 | EXPORT_SYMBOL(mem_map); | 84 | EXPORT_SYMBOL(mem_map); |
70 | EXPORT_SYMBOL(num_physpages); | 85 | EXPORT_SYMBOL(num_physpages); |
71 | 86 | ||
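How a balloon driver might consume the newly exported counter (a sketch; the real consumer is the Hyper-V balloon driver, and this helper name is made up):

static unsigned long sketch_guest_committed_kb(void)
{
	/* vm_committed_as is kept in pages; report it in kilobytes */
	return vm_memory_committed() << (PAGE_SHIFT - 10);
}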
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 79e0f3e24831..0399f146ae49 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task; | |||
44 | int sysctl_oom_dump_tasks = 1; | 44 | int sysctl_oom_dump_tasks = 1; |
45 | static DEFINE_SPINLOCK(zone_scan_lock); | 45 | static DEFINE_SPINLOCK(zone_scan_lock); |
46 | 46 | ||
47 | /* | ||
48 | * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj | ||
49 | * @old_val: old oom_score_adj for compare | ||
50 | * @new_val: new oom_score_adj for swap | ||
51 | * | ||
52 | * Sets the oom_score_adj value for current to @new_val iff its present value is | ||
53 | * @old_val. Usually used to reinstate a previous value to prevent racing with | ||
54 |  * userspace tuning the value in the interim. | ||
55 | */ | ||
56 | void compare_swap_oom_score_adj(int old_val, int new_val) | ||
57 | { | ||
58 | struct sighand_struct *sighand = current->sighand; | ||
59 | |||
60 | spin_lock_irq(&sighand->siglock); | ||
61 | if (current->signal->oom_score_adj == old_val) | ||
62 | current->signal->oom_score_adj = new_val; | ||
63 | trace_oom_score_adj_update(current); | ||
64 | spin_unlock_irq(&sighand->siglock); | ||
65 | } | ||
66 | |||
67 | /** | ||
68 | * test_set_oom_score_adj() - set current's oom_score_adj and return old value | ||
69 | * @new_val: new oom_score_adj value | ||
70 | * | ||
71 | * Sets the oom_score_adj value for current to @new_val with proper | ||
72 | * synchronization and returns the old value. Usually used to temporarily | ||
73 | * set a value, save the old value in the caller, and then reinstate it later. | ||
74 | */ | ||
75 | int test_set_oom_score_adj(int new_val) | ||
76 | { | ||
77 | struct sighand_struct *sighand = current->sighand; | ||
78 | int old_val; | ||
79 | |||
80 | spin_lock_irq(&sighand->siglock); | ||
81 | old_val = current->signal->oom_score_adj; | ||
82 | current->signal->oom_score_adj = new_val; | ||
83 | trace_oom_score_adj_update(current); | ||
84 | spin_unlock_irq(&sighand->siglock); | ||
85 | |||
86 | return old_val; | ||
87 | } | ||
88 | |||
89 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
90 | /** | 48 | /** |
91 |  * has_intersects_mems_allowed() - check task eligibility for kill | 49 |  * has_intersects_mems_allowed() - check task eligibility for kill |
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
193 | if (!p) | 151 | if (!p) |
194 | return 0; | 152 | return 0; |
195 | 153 | ||
196 | adj = p->signal->oom_score_adj; | 154 | adj = (long)p->signal->oom_score_adj; |
197 | if (adj == OOM_SCORE_ADJ_MIN) { | 155 | if (adj == OOM_SCORE_ADJ_MIN) { |
198 | task_unlock(p); | 156 | task_unlock(p); |
199 | return 0; | 157 | return 0; |
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
257 | * the page allocator means a mempolicy is in effect. Cpuset policy | 215 | * the page allocator means a mempolicy is in effect. Cpuset policy |
258 | * is enforced in get_page_from_freelist(). | 216 | * is enforced in get_page_from_freelist(). |
259 | */ | 217 | */ |
260 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { | 218 | if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { |
261 | *totalpages = total_swap_pages; | 219 | *totalpages = total_swap_pages; |
262 | for_each_node_mask(nid, *nodemask) | 220 | for_each_node_mask(nid, *nodemask) |
263 | *totalpages += node_spanned_pages(nid); | 221 | *totalpages += node_spanned_pages(nid); |
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
310 | if (!task->mm) | 268 | if (!task->mm) |
311 | return OOM_SCAN_CONTINUE; | 269 | return OOM_SCAN_CONTINUE; |
312 | 270 | ||
313 | if (task->flags & PF_EXITING) { | 271 | /* |
272 | * If task is allocating a lot of memory and has been marked to be | ||
273 | * killed first if it triggers an oom, then select it. | ||
274 | */ | ||
275 | if (oom_task_origin(task)) | ||
276 | return OOM_SCAN_SELECT; | ||
277 | |||
278 | if (task->flags & PF_EXITING && !force_kill) { | ||
314 | /* | 279 | /* |
315 | * If task is current and is in the process of releasing memory, | 280 | * If this task is not being ptraced on exit, then wait for it |
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | 281 | * to finish before killing some other task unnecessarily. |
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | 282 | */ |
322 | if (task == current) | 283 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) |
323 | return OOM_SCAN_SELECT; | 284 | return OOM_SCAN_ABORT; |
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | 285 | } |
334 | return OOM_SCAN_OK; | 286 | return OOM_SCAN_OK; |
335 | } | 287 | } |
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
412 | continue; | 364 | continue; |
413 | } | 365 | } |
414 | 366 | ||
415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", | 367 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", |
416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 368 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 369 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
418 | task->mm->nr_ptes, | 370 | task->mm->nr_ptes, |
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
428 | { | 380 | { |
429 | task_lock(current); | 381 | task_lock(current); |
430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 382 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
431 | "oom_score_adj=%d\n", | 383 | "oom_score_adj=%hd\n", |
432 | current->comm, gfp_mask, order, | 384 | current->comm, gfp_mask, order, |
433 | current->signal->oom_score_adj); | 385 | current->signal->oom_score_adj); |
434 | cpuset_print_task_mems_allowed(current); | 386 | cpuset_print_task_mems_allowed(current); |
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
639 | spin_unlock(&zone_scan_lock); | 591 | spin_unlock(&zone_scan_lock); |
640 | } | 592 | } |
641 | 593 | ||
642 | /* | ||
643 | * Try to acquire the oom killer lock for all system zones. Returns zero if a | ||
644 | * parallel oom killing is taking place, otherwise locks all zones and returns | ||
645 | * non-zero. | ||
646 | */ | ||
647 | static int try_set_system_oom(void) | ||
648 | { | ||
649 | struct zone *zone; | ||
650 | int ret = 1; | ||
651 | |||
652 | spin_lock(&zone_scan_lock); | ||
653 | for_each_populated_zone(zone) | ||
654 | if (zone_is_oom_locked(zone)) { | ||
655 | ret = 0; | ||
656 | goto out; | ||
657 | } | ||
658 | for_each_populated_zone(zone) | ||
659 | zone_set_flag(zone, ZONE_OOM_LOCKED); | ||
660 | out: | ||
661 | spin_unlock(&zone_scan_lock); | ||
662 | return ret; | ||
663 | } | ||
664 | |||
665 | /* | ||
666 | * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation | ||
667 | * attempts or page faults may now recall the oom killer, if necessary. | ||
668 | */ | ||
669 | static void clear_system_oom(void) | ||
670 | { | ||
671 | struct zone *zone; | ||
672 | |||
673 | spin_lock(&zone_scan_lock); | ||
674 | for_each_populated_zone(zone) | ||
675 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | ||
676 | spin_unlock(&zone_scan_lock); | ||
677 | } | ||
678 | |||
679 | /** | 594 | /** |
680 | * out_of_memory - kill the "best" process when we run out of memory | 595 | * out_of_memory - kill the "best" process when we run out of memory |
681 | * @zonelist: zonelist pointer | 596 | * @zonelist: zonelist pointer |
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
706 | return; | 621 | return; |
707 | 622 | ||
708 | /* | 623 | /* |
709 | * If current has a pending SIGKILL, then automatically select it. The | 624 | * If current has a pending SIGKILL or is exiting, then automatically |
710 | * goal is to allow it to allocate so that it may quickly exit and free | 625 | * select it. The goal is to allow it to allocate so that it may |
711 | * its memory. | 626 | * quickly exit and free its memory. |
712 | */ | 627 | */ |
713 | if (fatal_signal_pending(current)) { | 628 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { |
714 | set_thread_flag(TIF_MEMDIE); | 629 | set_thread_flag(TIF_MEMDIE); |
715 | return; | 630 | return; |
716 | } | 631 | } |
@@ -756,15 +671,16 @@ out: | |||
756 | 671 | ||
757 | /* | 672 | /* |
758 | * The pagefault handler calls here because it is out of memory, so kill a | 673 | * The pagefault handler calls here because it is out of memory, so kill a |
759 | * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel | 674 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
760 | * oom killing is already in progress so do nothing. If a task is found with | 675 | * parallel oom killing is already in progress so do nothing. |
761 | * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. | ||
762 | */ | 676 | */ |
763 | void pagefault_out_of_memory(void) | 677 | void pagefault_out_of_memory(void) |
764 | { | 678 | { |
765 | if (try_set_system_oom()) { | 679 | struct zonelist *zonelist = node_zonelist(first_online_node, |
680 | GFP_KERNEL); | ||
681 | |||
682 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | ||
766 | out_of_memory(NULL, 0, 0, NULL, false); | 683 | out_of_memory(NULL, 0, 0, NULL, false); |
767 | clear_system_oom(); | 684 | clear_zonelist_oom(zonelist, GFP_KERNEL); |
768 | } | 685 | } |
769 | schedule_timeout_killable(1); | ||
770 | } | 686 | } |
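The (long) cast and the %hd printk formats above follow oom_score_adj shrinking to a signed short elsewhere in this series. A rough sketch of the scoring arithmetic the cast feeds into, simplified from oom_badness() (the root-privilege discount and locking are omitted):

static unsigned long sketch_oom_points(struct task_struct *p,
				       unsigned long totalpages)
{
	long points;
	long adj = (long)p->signal->oom_score_adj;	/* now a short */

	/* baseline: resident pages, page tables and swap entries */
	points = get_mm_rss(p->mm) + p->mm->nr_ptes +
		 get_mm_counter(p->mm, MM_SWAPENTS);

	/* oom_score_adj is a bias in units of 1/1000 of total memory */
	points += adj * (long)totalpages / 1000;

	return points > 0 ? points : 1;
}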
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 830893b2b3c7..6f4271224493 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, | |||
1069 | } | 1069 | } |
1070 | 1070 | ||
1071 | /* | 1071 | /* |
1072 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() | 1072 | * After a task dirtied this many pages, balance_dirty_pages_ratelimited() |
1073 | * will look to see if it needs to start dirty throttling. | 1073 | * will look to see if it needs to start dirty throttling. |
1074 | * | 1074 | * |
1075 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive | 1075 | * If dirty_poll_interval is too low, big NUMA machines will call the expensive |
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); | |||
1436 | DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | 1436 | DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; |
1437 | 1437 | ||
1438 | /** | 1438 | /** |
1439 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 1439 | * balance_dirty_pages_ratelimited - balance dirty memory state |
1440 | * @mapping: address_space which was dirtied | 1440 | * @mapping: address_space which was dirtied |
1441 | * @nr_pages_dirtied: number of pages which the caller has just dirtied | ||
1442 | * | 1441 | * |
1443 | * Processes which are dirtying memory should call in here once for each page | 1442 | * Processes which are dirtying memory should call in here once for each page |
1444 | * which was newly dirtied. The function will periodically check the system's | 1443 | * which was newly dirtied. The function will periodically check the system's |
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |||
1449 | * limit we decrease the ratelimiting by a lot, to prevent individual processes | 1448 | * limit we decrease the ratelimiting by a lot, to prevent individual processes |
1450 | * from overshooting the limit by (ratelimit_pages) each. | 1449 | * from overshooting the limit by (ratelimit_pages) each. |
1451 | */ | 1450 | */ |
1452 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 1451 | void balance_dirty_pages_ratelimited(struct address_space *mapping) |
1453 | unsigned long nr_pages_dirtied) | ||
1454 | { | 1452 | { |
1455 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1453 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1456 | int ratelimit; | 1454 | int ratelimit; |
@@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
1484 | */ | 1482 | */ |
1485 | p = &__get_cpu_var(dirty_throttle_leaks); | 1483 | p = &__get_cpu_var(dirty_throttle_leaks); |
1486 | if (*p > 0 && current->nr_dirtied < ratelimit) { | 1484 | if (*p > 0 && current->nr_dirtied < ratelimit) { |
1485 | unsigned long nr_pages_dirtied; | ||
1487 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | 1486 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); |
1488 | *p -= nr_pages_dirtied; | 1487 | *p -= nr_pages_dirtied; |
1489 | current->nr_dirtied += nr_pages_dirtied; | 1488 | current->nr_dirtied += nr_pages_dirtied; |
@@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
1493 | if (unlikely(current->nr_dirtied >= ratelimit)) | 1492 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1494 | balance_dirty_pages(mapping, current->nr_dirtied); | 1493 | balance_dirty_pages(mapping, current->nr_dirtied); |
1495 | } | 1494 | } |
1496 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | 1495 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited); |
1497 | 1496 | ||
1498 | void throttle_vm_writeout(gfp_t gfp_mask) | 1497 | void throttle_vm_writeout(gfp_t gfp_mask) |
1499 | { | 1498 | { |
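Caller-side effect of the rename above: the explicit page-count argument is gone, so writers simply call the throttle once per page they dirty. A hedged sketch of a typical write loop, with illustrative names:

static void sketch_dirty_pages(struct address_space *mapping,
			       struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		set_page_dirty(pages[i]);
		/* was: balance_dirty_pages_ratelimited_nr(mapping, 1) */
		balance_dirty_pages_ratelimited(mapping);
	}
}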
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e208f0ad68c..2ad2ad168efe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
90 | #ifdef CONFIG_HIGHMEM | 90 | #ifdef CONFIG_HIGHMEM |
91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
92 | #endif | 92 | #endif |
93 | #ifdef CONFIG_MOVABLE_NODE | ||
94 | [N_MEMORY] = { { [0] = 1UL } }, | ||
95 | #endif | ||
93 | [N_CPU] = { { [0] = 1UL } }, | 96 | [N_CPU] = { { [0] = 1UL } }, |
94 | #endif /* NUMA */ | 97 | #endif /* NUMA */ |
95 | }; | 98 | }; |
@@ -368,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
368 | int nr_pages = 1 << order; | 371 | int nr_pages = 1 << order; |
369 | int bad = 0; | 372 | int bad = 0; |
370 | 373 | ||
371 | if (unlikely(compound_order(page) != order) || | 374 | if (unlikely(compound_order(page) != order)) { |
372 | unlikely(!PageHead(page))) { | ||
373 | bad_page(page); | 375 | bad_page(page); |
374 | bad++; | 376 | bad++; |
375 | } | 377 | } |
@@ -523,7 +525,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
523 | * If a block is freed, and its buddy is also free, then this | 525 | * If a block is freed, and its buddy is also free, then this |
524 | * triggers coalescing into a block of larger size. | 526 | * triggers coalescing into a block of larger size. |
525 | * | 527 | * |
526 | * -- wli | 528 | * -- nyc |
527 | */ | 529 | */ |
528 | 530 | ||
529 | static inline void __free_one_page(struct page *page, | 531 | static inline void __free_one_page(struct page *page, |
@@ -608,6 +610,7 @@ static inline int free_pages_check(struct page *page) | |||
608 | bad_page(page); | 610 | bad_page(page); |
609 | return 1; | 611 | return 1; |
610 | } | 612 | } |
613 | reset_page_last_nid(page); | ||
611 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 614 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
612 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 615 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
613 | return 0; | 616 | return 0; |
@@ -667,11 +670,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 670 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
668 | __free_one_page(page, zone, 0, mt); | 671 | __free_one_page(page, zone, 0, mt); |
669 | trace_mm_page_pcpu_drain(page, 0, mt); | 672 | trace_mm_page_pcpu_drain(page, 0, mt); |
670 | if (is_migrate_cma(mt)) | 673 | if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { |
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | 674 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
675 | if (is_migrate_cma(mt)) | ||
676 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | ||
677 | } | ||
672 | } while (--to_free && --batch_free && !list_empty(list)); | 678 | } while (--to_free && --batch_free && !list_empty(list)); |
673 | } | 679 | } |
674 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | ||
675 | spin_unlock(&zone->lock); | 680 | spin_unlock(&zone->lock); |
676 | } | 681 | } |
677 | 682 | ||
@@ -730,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
730 | local_irq_restore(flags); | 735 | local_irq_restore(flags); |
731 | } | 736 | } |
732 | 737 | ||
738 | /* | ||
739 | * Read access to zone->managed_pages is safe because it's unsigned long, | ||
740 |  * but we still need to serialize writers. Currently all callers of | ||
741 |  * __free_pages_bootmem() except put_page_bootmem() run only at boot | ||
742 |  * time, so to keep boot time short we shift the burden of serializing | ||
743 |  * writers onto put_page_bootmem(). | ||
744 | */ | ||
733 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 745 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
734 | { | 746 | { |
735 | unsigned int nr_pages = 1 << order; | 747 | unsigned int nr_pages = 1 << order; |
@@ -745,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
745 | set_page_count(p, 0); | 757 | set_page_count(p, 0); |
746 | } | 758 | } |
747 | 759 | ||
760 | page_zone(page)->managed_pages += 1 << order; | ||
748 | set_page_refcounted(page); | 761 | set_page_refcounted(page); |
749 | __free_pages(page, order); | 762 | __free_pages(page, order); |
750 | } | 763 | } |
@@ -780,7 +793,7 @@ void __init init_cma_reserved_pageblock(struct page *page) | |||
780 | * large block of memory acted on by a series of small allocations. | 793 | * large block of memory acted on by a series of small allocations. |
781 | * This behavior is a critical factor in sglist merging's success. | 794 | * This behavior is a critical factor in sglist merging's success. |
782 | * | 795 | * |
783 | * -- wli | 796 | * -- nyc |
784 | */ | 797 | */ |
785 | static inline void expand(struct zone *zone, struct page *page, | 798 | static inline void expand(struct zone *zone, struct page *page, |
786 | int low, int high, struct free_area *area, | 799 | int low, int high, struct free_area *area, |
@@ -1392,21 +1405,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1392 | 1405 | ||
1393 | zone = page_zone(page); | 1406 | zone = page_zone(page); |
1394 | order = page_order(page); | 1407 | order = page_order(page); |
1408 | mt = get_pageblock_migratetype(page); | ||
1395 | 1409 | ||
1396 | /* Obey watermarks as if the page was being allocated */ | 1410 | if (mt != MIGRATE_ISOLATE) { |
1397 | watermark = low_wmark_pages(zone) + (1 << order); | 1411 | /* Obey watermarks as if the page was being allocated */ |
1398 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1412 | watermark = low_wmark_pages(zone) + (1 << order); |
1399 | return 0; | 1413 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
1414 | return 0; | ||
1415 | |||
1416 | __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); | ||
1417 | } | ||
1400 | 1418 | ||
1401 | /* Remove page from free list */ | 1419 | /* Remove page from free list */ |
1402 | list_del(&page->lru); | 1420 | list_del(&page->lru); |
1403 | zone->free_area[order].nr_free--; | 1421 | zone->free_area[order].nr_free--; |
1404 | rmv_page_order(page); | 1422 | rmv_page_order(page); |
1405 | 1423 | ||
1406 | mt = get_pageblock_migratetype(page); | ||
1407 | if (unlikely(mt != MIGRATE_ISOLATE)) | ||
1408 | __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); | ||
1409 | |||
1410 | if (alloc_order != order) | 1424 | if (alloc_order != order) |
1411 | expand(zone, page, alloc_order, order, | 1425 | expand(zone, page, alloc_order, order, |
1412 | &zone->free_area[order], migratetype); | 1426 | &zone->free_area[order], migratetype); |
@@ -1692,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1692 | * | 1706 | * |
1693 | * If the zonelist cache is present in the passed in zonelist, then | 1707 | * If the zonelist cache is present in the passed in zonelist, then |
1694 | * returns a pointer to the allowed node mask (either the current | 1708 | * returns a pointer to the allowed node mask (either the current |
1695 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) | 1709 | * tasks mems_allowed, or node_states[N_MEMORY].) |
1696 | * | 1710 | * |
1697 | * If the zonelist cache is not available for this zonelist, does | 1711 | * If the zonelist cache is not available for this zonelist, does |
1698 | * nothing and returns NULL. | 1712 | * nothing and returns NULL. |
@@ -1721,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1721 | 1735 | ||
1722 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1736 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1723 | &cpuset_current_mems_allowed : | 1737 | &cpuset_current_mems_allowed : |
1724 | &node_states[N_HIGH_MEMORY]; | 1738 | &node_states[N_MEMORY]; |
1725 | return allowednodes; | 1739 | return allowednodes; |
1726 | } | 1740 | } |
1727 | 1741 | ||
@@ -1871,7 +1885,7 @@ zonelist_scan: | |||
1871 | */ | 1885 | */ |
1872 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1886 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1873 | high_zoneidx, nodemask) { | 1887 | high_zoneidx, nodemask) { |
1874 | if (NUMA_BUILD && zlc_active && | 1888 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1875 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1889 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1876 | continue; | 1890 | continue; |
1877 | if ((alloc_flags & ALLOC_CPUSET) && | 1891 | if ((alloc_flags & ALLOC_CPUSET) && |
@@ -1917,7 +1931,8 @@ zonelist_scan: | |||
1917 | classzone_idx, alloc_flags)) | 1931 | classzone_idx, alloc_flags)) |
1918 | goto try_this_zone; | 1932 | goto try_this_zone; |
1919 | 1933 | ||
1920 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | 1934 | if (IS_ENABLED(CONFIG_NUMA) && |
1935 | !did_zlc_setup && nr_online_nodes > 1) { | ||
1921 | /* | 1936 | /* |
1922 | * we do zlc_setup if there are multiple nodes | 1937 | * we do zlc_setup if there are multiple nodes |
1923 | * and before considering the first zone allowed | 1938 | * and before considering the first zone allowed |
@@ -1936,7 +1951,7 @@ zonelist_scan: | |||
1936 | * As we may have just activated ZLC, check if the first | 1951 | * As we may have just activated ZLC, check if the first |
1937 | * eligible zone has failed zone_reclaim recently. | 1952 | * eligible zone has failed zone_reclaim recently. |
1938 | */ | 1953 | */ |
1939 | if (NUMA_BUILD && zlc_active && | 1954 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1940 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1955 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1941 | continue; | 1956 | continue; |
1942 | 1957 | ||
@@ -1962,11 +1977,11 @@ try_this_zone: | |||
1962 | if (page) | 1977 | if (page) |
1963 | break; | 1978 | break; |
1964 | this_zone_full: | 1979 | this_zone_full: |
1965 | if (NUMA_BUILD) | 1980 | if (IS_ENABLED(CONFIG_NUMA)) |
1966 | zlc_mark_zone_full(zonelist, z); | 1981 | zlc_mark_zone_full(zonelist, z); |
1967 | } | 1982 | } |
1968 | 1983 | ||
1969 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1984 | if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { |
1970 | /* Disable zlc cache for second zonelist scan */ | 1985 | /* Disable zlc cache for second zonelist scan */ |
1971 | zlc_active = 0; | 1986 | zlc_active = 0; |
1972 | goto zonelist_scan; | 1987 | goto zonelist_scan; |
@@ -2266,7 +2281,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2266 | return NULL; | 2281 | return NULL; |
2267 | 2282 | ||
2268 | /* After successful reclaim, reconsider all zones for allocation */ | 2283 | /* After successful reclaim, reconsider all zones for allocation */ |
2269 | if (NUMA_BUILD) | 2284 | if (IS_ENABLED(CONFIG_NUMA)) |
2270 | zlc_clear_zones_full(zonelist); | 2285 | zlc_clear_zones_full(zonelist); |
2271 | 2286 | ||
2272 | retry: | 2287 | retry: |
@@ -2412,7 +2427,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2412 | * allowed per node queues are empty and that nodes are | 2427 | * allowed per node queues are empty and that nodes are |
2413 | * over allocated. | 2428 | * over allocated. |
2414 | */ | 2429 | */ |
2415 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2430 | if (IS_ENABLED(CONFIG_NUMA) && |
2431 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
2416 | goto nopage; | 2432 | goto nopage; |
2417 | 2433 | ||
2418 | restart: | 2434 | restart: |
@@ -2596,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2596 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2612 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2597 | unsigned int cpuset_mems_cookie; | 2613 | unsigned int cpuset_mems_cookie; |
2598 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | 2614 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; |
2615 | struct mem_cgroup *memcg = NULL; | ||
2599 | 2616 | ||
2600 | gfp_mask &= gfp_allowed_mask; | 2617 | gfp_mask &= gfp_allowed_mask; |
2601 | 2618 | ||
@@ -2614,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2614 | if (unlikely(!zonelist->_zonerefs->zone)) | 2631 | if (unlikely(!zonelist->_zonerefs->zone)) |
2615 | return NULL; | 2632 | return NULL; |
2616 | 2633 | ||
2634 | /* | ||
2635 | * Will only have any effect when __GFP_KMEMCG is set. This is | ||
2636 | * verified in the (always inline) callee | ||
2637 | */ | ||
2638 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2639 | return NULL; | ||
2640 | |||
2617 | retry_cpuset: | 2641 | retry_cpuset: |
2618 | cpuset_mems_cookie = get_mems_allowed(); | 2642 | cpuset_mems_cookie = get_mems_allowed(); |
2619 | 2643 | ||
@@ -2649,6 +2673,8 @@ out: | |||
2649 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2673 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
2650 | goto retry_cpuset; | 2674 | goto retry_cpuset; |
2651 | 2675 | ||
2676 | memcg_kmem_commit_charge(page, memcg, order); | ||
2677 | |||
2652 | return page; | 2678 | return page; |
2653 | } | 2679 | } |
2654 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2680 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
@@ -2701,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2701 | 2727 | ||
2702 | EXPORT_SYMBOL(free_pages); | 2728 | EXPORT_SYMBOL(free_pages); |
2703 | 2729 | ||
2730 | /* | ||
2731 | * __free_memcg_kmem_pages and free_memcg_kmem_pages will free | ||
2732 | * pages allocated with __GFP_KMEMCG. | ||
2733 | * | ||
2734 | * Those pages are accounted to a particular memcg, embedded in the | ||
2735 | * corresponding page_cgroup. To avoid adding a hit in the allocator to search | ||
2736 | * for that information only to find out that it is NULL for users who have no | ||
2737 | * interest in that whatsoever, we provide these functions. | ||
2738 | * | ||
2739 | * The caller knows better which flags it relies on. | ||
2740 | */ | ||
2741 | void __free_memcg_kmem_pages(struct page *page, unsigned int order) | ||
2742 | { | ||
2743 | memcg_kmem_uncharge_pages(page, order); | ||
2744 | __free_pages(page, order); | ||
2745 | } | ||
2746 | |||
2747 | void free_memcg_kmem_pages(unsigned long addr, unsigned int order) | ||
2748 | { | ||
2749 | if (addr != 0) { | ||
2750 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | ||
2751 | __free_memcg_kmem_pages(virt_to_page((void *)addr), order); | ||
2752 | } | ||
2753 | } | ||
2754 | |||
2704 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | 2755 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) |
2705 | { | 2756 | { |
2706 | if (addr) { | 2757 | if (addr) { |
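The memcg hooks added to __alloc_pages_nodemask() and the free_memcg_kmem_pages() helpers above pair up for allocations made with the new __GFP_KMEMCG flag. A hedged caller sketch; the wrapper names are illustrative:

static void *sketch_kmemcg_alloc(gfp_t gfp, unsigned int order)
{
	/* the charge happens inside __alloc_pages_nodemask() via
	 * memcg_kmem_newpage_charge()/memcg_kmem_commit_charge() */
	struct page *page = alloc_pages(gfp | __GFP_KMEMCG, order);

	return page ? page_address(page) : NULL;
}

static void sketch_kmemcg_free(void *addr, unsigned int order)
{
	/* uncharges the memcg before handing the pages back */
	free_memcg_kmem_pages((unsigned long)addr, order);
}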
@@ -2819,7 +2870,7 @@ unsigned int nr_free_pagecache_pages(void) | |||
2819 | 2870 | ||
2820 | static inline void show_node(struct zone *zone) | 2871 | static inline void show_node(struct zone *zone) |
2821 | { | 2872 | { |
2822 | if (NUMA_BUILD) | 2873 | if (IS_ENABLED(CONFIG_NUMA)) |
2823 | printk("Node %d ", zone_to_nid(zone)); | 2874 | printk("Node %d ", zone_to_nid(zone)); |
2824 | } | 2875 | } |
2825 | 2876 | ||
@@ -2877,6 +2928,31 @@ out: | |||
2877 | 2928 | ||
2878 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 2929 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
2879 | 2930 | ||
2931 | static void show_migration_types(unsigned char type) | ||
2932 | { | ||
2933 | static const char types[MIGRATE_TYPES] = { | ||
2934 | [MIGRATE_UNMOVABLE] = 'U', | ||
2935 | [MIGRATE_RECLAIMABLE] = 'E', | ||
2936 | [MIGRATE_MOVABLE] = 'M', | ||
2937 | [MIGRATE_RESERVE] = 'R', | ||
2938 | #ifdef CONFIG_CMA | ||
2939 | [MIGRATE_CMA] = 'C', | ||
2940 | #endif | ||
2941 | [MIGRATE_ISOLATE] = 'I', | ||
2942 | }; | ||
2943 | char tmp[MIGRATE_TYPES + 1]; | ||
2944 | char *p = tmp; | ||
2945 | int i; | ||
2946 | |||
2947 | for (i = 0; i < MIGRATE_TYPES; i++) { | ||
2948 | if (type & (1 << i)) | ||
2949 | *p++ = types[i]; | ||
2950 | } | ||
2951 | |||
2952 | *p = '\0'; | ||
2953 | printk("(%s) ", tmp); | ||
2954 | } | ||
2955 | |||
2880 | /* | 2956 | /* |
2881 | * Show free area list (used inside shift_scroll-lock stuff) | 2957 | * Show free area list (used inside shift_scroll-lock stuff) |
2882 | * We also calculate the percentage fragmentation. We do this by counting the | 2958 | * We also calculate the percentage fragmentation. We do this by counting the |
@@ -2951,6 +3027,7 @@ void show_free_areas(unsigned int filter) | |||
2951 | " isolated(anon):%lukB" | 3027 | " isolated(anon):%lukB" |
2952 | " isolated(file):%lukB" | 3028 | " isolated(file):%lukB" |
2953 | " present:%lukB" | 3029 | " present:%lukB" |
3030 | " managed:%lukB" | ||
2954 | " mlocked:%lukB" | 3031 | " mlocked:%lukB" |
2955 | " dirty:%lukB" | 3032 | " dirty:%lukB" |
2956 | " writeback:%lukB" | 3033 | " writeback:%lukB" |
@@ -2980,6 +3057,7 @@ void show_free_areas(unsigned int filter) | |||
2980 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 3057 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
2981 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | 3058 | K(zone_page_state(zone, NR_ISOLATED_FILE)), |
2982 | K(zone->present_pages), | 3059 | K(zone->present_pages), |
3060 | K(zone->managed_pages), | ||
2983 | K(zone_page_state(zone, NR_MLOCK)), | 3061 | K(zone_page_state(zone, NR_MLOCK)), |
2984 | K(zone_page_state(zone, NR_FILE_DIRTY)), | 3062 | K(zone_page_state(zone, NR_FILE_DIRTY)), |
2985 | K(zone_page_state(zone, NR_WRITEBACK)), | 3063 | K(zone_page_state(zone, NR_WRITEBACK)), |
@@ -3005,6 +3083,7 @@ void show_free_areas(unsigned int filter) | |||
3005 | 3083 | ||
3006 | for_each_populated_zone(zone) { | 3084 | for_each_populated_zone(zone) { |
3007 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 3085 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
3086 | unsigned char types[MAX_ORDER]; | ||
3008 | 3087 | ||
3009 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3088 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
3010 | continue; | 3089 | continue; |
@@ -3013,12 +3092,24 @@ void show_free_areas(unsigned int filter) | |||
3013 | 3092 | ||
3014 | spin_lock_irqsave(&zone->lock, flags); | 3093 | spin_lock_irqsave(&zone->lock, flags); |
3015 | for (order = 0; order < MAX_ORDER; order++) { | 3094 | for (order = 0; order < MAX_ORDER; order++) { |
3016 | nr[order] = zone->free_area[order].nr_free; | 3095 | struct free_area *area = &zone->free_area[order]; |
3096 | int type; | ||
3097 | |||
3098 | nr[order] = area->nr_free; | ||
3017 | total += nr[order] << order; | 3099 | total += nr[order] << order; |
3100 | |||
3101 | types[order] = 0; | ||
3102 | for (type = 0; type < MIGRATE_TYPES; type++) { | ||
3103 | if (!list_empty(&area->free_list[type])) | ||
3104 | types[order] |= 1 << type; | ||
3105 | } | ||
3018 | } | 3106 | } |
3019 | spin_unlock_irqrestore(&zone->lock, flags); | 3107 | spin_unlock_irqrestore(&zone->lock, flags); |
3020 | for (order = 0; order < MAX_ORDER; order++) | 3108 | for (order = 0; order < MAX_ORDER; order++) { |
3021 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | 3109 | printk("%lu*%lukB ", nr[order], K(1UL) << order); |
3110 | if (nr[order]) | ||
3111 | show_migration_types(types[order]); | ||
3112 | } | ||
3022 | printk("= %lukB\n", K(total)); | 3113 | printk("= %lukB\n", K(total)); |
3023 | } | 3114 | } |
3024 | 3115 | ||
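The per-order loop above now records, for every order, a bitmask of the migratetypes that still have free blocks, and show_migration_types() prints it as single letters (U/E/M/R/C/I). A minimal user-space sketch of the same bitmask-to-letters decoding; the enum values and the CMA entry below are assumptions mirroring the hunk above, not taken from a kernel header:

#include <stdio.h>

/* Mirrors the migratetype ordering used by show_migration_types() above. */
enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, CMA, ISOLATE, NR_TYPES };

static void decode_types(unsigned char mask)
{
        static const char letters[NR_TYPES] = { 'U', 'E', 'M', 'R', 'C', 'I' };
        char tmp[NR_TYPES + 1];
        char *p = tmp;
        int i;

        for (i = 0; i < NR_TYPES; i++)
                if (mask & (1 << i))
                        *p++ = letters[i];
        *p = '\0';
        printf("(%s)\n", tmp);
}

int main(void)
{
        /* e.g. a free list that still holds movable and CMA blocks */
        decode_types((1 << MOVABLE) | (1 << CMA));      /* prints "(MC)" */
        return 0;
}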
@@ -3195,7 +3286,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
3195 | return node; | 3286 | return node; |
3196 | } | 3287 | } |
3197 | 3288 | ||
3198 | for_each_node_state(n, N_HIGH_MEMORY) { | 3289 | for_each_node_state(n, N_MEMORY) { |
3199 | 3290 | ||
3200 | /* Don't want a node to appear more than once */ | 3291 | /* Don't want a node to appear more than once */ |
3201 | if (node_isset(n, *used_node_mask)) | 3292 | if (node_isset(n, *used_node_mask)) |
@@ -3337,7 +3428,7 @@ static int default_zonelist_order(void) | |||
3337 | * local memory, NODE_ORDER may be suitable. | 3428 | * local memory, NODE_ORDER may be suitable. |
3338 | */ | 3429 | */ |
3339 | average_size = total_size / | 3430 | average_size = total_size / |
3340 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 3431 | (nodes_weight(node_states[N_MEMORY]) + 1); |
3341 | for_each_online_node(nid) { | 3432 | for_each_online_node(nid) { |
3342 | low_kmem_size = 0; | 3433 | low_kmem_size = 0; |
3343 | total_size = 0; | 3434 | total_size = 0; |
@@ -3827,6 +3918,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3827 | mminit_verify_page_links(page, zone, nid, pfn); | 3918 | mminit_verify_page_links(page, zone, nid, pfn); |
3828 | init_page_count(page); | 3919 | init_page_count(page); |
3829 | reset_page_mapcount(page); | 3920 | reset_page_mapcount(page); |
3921 | reset_page_last_nid(page); | ||
3830 | SetPageReserved(page); | 3922 | SetPageReserved(page); |
3831 | /* | 3923 | /* |
3832 | * Mark the block movable so that blocks are reserved for | 3924 | * Mark the block movable so that blocks are reserved for |
@@ -4433,6 +4525,26 @@ void __init set_pageblock_order(void) | |||
4433 | 4525 | ||
4434 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4526 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4435 | 4527 | ||
4528 | static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | ||
4529 | unsigned long present_pages) | ||
4530 | { | ||
4531 | unsigned long pages = spanned_pages; | ||
4532 | |||
4533 | /* | ||
4534 | * Provide a more accurate estimation if there are holes within | ||
4535 | * the zone and SPARSEMEM is in use. If there are holes within the | ||
4536 | * zone, each populated memory region may cost us one or two extra | ||
4537 | * memmap pages due to alignment, because the memmap pages for each | ||
4538 | * populated region may not be naturally aligned on a page boundary. | ||
4539 | * So the (present_pages >> 4) heuristic is a tradeoff for that. | ||
4540 | */ | ||
4541 | if (spanned_pages > present_pages + (present_pages >> 4) && | ||
4542 | IS_ENABLED(CONFIG_SPARSEMEM)) | ||
4543 | pages = present_pages; | ||
4544 | |||
4545 | return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; | ||
4546 | } | ||
4547 | |||
4436 | /* | 4548 | /* |
4437 | * Set up the zone data structures: | 4549 | * Set up the zone data structures: |
4438 | * - mark all pages reserved | 4550 | * - mark all pages reserved |
@@ -4450,54 +4562,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4450 | int ret; | 4562 | int ret; |
4451 | 4563 | ||
4452 | pgdat_resize_init(pgdat); | 4564 | pgdat_resize_init(pgdat); |
4565 | #ifdef CONFIG_NUMA_BALANCING | ||
4566 | spin_lock_init(&pgdat->numabalancing_migrate_lock); | ||
4567 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
4568 | pgdat->numabalancing_migrate_next_window = jiffies; | ||
4569 | #endif | ||
4453 | init_waitqueue_head(&pgdat->kswapd_wait); | 4570 | init_waitqueue_head(&pgdat->kswapd_wait); |
4454 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4571 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4455 | pgdat_page_cgroup_init(pgdat); | 4572 | pgdat_page_cgroup_init(pgdat); |
4456 | 4573 | ||
4457 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4574 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4458 | struct zone *zone = pgdat->node_zones + j; | 4575 | struct zone *zone = pgdat->node_zones + j; |
4459 | unsigned long size, realsize, memmap_pages; | 4576 | unsigned long size, realsize, freesize, memmap_pages; |
4460 | 4577 | ||
4461 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4578 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4462 | realsize = size - zone_absent_pages_in_node(nid, j, | 4579 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, |
4463 | zholes_size); | 4580 | zholes_size); |
4464 | 4581 | ||
4465 | /* | 4582 | /* |
4466 | * Adjust realsize so that it accounts for how much memory | 4583 | * Adjust freesize so that it accounts for how much memory |
4467 | * is used by this zone for memmap. This affects the watermark | 4584 | * is used by this zone for memmap. This affects the watermark |
4468 | * and per-cpu initialisations | 4585 | * and per-cpu initialisations |
4469 | */ | 4586 | */ |
4470 | memmap_pages = | 4587 | memmap_pages = calc_memmap_size(size, realsize); |
4471 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 4588 | if (freesize >= memmap_pages) { |
4472 | if (realsize >= memmap_pages) { | 4589 | freesize -= memmap_pages; |
4473 | realsize -= memmap_pages; | ||
4474 | if (memmap_pages) | 4590 | if (memmap_pages) |
4475 | printk(KERN_DEBUG | 4591 | printk(KERN_DEBUG |
4476 | " %s zone: %lu pages used for memmap\n", | 4592 | " %s zone: %lu pages used for memmap\n", |
4477 | zone_names[j], memmap_pages); | 4593 | zone_names[j], memmap_pages); |
4478 | } else | 4594 | } else |
4479 | printk(KERN_WARNING | 4595 | printk(KERN_WARNING |
4480 | " %s zone: %lu pages exceeds realsize %lu\n", | 4596 | " %s zone: %lu pages exceeds freesize %lu\n", |
4481 | zone_names[j], memmap_pages, realsize); | 4597 | zone_names[j], memmap_pages, freesize); |
4482 | 4598 | ||
4483 | /* Account for reserved pages */ | 4599 | /* Account for reserved pages */ |
4484 | if (j == 0 && realsize > dma_reserve) { | 4600 | if (j == 0 && freesize > dma_reserve) { |
4485 | realsize -= dma_reserve; | 4601 | freesize -= dma_reserve; |
4486 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 4602 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
4487 | zone_names[0], dma_reserve); | 4603 | zone_names[0], dma_reserve); |
4488 | } | 4604 | } |
4489 | 4605 | ||
4490 | if (!is_highmem_idx(j)) | 4606 | if (!is_highmem_idx(j)) |
4491 | nr_kernel_pages += realsize; | 4607 | nr_kernel_pages += freesize; |
4492 | nr_all_pages += realsize; | 4608 | /* Charge for highmem memmap if there are enough kernel pages */ |
4609 | else if (nr_kernel_pages > memmap_pages * 2) | ||
4610 | nr_kernel_pages -= memmap_pages; | ||
4611 | nr_all_pages += freesize; | ||
4493 | 4612 | ||
4494 | zone->spanned_pages = size; | 4613 | zone->spanned_pages = size; |
4495 | zone->present_pages = realsize; | 4614 | zone->present_pages = freesize; |
4615 | /* | ||
4616 | * Set an approximate value for lowmem here; it will be adjusted | ||
4617 | * when the bootmem allocator frees pages into the buddy system. | ||
4618 | * And all highmem pages will be managed by the buddy system. | ||
4619 | */ | ||
4620 | zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; | ||
4496 | #ifdef CONFIG_NUMA | 4621 | #ifdef CONFIG_NUMA |
4497 | zone->node = nid; | 4622 | zone->node = nid; |
4498 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4623 | zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) |
4499 | / 100; | 4624 | / 100; |
4500 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 4625 | zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; |
4501 | #endif | 4626 | #endif |
4502 | zone->name = zone_names[j]; | 4627 | zone->name = zone_names[j]; |
4503 | spin_lock_init(&zone->lock); | 4628 | spin_lock_init(&zone->lock); |
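calc_memmap_size() above estimates how many pages the zone's struct page array will occupy, and falls back from spanned_pages to present_pages when SPARSEMEM holes would inflate the spanned-based estimate by more than present_pages/16. A stand-alone sketch of that arithmetic; the 4 KiB page size and 64-byte struct page are illustrative assumptions, not values taken from this patch:

#include <stdio.h>

#define PAGE_SHIFT      12                      /* assumed: 4 KiB pages */
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SIZE 64UL                   /* assumed sizeof(struct page) */
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long memmap_pages(unsigned long spanned, unsigned long present)
{
        unsigned long pages = spanned;

        /* same 1/16 slack as the kernel heuristic before trusting "present" */
        if (spanned > present + (present >> 4))
                pages = present;

        return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
        /* 1 GiB zone with no holes: 262144 pages -> 4096 memmap pages (16 MiB) */
        printf("%lu\n", memmap_pages(262144, 262144));
        /* same span, but only half the pages are present (SPARSEMEM-style holes) */
        printf("%lu\n", memmap_pages(262144, 131072));
        return 0;
}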
@@ -4688,7 +4813,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
4688 | /* | 4813 | /* |
4689 | * early_calculate_totalpages() | 4814 | * early_calculate_totalpages() |
4690 | * Sum pages in active regions for movable zone. | 4815 | * Sum pages in active regions for movable zone. |
4691 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 4816 | * Populate N_MEMORY for calculating usable_nodes. |
4692 | */ | 4817 | */ |
4693 | static unsigned long __init early_calculate_totalpages(void) | 4818 | static unsigned long __init early_calculate_totalpages(void) |
4694 | { | 4819 | { |
@@ -4701,7 +4826,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4701 | 4826 | ||
4702 | totalpages += pages; | 4827 | totalpages += pages; |
4703 | if (pages) | 4828 | if (pages) |
4704 | node_set_state(nid, N_HIGH_MEMORY); | 4829 | node_set_state(nid, N_MEMORY); |
4705 | } | 4830 | } |
4706 | return totalpages; | 4831 | return totalpages; |
4707 | } | 4832 | } |
@@ -4718,9 +4843,9 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4718 | unsigned long usable_startpfn; | 4843 | unsigned long usable_startpfn; |
4719 | unsigned long kernelcore_node, kernelcore_remaining; | 4844 | unsigned long kernelcore_node, kernelcore_remaining; |
4720 | /* save the state before borrow the nodemask */ | 4845 | /* save the state before borrow the nodemask */ |
4721 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | 4846 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
4722 | unsigned long totalpages = early_calculate_totalpages(); | 4847 | unsigned long totalpages = early_calculate_totalpages(); |
4723 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4848 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
4724 | 4849 | ||
4725 | /* | 4850 | /* |
4726 | * If movablecore was specified, calculate what size of | 4851 | * If movablecore was specified, calculate what size of |
@@ -4755,7 +4880,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4755 | restart: | 4880 | restart: |
4756 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 4881 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
4757 | kernelcore_node = required_kernelcore / usable_nodes; | 4882 | kernelcore_node = required_kernelcore / usable_nodes; |
4758 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4883 | for_each_node_state(nid, N_MEMORY) { |
4759 | unsigned long start_pfn, end_pfn; | 4884 | unsigned long start_pfn, end_pfn; |
4760 | 4885 | ||
4761 | /* | 4886 | /* |
@@ -4847,23 +4972,27 @@ restart: | |||
4847 | 4972 | ||
4848 | out: | 4973 | out: |
4849 | /* restore the node_state */ | 4974 | /* restore the node_state */ |
4850 | node_states[N_HIGH_MEMORY] = saved_node_state; | 4975 | node_states[N_MEMORY] = saved_node_state; |
4851 | } | 4976 | } |
4852 | 4977 | ||
4853 | /* Any regular memory on that node? */ | 4978 | /* Any regular or high memory on that node? */ |
4854 | static void __init check_for_regular_memory(pg_data_t *pgdat) | 4979 | static void check_for_memory(pg_data_t *pgdat, int nid) |
4855 | { | 4980 | { |
4856 | #ifdef CONFIG_HIGHMEM | ||
4857 | enum zone_type zone_type; | 4981 | enum zone_type zone_type; |
4858 | 4982 | ||
4859 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4983 | if (N_MEMORY == N_NORMAL_MEMORY) |
4984 | return; | ||
4985 | |||
4986 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | ||
4860 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4987 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4861 | if (zone->present_pages) { | 4988 | if (zone->present_pages) { |
4862 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4989 | node_set_state(nid, N_HIGH_MEMORY); |
4990 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | ||
4991 | zone_type <= ZONE_NORMAL) | ||
4992 | node_set_state(nid, N_NORMAL_MEMORY); | ||
4863 | break; | 4993 | break; |
4864 | } | 4994 | } |
4865 | } | 4995 | } |
4866 | #endif | ||
4867 | } | 4996 | } |
4868 | 4997 | ||
4869 | /** | 4998 | /** |
@@ -4946,8 +5075,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4946 | 5075 | ||
4947 | /* Any memory on that node */ | 5076 | /* Any memory on that node */ |
4948 | if (pgdat->node_present_pages) | 5077 | if (pgdat->node_present_pages) |
4949 | node_set_state(nid, N_HIGH_MEMORY); | 5078 | node_set_state(nid, N_MEMORY); |
4950 | check_for_regular_memory(pgdat); | 5079 | check_for_memory(pgdat, nid); |
4951 | } | 5080 | } |
4952 | } | 5081 | } |
4953 | 5082 | ||
@@ -5175,10 +5304,6 @@ static void __setup_per_zone_wmarks(void) | |||
5175 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5304 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5176 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5305 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5177 | 5306 | ||
5178 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | ||
5179 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | ||
5180 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | ||
5181 | |||
5182 | setup_zone_migrate_reserve(zone); | 5307 | setup_zone_migrate_reserve(zone); |
5183 | spin_unlock_irqrestore(&zone->lock, flags); | 5308 | spin_unlock_irqrestore(&zone->lock, flags); |
5184 | } | 5309 | } |
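With the cma_wmark_pages() terms dropped above, a zone's watermarks are again derived purely from its proportional share of min_free_kbytes: low sits tmp/4 above min and high sits tmp/2 above it. A stand-alone sketch of that derivation; the 1024-page zone share is an illustrative assumption:

#include <stdio.h>

/*
 * Mirrors the arithmetic left in __setup_per_zone_wmarks() after the
 * cma_wmark_pages() terms were removed: low/high are fixed offsets
 * above min, computed from the zone's proportional share "tmp".
 */
int main(void)
{
        unsigned long tmp = 1024;               /* assumed zone share, in pages */
        unsigned long wmark_min = tmp;          /* non-highmem zone */
        unsigned long wmark_low = wmark_min + (tmp >> 2);
        unsigned long wmark_high = wmark_min + (tmp >> 1);

        printf("min=%lu low=%lu high=%lu\n", wmark_min, wmark_low, wmark_high);
        /* prints: min=1024 low=1280 high=1536 */
        return 0;
}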
@@ -5576,7 +5701,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5576 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't | 5701 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't |
5577 | * expect this function should be exact. | 5702 | * expect this function should be exact. |
5578 | */ | 5703 | */ |
5579 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | 5704 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
5705 | bool skip_hwpoisoned_pages) | ||
5580 | { | 5706 | { |
5581 | unsigned long pfn, iter, found; | 5707 | unsigned long pfn, iter, found; |
5582 | int mt; | 5708 | int mt; |
@@ -5611,6 +5737,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |||
5611 | continue; | 5737 | continue; |
5612 | } | 5738 | } |
5613 | 5739 | ||
5740 | /* | ||
5741 | * The HWPoisoned page may not be in the buddy system, and | ||
5742 | * its page_count() is not 0. | ||
5743 | */ | ||
5744 | if (skip_hwpoisoned_pages && PageHWPoison(page)) | ||
5745 | continue; | ||
5746 | |||
5614 | if (!PageLRU(page)) | 5747 | if (!PageLRU(page)) |
5615 | found++; | 5748 | found++; |
5616 | /* | 5749 | /* |
@@ -5653,7 +5786,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5653 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5786 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5654 | return false; | 5787 | return false; |
5655 | 5788 | ||
5656 | return !has_unmovable_pages(zone, page, 0); | 5789 | return !has_unmovable_pages(zone, page, 0, true); |
5657 | } | 5790 | } |
5658 | 5791 | ||
5659 | #ifdef CONFIG_CMA | 5792 | #ifdef CONFIG_CMA |
@@ -5680,7 +5813,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5680 | unsigned int tries = 0; | 5813 | unsigned int tries = 0; |
5681 | int ret = 0; | 5814 | int ret = 0; |
5682 | 5815 | ||
5683 | migrate_prep_local(); | 5816 | migrate_prep(); |
5684 | 5817 | ||
5685 | while (pfn < end || !list_empty(&cc->migratepages)) { | 5818 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5686 | if (fatal_signal_pending(current)) { | 5819 | if (fatal_signal_pending(current)) { |
@@ -5708,61 +5841,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5708 | 5841 | ||
5709 | ret = migrate_pages(&cc->migratepages, | 5842 | ret = migrate_pages(&cc->migratepages, |
5710 | alloc_migrate_target, | 5843 | alloc_migrate_target, |
5711 | 0, false, MIGRATE_SYNC); | 5844 | 0, false, MIGRATE_SYNC, |
5845 | MR_CMA); | ||
5712 | } | 5846 | } |
5713 | 5847 | ||
5714 | putback_lru_pages(&cc->migratepages); | 5848 | putback_movable_pages(&cc->migratepages); |
5715 | return ret > 0 ? 0 : ret; | 5849 | return ret > 0 ? 0 : ret; |
5716 | } | 5850 | } |
5717 | 5851 | ||
5718 | /* | ||
5719 | * Update zone's cma pages counter used for watermark level calculation. | ||
5720 | */ | ||
5721 | static inline void __update_cma_watermarks(struct zone *zone, int count) | ||
5722 | { | ||
5723 | unsigned long flags; | ||
5724 | spin_lock_irqsave(&zone->lock, flags); | ||
5725 | zone->min_cma_pages += count; | ||
5726 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5727 | setup_per_zone_wmarks(); | ||
5728 | } | ||
5729 | |||
5730 | /* | ||
5731 | * Trigger memory pressure bump to reclaim some pages in order to be able to | ||
5732 | * allocate 'count' pages in single page units. Does similar work as | ||
5733 | *__alloc_pages_slowpath() function. | ||
5734 | */ | ||
5735 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | ||
5736 | { | ||
5737 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
5738 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | ||
5739 | int did_some_progress = 0; | ||
5740 | int order = 1; | ||
5741 | |||
5742 | /* | ||
5743 | * Increase level of watermarks to force kswapd do his job | ||
5744 | * to stabilise at new watermark level. | ||
5745 | */ | ||
5746 | __update_cma_watermarks(zone, count); | ||
5747 | |||
5748 | /* Obey watermarks as if the page was being allocated */ | ||
5749 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { | ||
5750 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | ||
5751 | |||
5752 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | ||
5753 | NULL); | ||
5754 | if (!did_some_progress) { | ||
5755 | /* Exhausted what can be done so it's blamo time */ | ||
5756 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | ||
5757 | } | ||
5758 | } | ||
5759 | |||
5760 | /* Restore original watermark levels. */ | ||
5761 | __update_cma_watermarks(zone, -count); | ||
5762 | |||
5763 | return count; | ||
5764 | } | ||
5765 | |||
5766 | /** | 5852 | /** |
5767 | * alloc_contig_range() -- tries to allocate given range of pages | 5853 | * alloc_contig_range() -- tries to allocate given range of pages |
5768 | * @start: start PFN to allocate | 5854 | * @start: start PFN to allocate |
@@ -5786,7 +5872,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | |||
5786 | int alloc_contig_range(unsigned long start, unsigned long end, | 5872 | int alloc_contig_range(unsigned long start, unsigned long end, |
5787 | unsigned migratetype) | 5873 | unsigned migratetype) |
5788 | { | 5874 | { |
5789 | struct zone *zone = page_zone(pfn_to_page(start)); | ||
5790 | unsigned long outer_start, outer_end; | 5875 | unsigned long outer_start, outer_end; |
5791 | int ret = 0, order; | 5876 | int ret = 0, order; |
5792 | 5877 | ||
@@ -5824,7 +5909,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5824 | */ | 5909 | */ |
5825 | 5910 | ||
5826 | ret = start_isolate_page_range(pfn_max_align_down(start), | 5911 | ret = start_isolate_page_range(pfn_max_align_down(start), |
5827 | pfn_max_align_up(end), migratetype); | 5912 | pfn_max_align_up(end), migratetype, |
5913 | false); | ||
5828 | if (ret) | 5914 | if (ret) |
5829 | return ret; | 5915 | return ret; |
5830 | 5916 | ||
@@ -5863,18 +5949,13 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5863 | } | 5949 | } |
5864 | 5950 | ||
5865 | /* Make sure the range is really isolated. */ | 5951 | /* Make sure the range is really isolated. */ |
5866 | if (test_pages_isolated(outer_start, end)) { | 5952 | if (test_pages_isolated(outer_start, end, false)) { |
5867 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", | 5953 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", |
5868 | outer_start, end); | 5954 | outer_start, end); |
5869 | ret = -EBUSY; | 5955 | ret = -EBUSY; |
5870 | goto done; | 5956 | goto done; |
5871 | } | 5957 | } |
5872 | 5958 | ||
5873 | /* | ||
5874 | * Reclaim enough pages to make sure that contiguous allocation | ||
5875 | * will not starve the system. | ||
5876 | */ | ||
5877 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | ||
5878 | 5959 | ||
5879 | /* Grab isolated pages from freelists. */ | 5960 | /* Grab isolated pages from freelists. */ |
5880 | outer_end = isolate_freepages_range(&cc, outer_start, end); | 5961 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
@@ -5932,7 +6013,6 @@ void __meminit zone_pcp_update(struct zone *zone) | |||
5932 | } | 6013 | } |
5933 | #endif | 6014 | #endif |
5934 | 6015 | ||
5935 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
5936 | void zone_pcp_reset(struct zone *zone) | 6016 | void zone_pcp_reset(struct zone *zone) |
5937 | { | 6017 | { |
5938 | unsigned long flags; | 6018 | unsigned long flags; |
@@ -5952,6 +6032,7 @@ void zone_pcp_reset(struct zone *zone) | |||
5952 | local_irq_restore(flags); | 6032 | local_irq_restore(flags); |
5953 | } | 6033 | } |
5954 | 6034 | ||
6035 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
5955 | /* | 6036 | /* |
5956 | * All pages in the range must be isolated before calling this. | 6037 | * All pages in the range must be isolated before calling this. |
5957 | */ | 6038 | */ |
@@ -5978,6 +6059,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5978 | continue; | 6059 | continue; |
5979 | } | 6060 | } |
5980 | page = pfn_to_page(pfn); | 6061 | page = pfn_to_page(pfn); |
6062 | /* | ||
6063 | * The HWPoisoned page may not be in the buddy system, and | ||
6064 | * its page_count() is not 0. | ||
6065 | */ | ||
6066 | if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { | ||
6067 | pfn++; | ||
6068 | SetPageReserved(page); | ||
6069 | continue; | ||
6070 | } | ||
6071 | |||
5981 | BUG_ON(page_count(page)); | 6072 | BUG_ON(page_count(page)); |
5982 | BUG_ON(!PageBuddy(page)); | 6073 | BUG_ON(!PageBuddy(page)); |
5983 | order = page_order(page); | 6074 | order = page_order(page); |
@@ -5988,8 +6079,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
5988 | list_del(&page->lru); | 6079 | list_del(&page->lru); |
5989 | rmv_page_order(page); | 6080 | rmv_page_order(page); |
5990 | zone->free_area[order].nr_free--; | 6081 | zone->free_area[order].nr_free--; |
5991 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
5992 | - (1UL << order)); | ||
5993 | for (i = 0; i < (1 << order); i++) | 6082 | for (i = 0; i < (1 << order); i++) |
5994 | SetPageReserved((page+i)); | 6083 | SetPageReserved((page+i)); |
5995 | pfn += (1 << order); | 6084 | pfn += (1 << order); |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5ddad0c6daa6..6d757e3a872a 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, | |||
251 | mn->nr_pages, mn->status_change_nid); | 251 | mn->nr_pages, mn->status_change_nid); |
252 | break; | 252 | break; |
253 | case MEM_CANCEL_ONLINE: | 253 | case MEM_CANCEL_ONLINE: |
254 | offline_page_cgroup(mn->start_pfn, | ||
255 | mn->nr_pages, mn->status_change_nid); | ||
256 | break; | ||
254 | case MEM_GOING_OFFLINE: | 257 | case MEM_GOING_OFFLINE: |
255 | break; | 258 | break; |
256 | case MEM_ONLINE: | 259 | case MEM_ONLINE: |
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void) | |||
271 | if (mem_cgroup_disabled()) | 274 | if (mem_cgroup_disabled()) |
272 | return; | 275 | return; |
273 | 276 | ||
274 | for_each_node_state(nid, N_HIGH_MEMORY) { | 277 | for_each_node_state(nid, N_MEMORY) { |
275 | unsigned long start_pfn, end_pfn; | 278 | unsigned long start_pfn, end_pfn; |
276 | 279 | ||
277 | start_pfn = node_start_pfn(nid); | 280 | start_pfn = node_start_pfn(nid); |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f2f5b4818e94..9d2264ea4606 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype) | |||
30 | zone->nr_pageblock_isolate--; | 30 | zone->nr_pageblock_isolate--; |
31 | } | 31 | } |
32 | 32 | ||
33 | int set_migratetype_isolate(struct page *page) | 33 | int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) |
34 | { | 34 | { |
35 | struct zone *zone; | 35 | struct zone *zone; |
36 | unsigned long flags, pfn; | 36 | unsigned long flags, pfn; |
@@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page) | |||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | 66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. |
67 | * We just check MOVABLE pages. | 67 | * We just check MOVABLE pages. |
68 | */ | 68 | */ |
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | 69 | if (!has_unmovable_pages(zone, page, arg.pages_found, |
70 | skip_hwpoisoned_pages)) | ||
70 | ret = 0; | 71 | ret = 0; |
71 | 72 | ||
72 | /* | 73 | /* |
@@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) | |||
134 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. | 135 | * Returns 0 on success and -EBUSY if any part of range cannot be isolated. |
135 | */ | 136 | */ |
136 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | 137 | int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, |
137 | unsigned migratetype) | 138 | unsigned migratetype, bool skip_hwpoisoned_pages) |
138 | { | 139 | { |
139 | unsigned long pfn; | 140 | unsigned long pfn; |
140 | unsigned long undo_pfn; | 141 | unsigned long undo_pfn; |
@@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
147 | pfn < end_pfn; | 148 | pfn < end_pfn; |
148 | pfn += pageblock_nr_pages) { | 149 | pfn += pageblock_nr_pages) { |
149 | page = __first_valid_page(pfn, pageblock_nr_pages); | 150 | page = __first_valid_page(pfn, pageblock_nr_pages); |
150 | if (page && set_migratetype_isolate(page)) { | 151 | if (page && |
152 | set_migratetype_isolate(page, skip_hwpoisoned_pages)) { | ||
151 | undo_pfn = pfn; | 153 | undo_pfn = pfn; |
152 | goto undo; | 154 | goto undo; |
153 | } | 155 | } |
@@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
190 | * Returns 1 if all pages in the range are isolated. | 192 | * Returns 1 if all pages in the range are isolated. |
191 | */ | 193 | */ |
192 | static int | 194 | static int |
193 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | 195 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, |
196 | bool skip_hwpoisoned_pages) | ||
194 | { | 197 | { |
195 | struct page *page; | 198 | struct page *page; |
196 | 199 | ||
@@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
220 | else if (page_count(page) == 0 && | 223 | else if (page_count(page) == 0 && |
221 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) | 224 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) |
222 | pfn += 1; | 225 | pfn += 1; |
226 | else if (skip_hwpoisoned_pages && PageHWPoison(page)) { | ||
227 | /* | ||
228 | * The HWPoisoned page may be not in buddy | ||
229 | * system, and page_count() is not 0. | ||
230 | */ | ||
231 | pfn++; | ||
232 | continue; | ||
233 | } | ||
223 | else | 234 | else |
224 | break; | 235 | break; |
225 | } | 236 | } |
@@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
228 | return 1; | 239 | return 1; |
229 | } | 240 | } |
230 | 241 | ||
231 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | 242 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, |
243 | bool skip_hwpoisoned_pages) | ||
232 | { | 244 | { |
233 | unsigned long pfn, flags; | 245 | unsigned long pfn, flags; |
234 | struct page *page; | 246 | struct page *page; |
@@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
251 | /* Check all pages are free or Marked as ISOLATED */ | 263 | /* Check all pages are free or Marked as ISOLATED */ |
252 | zone = page_zone(page); | 264 | zone = page_zone(page); |
253 | spin_lock_irqsave(&zone->lock, flags); | 265 | spin_lock_irqsave(&zone->lock, flags); |
254 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); | 266 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, |
267 | skip_hwpoisoned_pages); | ||
255 | spin_unlock_irqrestore(&zone->lock, flags); | 268 | spin_unlock_irqrestore(&zone->lock, flags); |
256 | return ret ? 0 : -EBUSY; | 269 | return ret ? 0 : -EBUSY; |
257 | } | 270 | } |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 6c118d012bb5..35aa294656cd 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -58,7 +58,7 @@ again: | |||
58 | if (!walk->pte_entry) | 58 | if (!walk->pte_entry) |
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
diff --git a/mm/percpu.c b/mm/percpu.c index ddc5efb9c5bb..8c8e08f3a692 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) | |||
631 | if (!chunk) | 631 | if (!chunk) |
632 | return; | 632 | return; |
633 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); | 633 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); |
634 | kfree(chunk); | 634 | pcpu_mem_free(chunk, pcpu_chunk_struct_size); |
635 | } | 635 | } |
636 | 636 | ||
637 | /* | 637 | /* |
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; | |||
1380 | 1380 | ||
1381 | static int __init percpu_alloc_setup(char *str) | 1381 | static int __init percpu_alloc_setup(char *str) |
1382 | { | 1382 | { |
1383 | if (!str) | ||
1384 | return -EINVAL; | ||
1385 | |||
1383 | if (0) | 1386 | if (0) |
1384 | /* nada */; | 1387 | /* nada */; |
1385 | #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK | 1388 | #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b7..0c8323fe6c8f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -12,8 +12,8 @@ | |||
12 | 12 | ||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | 13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS |
14 | /* | 14 | /* |
15 | * Only sets the access flags (dirty, accessed, and | 15 | * Only sets the access flags (dirty, accessed), as well as write |
16 | * writable). Furthermore, we know it always gets set to a "more | 16 | * permission. Furthermore, we know it always gets set to a "more |
17 | * permissive" setting, which allows most architectures to optimize | 17 | * permissive" setting, which allows most architectures to optimize |
18 | * this. We return whether the PTE actually changed, which in turn | 18 | * this. We return whether the PTE actually changed, which in turn |
19 | * instructs the caller to do things like update__mmu_cache. This | 19 | * instructs the caller to do things like update__mmu_cache. This |
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
27 | int changed = !pte_same(*ptep, entry); | 27 | int changed = !pte_same(*ptep, entry); |
28 | if (changed) { | 28 | if (changed) { |
29 | set_pte_at(vma->vm_mm, address, ptep, entry); | 29 | set_pte_at(vma->vm_mm, address, ptep, entry); |
30 | flush_tlb_page(vma, address); | 30 | flush_tlb_fix_spurious_fault(vma, address); |
31 | } | 31 | } |
32 | return changed; | 32 | return changed; |
33 | } | 33 | } |
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
88 | { | 88 | { |
89 | pte_t pte; | 89 | pte_t pte; |
90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | 90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); |
91 | flush_tlb_page(vma, address); | 91 | if (pte_accessible(pte)) |
92 | flush_tlb_page(vma, address); | ||
92 | return pte; | 93 | return pte; |
93 | } | 94 | } |
94 | #endif | 95 | #endif |
@@ -24,7 +24,7 @@ | |||
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
27 | * anon_vma->mutex | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * swap_lock (in swap_duplicate, swap_info_get) |
@@ -37,7 +37,7 @@ | |||
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within bdi.wb->list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
42 | * pte map lock | 42 | * pte map lock |
43 | */ | 43 | */ |
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Synchronize against page_lock_anon_vma() such that | 90 | * Synchronize against page_lock_anon_vma_read() such that |
91 | * we can safely hold the lock without the anon_vma getting | 91 | * we can safely hold the lock without the anon_vma getting |
92 | * freed. | 92 | * freed. |
93 | * | 93 | * |
94 | * Relies on the full mb implied by the atomic_dec_and_test() from | 94 | * Relies on the full mb implied by the atomic_dec_and_test() from |
95 | * put_anon_vma() against the acquire barrier implied by | 95 | * put_anon_vma() against the acquire barrier implied by |
96 | * mutex_trylock() from page_lock_anon_vma(). This orders: | 96 | * down_read_trylock() from page_lock_anon_vma_read(). This orders: |
97 | * | 97 | * |
98 | * page_lock_anon_vma() VS put_anon_vma() | 98 | * page_lock_anon_vma_read() VS put_anon_vma() |
99 | * mutex_trylock() atomic_dec_and_test() | 99 | * down_read_trylock() atomic_dec_and_test() |
100 | * LOCK MB | 100 | * LOCK MB |
101 | * atomic_read() mutex_is_locked() | 101 | * atomic_read() rwsem_is_locked() |
102 | * | 102 | * |
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | if (mutex_is_locked(&anon_vma->root->mutex)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
146 | * allocate a new one. | 146 | * allocate a new one. |
147 | * | 147 | * |
148 | * Anon-vma allocations are very subtle, because we may have | 148 | * Anon-vma allocations are very subtle, because we may have |
149 | * optimistically looked up an anon_vma in page_lock_anon_vma() | 149 | * optimistically looked up an anon_vma in page_lock_anon_vma_read() |
150 | * and that may actually touch the spinlock even in the newly | 150 | * and that may actually touch the spinlock even in the newly |
151 | * allocated vma (it depends on RCU to make sure that the | 151 | * allocated vma (it depends on RCU to make sure that the |
152 | * anon_vma isn't actually destroyed). | 152 | * anon_vma isn't actually destroyed). |
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
181 | allocated = anon_vma; | 181 | allocated = anon_vma; |
182 | } | 182 | } |
183 | 183 | ||
184 | anon_vma_lock(anon_vma); | 184 | anon_vma_lock_write(anon_vma); |
185 | /* page_table_lock to protect against threads */ | 185 | /* page_table_lock to protect against threads */ |
186 | spin_lock(&mm->page_table_lock); | 186 | spin_lock(&mm->page_table_lock); |
187 | if (likely(!vma->anon_vma)) { | 187 | if (likely(!vma->anon_vma)) { |
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
219 | struct anon_vma *new_root = anon_vma->root; | 219 | struct anon_vma *new_root = anon_vma->root; |
220 | if (new_root != root) { | 220 | if (new_root != root) { |
221 | if (WARN_ON_ONCE(root)) | 221 | if (WARN_ON_ONCE(root)) |
222 | mutex_unlock(&root->mutex); | 222 | up_write(&root->rwsem); |
223 | root = new_root; | 223 | root = new_root; |
224 | mutex_lock(&root->mutex); | 224 | down_write(&root->rwsem); |
225 | } | 225 | } |
226 | return root; | 226 | return root; |
227 | } | 227 | } |
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
229 | static inline void unlock_anon_vma_root(struct anon_vma *root) | 229 | static inline void unlock_anon_vma_root(struct anon_vma *root) |
230 | { | 230 | { |
231 | if (root) | 231 | if (root) |
232 | mutex_unlock(&root->mutex); | 232 | up_write(&root->rwsem); |
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
306 | get_anon_vma(anon_vma->root); | 306 | get_anon_vma(anon_vma->root); |
307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock(anon_vma); |
312 | 312 | ||
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
349 | /* | 349 | /* |
350 | * Iterate the list once more, it now only contains empty and unlinked | 350 | * Iterate the list once more, it now only contains empty and unlinked |
351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | 351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() |
352 | * needing to acquire the anon_vma->root->mutex. | 352 | * needing to write-acquire the anon_vma->root->rwsem. |
353 | */ | 353 | */ |
354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
355 | struct anon_vma *anon_vma = avc->anon_vma; | 355 | struct anon_vma *anon_vma = avc->anon_vma; |
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) | |||
365 | { | 365 | { |
366 | struct anon_vma *anon_vma = data; | 366 | struct anon_vma *anon_vma = data; |
367 | 367 | ||
368 | mutex_init(&anon_vma->mutex); | 368 | init_rwsem(&anon_vma->rwsem); |
369 | atomic_set(&anon_vma->refcount, 0); | 369 | atomic_set(&anon_vma->refcount, 0); |
370 | anon_vma->rb_root = RB_ROOT; | 370 | anon_vma->rb_root = RB_ROOT; |
371 | } | 371 | } |
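The anon_vma lock is converted from a mutex to an rwsem throughout this file: rmap walkers such as page_lock_anon_vma_read() now take it shared, while modifiers use anon_vma_lock_write() for exclusion. A user-space analogy using POSIX rwlocks, not the kernel API, only to illustrate the reader concurrency the conversion allows (build with -pthread):

#include <pthread.h>
#include <stdio.h>

/* Stand-in for anon_vma->rwsem: many concurrent readers, one writer. */
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int chain_length;                /* stand-in for the interval tree */

static void *reader(void *arg)
{
        pthread_rwlock_rdlock(&lock);   /* like anon_vma_lock_read() */
        printf("walker %ld sees %d vmas\n", (long)arg, chain_length);
        pthread_rwlock_unlock(&lock);   /* like anon_vma_unlock_read() */
        return NULL;
}

int main(void)
{
        pthread_t t[2];
        long i;

        pthread_rwlock_wrlock(&lock);   /* like anon_vma_lock_write() */
        chain_length = 3;               /* "link" some vmas */
        pthread_rwlock_unlock(&lock);

        for (i = 0; i < 2; i++)         /* both readers may hold the lock at once */
                pthread_create(&t[i], NULL, reader, (void *)i);
        for (i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}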
@@ -442,7 +442,7 @@ out: | |||
442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | 442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
443 | * reference like with page_get_anon_vma() and then block on the mutex. | 443 | * reference like with page_get_anon_vma() and then block on the mutex. |
444 | */ | 444 | */ |
445 | struct anon_vma *page_lock_anon_vma(struct page *page) | 445 | struct anon_vma *page_lock_anon_vma_read(struct page *page) |
446 | { | 446 | { |
447 | struct anon_vma *anon_vma = NULL; | 447 | struct anon_vma *anon_vma = NULL; |
448 | struct anon_vma *root_anon_vma; | 448 | struct anon_vma *root_anon_vma; |
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
457 | 457 | ||
458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); |
460 | if (mutex_trylock(&root_anon_vma->mutex)) { | 460 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
461 | /* | 461 | /* |
462 | * If the page is still mapped, then this anon_vma is still | 462 | * If the page is still mapped, then this anon_vma is still |
463 | * its anon_vma, and holding the mutex ensures that it will | 463 | * its anon_vma, and holding the mutex ensures that it will |
464 | * not go away, see anon_vma_free(). | 464 | * not go away, see anon_vma_free(). |
465 | */ | 465 | */ |
466 | if (!page_mapped(page)) { | 466 | if (!page_mapped(page)) { |
467 | mutex_unlock(&root_anon_vma->mutex); | 467 | up_read(&root_anon_vma->rwsem); |
468 | anon_vma = NULL; | 468 | anon_vma = NULL; |
469 | } | 469 | } |
470 | goto out; | 470 | goto out; |
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
484 | 484 | ||
485 | /* we pinned the anon_vma, it's safe to sleep */ | 485 | /* we pinned the anon_vma, it's safe to sleep */ |
486 | rcu_read_unlock(); | 486 | rcu_read_unlock(); |
487 | anon_vma_lock(anon_vma); | 487 | anon_vma_lock_read(anon_vma); |
488 | 488 | ||
489 | if (atomic_dec_and_test(&anon_vma->refcount)) { | 489 | if (atomic_dec_and_test(&anon_vma->refcount)) { |
490 | /* | 490 | /* |
491 | * Oops, we held the last refcount, release the lock | 491 | * Oops, we held the last refcount, release the lock |
492 | * and bail -- can't simply use put_anon_vma() because | 492 | * and bail -- can't simply use put_anon_vma() because |
493 | * we'll deadlock on the anon_vma_lock() recursion. | 493 | * we'll deadlock on the anon_vma_lock_write() recursion. |
494 | */ | 494 | */ |
495 | anon_vma_unlock(anon_vma); | 495 | anon_vma_unlock_read(anon_vma); |
496 | __put_anon_vma(anon_vma); | 496 | __put_anon_vma(anon_vma); |
497 | anon_vma = NULL; | 497 | anon_vma = NULL; |
498 | } | 498 | } |
@@ -504,9 +504,9 @@ out: | |||
504 | return anon_vma; | 504 | return anon_vma; |
505 | } | 505 | } |
506 | 506 | ||
507 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 507 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma) |
508 | { | 508 | { |
509 | anon_vma_unlock(anon_vma); | 509 | anon_vma_unlock_read(anon_vma); |
510 | } | 510 | } |
511 | 511 | ||
512 | /* | 512 | /* |
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
562 | return address; | 562 | return address; |
563 | } | 563 | } |
564 | 564 | ||
565 | pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) | ||
566 | { | ||
567 | pgd_t *pgd; | ||
568 | pud_t *pud; | ||
569 | pmd_t *pmd = NULL; | ||
570 | |||
571 | pgd = pgd_offset(mm, address); | ||
572 | if (!pgd_present(*pgd)) | ||
573 | goto out; | ||
574 | |||
575 | pud = pud_offset(pgd, address); | ||
576 | if (!pud_present(*pud)) | ||
577 | goto out; | ||
578 | |||
579 | pmd = pmd_offset(pud, address); | ||
580 | if (!pmd_present(*pmd)) | ||
581 | pmd = NULL; | ||
582 | out: | ||
583 | return pmd; | ||
584 | } | ||
585 | |||
565 | /* | 586 | /* |
566 | * Check that @page is mapped at @address into @mm. | 587 | * Check that @page is mapped at @address into @mm. |
567 | * | 588 | * |
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
574 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | 595 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, |
575 | unsigned long address, spinlock_t **ptlp, int sync) | 596 | unsigned long address, spinlock_t **ptlp, int sync) |
576 | { | 597 | { |
577 | pgd_t *pgd; | ||
578 | pud_t *pud; | ||
579 | pmd_t *pmd; | 598 | pmd_t *pmd; |
580 | pte_t *pte; | 599 | pte_t *pte; |
581 | spinlock_t *ptl; | 600 | spinlock_t *ptl; |
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, | |||
586 | goto check; | 605 | goto check; |
587 | } | 606 | } |
588 | 607 | ||
589 | pgd = pgd_offset(mm, address); | 608 | pmd = mm_find_pmd(mm, address); |
590 | if (!pgd_present(*pgd)) | 609 | if (!pmd) |
591 | return NULL; | ||
592 | |||
593 | pud = pud_offset(pgd, address); | ||
594 | if (!pud_present(*pud)) | ||
595 | return NULL; | 610 | return NULL; |
596 | 611 | ||
597 | pmd = pmd_offset(pud, address); | ||
598 | if (!pmd_present(*pmd)) | ||
599 | return NULL; | ||
600 | if (pmd_trans_huge(*pmd)) | 612 | if (pmd_trans_huge(*pmd)) |
601 | return NULL; | 613 | return NULL; |
602 | 614 | ||
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page, | |||
732 | struct anon_vma_chain *avc; | 744 | struct anon_vma_chain *avc; |
733 | int referenced = 0; | 745 | int referenced = 0; |
734 | 746 | ||
735 | anon_vma = page_lock_anon_vma(page); | 747 | anon_vma = page_lock_anon_vma_read(page); |
736 | if (!anon_vma) | 748 | if (!anon_vma) |
737 | return referenced; | 749 | return referenced; |
738 | 750 | ||
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page, | |||
754 | break; | 766 | break; |
755 | } | 767 | } |
756 | 768 | ||
757 | page_unlock_anon_vma(anon_vma); | 769 | page_unlock_anon_vma_read(anon_vma); |
758 | return referenced; | 770 | return referenced; |
759 | } | 771 | } |
760 | 772 | ||
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page) | |||
1139 | * containing the swap entry, but page not yet written to swap. | 1151 | * containing the swap entry, but page not yet written to swap. |
1140 | * | 1152 | * |
1141 | * And we can skip it on file pages, so long as the filesystem | 1153 | * And we can skip it on file pages, so long as the filesystem |
1142 | * participates in dirty tracking; but need to catch shm and tmpfs | 1154 | * participates in dirty tracking (note that this is not only an |
1143 | * and ramfs pages which have been modified since creation by read | 1155 | * optimization but also solves problems caused by dirty flag in |
1144 | * fault. | 1156 | * storage key getting set by a write from inside kernel); but need to |
1157 | * catch shm and tmpfs and ramfs pages which have been modified since | ||
1158 | * creation by read fault. | ||
1145 | * | 1159 | * |
1146 | * Note that mapping must be decided above, before decrementing | 1160 | * Note that mapping must be decided above, before decrementing |
1147 | * mapcount (which luckily provides a barrier): once page is unmapped, | 1161 | * mapcount (which luckily provides a barrier): once page is unmapped, |
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1235 | update_hiwater_rss(mm); | 1249 | update_hiwater_rss(mm); |
1236 | 1250 | ||
1237 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 1251 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
1238 | if (PageAnon(page)) | 1252 | if (!PageHuge(page)) { |
1239 | dec_mm_counter(mm, MM_ANONPAGES); | 1253 | if (PageAnon(page)) |
1240 | else | 1254 | dec_mm_counter(mm, MM_ANONPAGES); |
1241 | dec_mm_counter(mm, MM_FILEPAGES); | 1255 | else |
1256 | dec_mm_counter(mm, MM_FILEPAGES); | ||
1257 | } | ||
1242 | set_pte_at(mm, address, pte, | 1258 | set_pte_at(mm, address, pte, |
1243 | swp_entry_to_pte(make_hwpoison_entry(page))); | 1259 | swp_entry_to_pte(make_hwpoison_entry(page))); |
1244 | } else if (PageAnon(page)) { | 1260 | } else if (PageAnon(page)) { |
1245 | swp_entry_t entry = { .val = page_private(page) }; | 1261 | swp_entry_t entry = { .val = page_private(page) }; |
1246 | 1262 | ||
@@ -1299,7 +1315,7 @@ out_mlock: | |||
1299 | /* | 1315 | /* |
1300 | * We need mmap_sem locking; otherwise the VM_LOCKED check gives an | 1316 | * We need mmap_sem locking; otherwise the VM_LOCKED check gives an |
1301 | * unstable result and races. Plus, we can't wait here because | 1317 | * unstable result and races. Plus, we can't wait here because |
1302 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. | 1318 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. |
1303 | * If the trylock fails, the page remains on the evictable lru and | 1319 | * If the trylock fails, the page remains on the evictable lru and |
1304 | * vmscan could later retry moving the page to the unevictable lru | 1320 | * vmscan could later retry moving the page to the unevictable lru |
1305 | * if the page is actually mlocked. | 1321 | * if the page is actually mlocked. |
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1345 | struct vm_area_struct *vma, struct page *check_page) | 1361 | struct vm_area_struct *vma, struct page *check_page) |
1346 | { | 1362 | { |
1347 | struct mm_struct *mm = vma->vm_mm; | 1363 | struct mm_struct *mm = vma->vm_mm; |
1348 | pgd_t *pgd; | ||
1349 | pud_t *pud; | ||
1350 | pmd_t *pmd; | 1364 | pmd_t *pmd; |
1351 | pte_t *pte; | 1365 | pte_t *pte; |
1352 | pte_t pteval; | 1366 | pte_t pteval; |
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1366 | if (end > vma->vm_end) | 1380 | if (end > vma->vm_end) |
1367 | end = vma->vm_end; | 1381 | end = vma->vm_end; |
1368 | 1382 | ||
1369 | pgd = pgd_offset(mm, address); | 1383 | pmd = mm_find_pmd(mm, address); |
1370 | if (!pgd_present(*pgd)) | 1384 | if (!pmd) |
1371 | return ret; | ||
1372 | |||
1373 | pud = pud_offset(pgd, address); | ||
1374 | if (!pud_present(*pud)) | ||
1375 | return ret; | ||
1376 | |||
1377 | pmd = pmd_offset(pud, address); | ||
1378 | if (!pmd_present(*pmd)) | ||
1379 | return ret; | 1385 | return ret; |
1380 | 1386 | ||
1381 | mmun_start = address; | 1387 | mmun_start = address; |
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1474 | struct anon_vma_chain *avc; | 1480 | struct anon_vma_chain *avc; |
1475 | int ret = SWAP_AGAIN; | 1481 | int ret = SWAP_AGAIN; |
1476 | 1482 | ||
1477 | anon_vma = page_lock_anon_vma(page); | 1483 | anon_vma = page_lock_anon_vma_read(page); |
1478 | if (!anon_vma) | 1484 | if (!anon_vma) |
1479 | return ret; | 1485 | return ret; |
1480 | 1486 | ||
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1501 | break; | 1507 | break; |
1502 | } | 1508 | } |
1503 | 1509 | ||
1504 | page_unlock_anon_vma(anon_vma); | 1510 | page_unlock_anon_vma_read(anon_vma); |
1505 | return ret; | 1511 | return ret; |
1506 | } | 1512 | } |
1507 | 1513 | ||
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1696 | int ret = SWAP_AGAIN; | 1702 | int ret = SWAP_AGAIN; |
1697 | 1703 | ||
1698 | /* | 1704 | /* |
1699 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | 1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
1700 | * because that depends on page_mapped(); but not all its usages | 1706 | * because that depends on page_mapped(); but not all its usages |
1701 | * are holding mmap_sem. Users without mmap_sem are required to | 1707 | * are holding mmap_sem. Users without mmap_sem are required to |
1702 | * take a reference count to prevent the anon_vma disappearing | 1708 | * take a reference count to prevent the anon_vma disappearing |
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1704 | anon_vma = page_anon_vma(page); | 1710 | anon_vma = page_anon_vma(page); |
1705 | if (!anon_vma) | 1711 | if (!anon_vma) |
1706 | return ret; | 1712 | return ret; |
1707 | anon_vma_lock(anon_vma); | 1713 | anon_vma_lock_read(anon_vma); |
1708 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1709 | struct vm_area_struct *vma = avc->vma; | 1715 | struct vm_area_struct *vma = avc->vma; |
1710 | unsigned long address = vma_address(page, vma); | 1716 | unsigned long address = vma_address(page, vma); |
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1712 | if (ret != SWAP_AGAIN) | 1718 | if (ret != SWAP_AGAIN) |
1713 | break; | 1719 | break; |
1714 | } | 1720 | } |
1715 | anon_vma_unlock(anon_vma); | 1721 | anon_vma_unlock_read(anon_vma); |
1716 | return ret; | 1722 | return ret; |
1717 | } | 1723 | } |
1718 | 1724 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 50c5b8f3a359..5c90d84c2b02 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1715,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1715 | return error; | 1715 | return error; |
1716 | } | 1716 | } |
1717 | 1717 | ||
1718 | /* | ||
1719 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
1720 | */ | ||
1721 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
1722 | pgoff_t index, pgoff_t end, int whence) | ||
1723 | { | ||
1724 | struct page *page; | ||
1725 | struct pagevec pvec; | ||
1726 | pgoff_t indices[PAGEVEC_SIZE]; | ||
1727 | bool done = false; | ||
1728 | int i; | ||
1729 | |||
1730 | pagevec_init(&pvec, 0); | ||
1731 | pvec.nr = 1; /* start small: we may be there already */ | ||
1732 | while (!done) { | ||
1733 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
1734 | pvec.nr, pvec.pages, indices); | ||
1735 | if (!pvec.nr) { | ||
1736 | if (whence == SEEK_DATA) | ||
1737 | index = end; | ||
1738 | break; | ||
1739 | } | ||
1740 | for (i = 0; i < pvec.nr; i++, index++) { | ||
1741 | if (index < indices[i]) { | ||
1742 | if (whence == SEEK_HOLE) { | ||
1743 | done = true; | ||
1744 | break; | ||
1745 | } | ||
1746 | index = indices[i]; | ||
1747 | } | ||
1748 | page = pvec.pages[i]; | ||
1749 | if (page && !radix_tree_exceptional_entry(page)) { | ||
1750 | if (!PageUptodate(page)) | ||
1751 | page = NULL; | ||
1752 | } | ||
1753 | if (index >= end || | ||
1754 | (page && whence == SEEK_DATA) || | ||
1755 | (!page && whence == SEEK_HOLE)) { | ||
1756 | done = true; | ||
1757 | break; | ||
1758 | } | ||
1759 | } | ||
1760 | shmem_deswap_pagevec(&pvec); | ||
1761 | pagevec_release(&pvec); | ||
1762 | pvec.nr = PAGEVEC_SIZE; | ||
1763 | cond_resched(); | ||
1764 | } | ||
1765 | return index; | ||
1766 | } | ||
1767 | |||
1768 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) | ||
1769 | { | ||
1770 | struct address_space *mapping = file->f_mapping; | ||
1771 | struct inode *inode = mapping->host; | ||
1772 | pgoff_t start, end; | ||
1773 | loff_t new_offset; | ||
1774 | |||
1775 | if (whence != SEEK_DATA && whence != SEEK_HOLE) | ||
1776 | return generic_file_llseek_size(file, offset, whence, | ||
1777 | MAX_LFS_FILESIZE, i_size_read(inode)); | ||
1778 | mutex_lock(&inode->i_mutex); | ||
1779 | /* We're holding i_mutex so we can access i_size directly */ | ||
1780 | |||
1781 | if (offset < 0) | ||
1782 | offset = -EINVAL; | ||
1783 | else if (offset >= inode->i_size) | ||
1784 | offset = -ENXIO; | ||
1785 | else { | ||
1786 | start = offset >> PAGE_CACHE_SHIFT; | ||
1787 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1788 | new_offset = shmem_seek_hole_data(mapping, start, end, whence); | ||
1789 | new_offset <<= PAGE_CACHE_SHIFT; | ||
1790 | if (new_offset > offset) { | ||
1791 | if (new_offset < inode->i_size) | ||
1792 | offset = new_offset; | ||
1793 | else if (whence == SEEK_DATA) | ||
1794 | offset = -ENXIO; | ||
1795 | else | ||
1796 | offset = inode->i_size; | ||
1797 | } | ||
1798 | } | ||
1799 | |||
1800 | if (offset >= 0 && offset != file->f_pos) { | ||
1801 | file->f_pos = offset; | ||
1802 | file->f_version = 0; | ||
1803 | } | ||
1804 | mutex_unlock(&inode->i_mutex); | ||
1805 | return offset; | ||
1806 | } | ||
1807 | |||
1718 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1808 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
1719 | loff_t len) | 1809 | loff_t len) |
1720 | { | 1810 | { |
@@ -2586,7 +2676,7 @@ static const struct address_space_operations shmem_aops = { | |||
2586 | static const struct file_operations shmem_file_operations = { | 2676 | static const struct file_operations shmem_file_operations = { |
2587 | .mmap = shmem_mmap, | 2677 | .mmap = shmem_mmap, |
2588 | #ifdef CONFIG_TMPFS | 2678 | #ifdef CONFIG_TMPFS |
2589 | .llseek = generic_file_llseek, | 2679 | .llseek = shmem_file_llseek, |
2590 | .read = do_sync_read, | 2680 | .read = do_sync_read, |
2591 | .write = do_sync_write, | 2681 | .write = do_sync_write, |
2592 | .aio_read = shmem_file_aio_read, | 2682 | .aio_read = shmem_file_aio_read, |
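The two shmem hunks above teach tmpfs real SEEK_DATA/SEEK_HOLE semantics: shmem_seek_hole_data() scans the radix tree in pagevec-sized batches for present, uptodate pages (or swap entries), and shmem_file_llseek() replaces generic_file_llseek, which had treated the whole file as data. A minimal userspace sketch of the resulting behaviour, assuming /tmp is a tmpfs mount and 4 KiB pages (hole/data granularity is the page size):

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/sparse-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

        if (fd < 0)
                return 1;
        /* one data page at 1 MiB, holes on either side */
        pwrite(fd, "x", 1, 1024 * 1024);
        ftruncate(fd, 4 * 1024 * 1024);

        off_t data = lseek(fd, 0, SEEK_DATA);    /* 1 MiB */
        off_t hole = lseek(fd, data, SEEK_HOLE); /* 1 MiB + 4 KiB */

        printf("data at %lld, next hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}

Tools that probe sparseness this way (cp --sparse, tar and friends) can now skip untouched ranges of a tmpfs file instead of reading back zeroes.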
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -87,7 +87,6 @@ | |||
87 | */ | 87 | */ |
88 | 88 | ||
89 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
90 | #include "slab.h" | ||
91 | #include <linux/mm.h> | 90 | #include <linux/mm.h> |
92 | #include <linux/poison.h> | 91 | #include <linux/poison.h> |
93 | #include <linux/swap.h> | 92 | #include <linux/swap.h> |
@@ -128,6 +127,8 @@ | |||
128 | 127 | ||
129 | #include "internal.h" | 128 | #include "internal.h" |
130 | 129 | ||
130 | #include "slab.h" | ||
131 | |||
131 | /* | 132 | /* |
132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 133 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
133 | * 0 for faster, smaller code (especially in the critical paths). | 134 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -162,23 +163,6 @@ | |||
162 | */ | 163 | */ |
163 | static bool pfmemalloc_active __read_mostly; | 164 | static bool pfmemalloc_active __read_mostly; |
164 | 165 | ||
165 | /* Legal flag mask for kmem_cache_create(). */ | ||
166 | #if DEBUG | ||
167 | # define CREATE_MASK (SLAB_RED_ZONE | \ | ||
168 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ | ||
169 | SLAB_CACHE_DMA | \ | ||
170 | SLAB_STORE_USER | \ | ||
171 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | ||
172 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | ||
173 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) | ||
174 | #else | ||
175 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | ||
176 | SLAB_CACHE_DMA | \ | ||
177 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | ||
178 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | ||
179 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) | ||
180 | #endif | ||
181 | |||
182 | /* | 166 | /* |
183 | * kmem_bufctl_t: | 167 | * kmem_bufctl_t: |
184 | * | 168 | * |
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = { | |||
564 | #undef CACHE | 548 | #undef CACHE |
565 | }; | 549 | }; |
566 | 550 | ||
567 | static struct arraycache_init initarray_cache __initdata = | ||
568 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | ||
569 | static struct arraycache_init initarray_generic = | 551 | static struct arraycache_init initarray_generic = |
570 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 552 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
571 | 553 | ||
572 | /* internal cache of cache description objs */ | 554 | /* internal cache of cache description objs */ |
573 | static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; | ||
574 | static struct kmem_cache kmem_cache_boot = { | 555 | static struct kmem_cache kmem_cache_boot = { |
575 | .nodelists = kmem_cache_nodelists, | ||
576 | .batchcount = 1, | 556 | .batchcount = 1, |
577 | .limit = BOOT_CPUCACHE_ENTRIES, | 557 | .limit = BOOT_CPUCACHE_ENTRIES, |
578 | .shared = 1, | 558 | .shared = 1, |
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q) | |||
662 | } | 642 | } |
663 | } | 643 | } |
664 | 644 | ||
645 | static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) | ||
646 | { | ||
647 | struct kmem_list3 *l3; | ||
648 | l3 = cachep->nodelists[q]; | ||
649 | if (!l3) | ||
650 | return; | ||
651 | |||
652 | slab_set_lock_classes(cachep, &on_slab_l3_key, | ||
653 | &on_slab_alc_key, q); | ||
654 | } | ||
655 | |||
656 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
657 | { | ||
658 | int node; | ||
659 | |||
660 | VM_BUG_ON(OFF_SLAB(cachep)); | ||
661 | for_each_node(node) | ||
662 | on_slab_lock_classes_node(cachep, node); | ||
663 | } | ||
664 | |||
665 | static inline void init_lock_keys(void) | 665 | static inline void init_lock_keys(void) |
666 | { | 666 | { |
667 | int node; | 667 | int node; |
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void) | |||
678 | { | 678 | { |
679 | } | 679 | } |
680 | 680 | ||
681 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
682 | { | ||
683 | } | ||
684 | |||
685 | static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) | ||
686 | { | ||
687 | } | ||
688 | |||
681 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | 689 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) |
682 | { | 690 | { |
683 | } | 691 | } |
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1406 | free_alien_cache(alien); | 1414 | free_alien_cache(alien); |
1407 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | 1415 | if (cachep->flags & SLAB_DEBUG_OBJECTS) |
1408 | slab_set_debugobj_lock_classes_node(cachep, node); | 1416 | slab_set_debugobj_lock_classes_node(cachep, node); |
1417 | else if (!OFF_SLAB(cachep) && | ||
1418 | !(cachep->flags & SLAB_DESTROY_BY_RCU)) | ||
1419 | on_slab_lock_classes_node(cachep, node); | ||
1409 | } | 1420 | } |
1410 | init_node_lock_keys(node); | 1421 | init_node_lock_keys(node); |
1411 | 1422 | ||
@@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) | |||
1577 | } | 1588 | } |
1578 | 1589 | ||
1579 | /* | 1590 | /* |
1591 | * The memory after the last cpu cache pointer is used for the | ||
1592 | * nodelists pointer. | ||
1593 | */ | ||
1594 | static void setup_nodelists_pointer(struct kmem_cache *cachep) | ||
1595 | { | ||
1596 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | ||
1597 | } | ||
1598 | |||
1599 | /* | ||
1580 | * Initialisation. Called after the page allocator have been initialised and | 1600 | * Initialisation. Called after the page allocator have been initialised and |
1581 | * before smp_init(). | 1601 | * before smp_init(). |
1582 | */ | 1602 | */ |
1583 | void __init kmem_cache_init(void) | 1603 | void __init kmem_cache_init(void) |
1584 | { | 1604 | { |
1585 | size_t left_over; | ||
1586 | struct cache_sizes *sizes; | 1605 | struct cache_sizes *sizes; |
1587 | struct cache_names *names; | 1606 | struct cache_names *names; |
1588 | int i; | 1607 | int i; |
1589 | int order; | ||
1590 | int node; | ||
1591 | 1608 | ||
1592 | kmem_cache = &kmem_cache_boot; | 1609 | kmem_cache = &kmem_cache_boot; |
1610 | setup_nodelists_pointer(kmem_cache); | ||
1593 | 1611 | ||
1594 | if (num_possible_nodes() == 1) | 1612 | if (num_possible_nodes() == 1) |
1595 | use_alien_caches = 0; | 1613 | use_alien_caches = 0; |
1596 | 1614 | ||
1597 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1615 | for (i = 0; i < NUM_INIT_LISTS; i++) |
1598 | kmem_list3_init(&initkmem_list3[i]); | 1616 | kmem_list3_init(&initkmem_list3[i]); |
1599 | if (i < MAX_NUMNODES) | 1617 | |
1600 | kmem_cache->nodelists[i] = NULL; | ||
1601 | } | ||
1602 | set_up_list3s(kmem_cache, CACHE_CACHE); | 1618 | set_up_list3s(kmem_cache, CACHE_CACHE); |
1603 | 1619 | ||
1604 | /* | 1620 | /* |
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void) | |||
1629 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. | 1645 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. |
1630 | */ | 1646 | */ |
1631 | 1647 | ||
1632 | node = numa_mem_id(); | ||
1633 | |||
1634 | /* 1) create the kmem_cache */ | 1648 | /* 1) create the kmem_cache */ |
1635 | INIT_LIST_HEAD(&slab_caches); | ||
1636 | list_add(&kmem_cache->list, &slab_caches); | ||
1637 | kmem_cache->colour_off = cache_line_size(); | ||
1638 | kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; | ||
1639 | kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | ||
1640 | 1649 | ||
1641 | /* | 1650 | /* |
1642 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1651 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1643 | */ | 1652 | */ |
1644 | kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1653 | create_boot_cache(kmem_cache, "kmem_cache", |
1645 | nr_node_ids * sizeof(struct kmem_list3 *); | 1654 | offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1646 | kmem_cache->object_size = kmem_cache->size; | 1655 | nr_node_ids * sizeof(struct kmem_list3 *), |
1647 | kmem_cache->size = ALIGN(kmem_cache->object_size, | 1656 | SLAB_HWCACHE_ALIGN); |
1648 | cache_line_size()); | 1657 | list_add(&kmem_cache->list, &slab_caches); |
1649 | kmem_cache->reciprocal_buffer_size = | ||
1650 | reciprocal_value(kmem_cache->size); | ||
1651 | |||
1652 | for (order = 0; order < MAX_ORDER; order++) { | ||
1653 | cache_estimate(order, kmem_cache->size, | ||
1654 | cache_line_size(), 0, &left_over, &kmem_cache->num); | ||
1655 | if (kmem_cache->num) | ||
1656 | break; | ||
1657 | } | ||
1658 | BUG_ON(!kmem_cache->num); | ||
1659 | kmem_cache->gfporder = order; | ||
1660 | kmem_cache->colour = left_over / kmem_cache->colour_off; | ||
1661 | kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + | ||
1662 | sizeof(struct slab), cache_line_size()); | ||
1663 | 1658 | ||
1664 | /* 2+3) create the kmalloc caches */ | 1659 | /* 2+3) create the kmalloc caches */ |
1665 | sizes = malloc_sizes; | 1660 | sizes = malloc_sizes; |
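The bootstrap above replaces the hand-rolled sizing of the static kmem_cache with create_boot_cache(), passing offsetof(struct kmem_cache, array[nr_cpu_ids]) + nr_node_ids * sizeof(struct kmem_list3 *) as the size: the per-node list pointers live in the same allocation, directly behind the per-CPU array, and setup_nodelists_pointer() (added in the previous hunk) simply aims cachep->nodelists at that tail. A standalone userspace sketch of the same layout trick, with a made-up struct and the portable spelling of the offsetof expression:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct fake_cache {
        int batchcount;
        void *array[];          /* nr_cpu_ids per-CPU slots come first */
};

int main(void)
{
        int nr_cpu_ids = 4, nr_node_ids = 2;
        /* equivalent to offsetof(struct fake_cache, array[nr_cpu_ids]) + ... */
        size_t size = offsetof(struct fake_cache, array) +
                      nr_cpu_ids * sizeof(void *) +
                      nr_node_ids * sizeof(void *);
        struct fake_cache *c = calloc(1, size);
        void **nodelists;

        if (!c)
                return 1;
        /* same idea as setup_nodelists_pointer(): tail of the allocation */
        nodelists = (void **)&c->array[nr_cpu_ids];
        printf("one %zu-byte allocation, node pointers at offset %zu\n",
               size, (size_t)((char *)nodelists - (char *)c));
        free(c);
        return 0;
}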
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void) | |||
1671 | * bug. | 1666 | * bug. |
1672 | */ | 1667 | */ |
1673 | 1668 | ||
1674 | sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 1669 | sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, |
1675 | sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; | 1670 | sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); |
1676 | sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; | 1671 | |
1677 | sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; | 1672 | if (INDEX_AC != INDEX_L3) |
1678 | sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; | 1673 | sizes[INDEX_L3].cs_cachep = |
1679 | __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | 1674 | create_kmalloc_cache(names[INDEX_L3].name, |
1680 | list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); | 1675 | sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); |
1681 | |||
1682 | if (INDEX_AC != INDEX_L3) { | ||
1683 | sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
1684 | sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; | ||
1685 | sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; | ||
1686 | sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; | ||
1687 | sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; | ||
1688 | __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1689 | list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); | ||
1690 | } | ||
1691 | 1676 | ||
1692 | slab_early_init = 0; | 1677 | slab_early_init = 0; |
1693 | 1678 | ||
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void) | |||
1699 | * Note for systems short on memory removing the alignment will | 1684 | * Note for systems short on memory removing the alignment will |
1700 | * allow tighter packing of the smaller caches. | 1685 | * allow tighter packing of the smaller caches. |
1701 | */ | 1686 | */ |
1702 | if (!sizes->cs_cachep) { | 1687 | if (!sizes->cs_cachep) |
1703 | sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 1688 | sizes->cs_cachep = create_kmalloc_cache(names->name, |
1704 | sizes->cs_cachep->name = names->name; | 1689 | sizes->cs_size, ARCH_KMALLOC_FLAGS); |
1705 | sizes->cs_cachep->size = sizes->cs_size; | 1690 | |
1706 | sizes->cs_cachep->object_size = sizes->cs_size; | ||
1707 | sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; | ||
1708 | __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1709 | list_add(&sizes->cs_cachep->list, &slab_caches); | ||
1710 | } | ||
1711 | #ifdef CONFIG_ZONE_DMA | 1691 | #ifdef CONFIG_ZONE_DMA |
1712 | sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 1692 | sizes->cs_dmacachep = create_kmalloc_cache( |
1713 | sizes->cs_dmacachep->name = names->name_dma; | 1693 | names->name_dma, sizes->cs_size, |
1714 | sizes->cs_dmacachep->size = sizes->cs_size; | 1694 | SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); |
1715 | sizes->cs_dmacachep->object_size = sizes->cs_size; | ||
1716 | sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; | ||
1717 | __kmem_cache_create(sizes->cs_dmacachep, | ||
1718 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); | ||
1719 | list_add(&sizes->cs_dmacachep->list, &slab_caches); | ||
1720 | #endif | 1695 | #endif |
1721 | sizes++; | 1696 | sizes++; |
1722 | names++; | 1697 | names++; |
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void) | |||
1727 | 1702 | ||
1728 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1703 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1729 | 1704 | ||
1730 | BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); | ||
1731 | memcpy(ptr, cpu_cache_get(kmem_cache), | 1705 | memcpy(ptr, cpu_cache_get(kmem_cache), |
1732 | sizeof(struct arraycache_init)); | 1706 | sizeof(struct arraycache_init)); |
1733 | /* | 1707 | /* |
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1921 | if (page->pfmemalloc) | 1895 | if (page->pfmemalloc) |
1922 | SetPageSlabPfmemalloc(page + i); | 1896 | SetPageSlabPfmemalloc(page + i); |
1923 | } | 1897 | } |
1898 | memcg_bind_pages(cachep, cachep->gfporder); | ||
1924 | 1899 | ||
1925 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1900 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1926 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1901 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1957 | __ClearPageSlab(page); | 1932 | __ClearPageSlab(page); |
1958 | page++; | 1933 | page++; |
1959 | } | 1934 | } |
1935 | |||
1936 | memcg_release_pages(cachep, cachep->gfporder); | ||
1960 | if (current->reclaim_state) | 1937 | if (current->reclaim_state) |
1961 | current->reclaim_state->reclaimed_slab += nr_freed; | 1938 | current->reclaim_state->reclaimed_slab += nr_freed; |
1962 | free_pages((unsigned long)addr, cachep->gfporder); | 1939 | free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); |
1963 | } | 1940 | } |
1964 | 1941 | ||
1965 | static void kmem_rcu_free(struct rcu_head *head) | 1942 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2282 | 2259 | ||
2283 | if (slab_state == DOWN) { | 2260 | if (slab_state == DOWN) { |
2284 | /* | 2261 | /* |
2285 | * Note: the first kmem_cache_create must create the cache | 2262 | * Note: Creation of first cache (kmem_cache). |
2263 | * The setup_list3s is taken care | ||
2264 | * of by the caller of __kmem_cache_create | ||
2265 | */ | ||
2266 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
2267 | slab_state = PARTIAL; | ||
2268 | } else if (slab_state == PARTIAL) { | ||
2269 | /* | ||
2270 | * Note: the second kmem_cache_create must create the cache | ||
2286 | * that's used by kmalloc(24), otherwise the creation of | 2271 | * that's used by kmalloc(24), otherwise the creation of |
2287 | * further caches will BUG(). | 2272 | * further caches will BUG(). |
2288 | */ | 2273 | */ |
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2290 | 2275 | ||
2291 | /* | 2276 | /* |
2292 | * If the cache that's used by kmalloc(sizeof(kmem_list3)) is | 2277 | * If the cache that's used by kmalloc(sizeof(kmem_list3)) is |
2293 | * the first cache, then we need to set up all its list3s, | 2278 | * the second cache, then we need to set up all its list3s, |
2294 | * otherwise the creation of further caches will BUG(). | 2279 | * otherwise the creation of further caches will BUG(). |
2295 | */ | 2280 | */ |
2296 | set_up_list3s(cachep, SIZE_AC); | 2281 | set_up_list3s(cachep, SIZE_AC); |
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2299 | else | 2284 | else |
2300 | slab_state = PARTIAL_ARRAYCACHE; | 2285 | slab_state = PARTIAL_ARRAYCACHE; |
2301 | } else { | 2286 | } else { |
2287 | /* Remaining boot caches */ | ||
2302 | cachep->array[smp_processor_id()] = | 2288 | cachep->array[smp_processor_id()] = |
2303 | kmalloc(sizeof(struct arraycache_init), gfp); | 2289 | kmalloc(sizeof(struct arraycache_init), gfp); |
2304 | 2290 | ||
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2331 | 2317 | ||
2332 | /** | 2318 | /** |
2333 | * __kmem_cache_create - Create a cache. | 2319 | * __kmem_cache_create - Create a cache. |
2334 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 2320 | * @cachep: cache management descriptor |
2335 | * @size: The size of objects to be created in this cache. | ||
2336 | * @align: The required alignment for the objects. | ||
2337 | * @flags: SLAB flags | 2321 | * @flags: SLAB flags |
2338 | * @ctor: A constructor for the objects. | ||
2339 | * | 2322 | * |
2340 | * Returns a ptr to the cache on success, NULL on failure. | 2323 | * Returns a ptr to the cache on success, NULL on failure. |
2341 | * Cannot be called within an interrupt, but can be interrupted. | 2324 | * Cannot be called within an interrupt, but can be interrupted. |
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2378 | if (flags & SLAB_DESTROY_BY_RCU) | 2361 | if (flags & SLAB_DESTROY_BY_RCU) |
2379 | BUG_ON(flags & SLAB_POISON); | 2362 | BUG_ON(flags & SLAB_POISON); |
2380 | #endif | 2363 | #endif |
2381 | /* | ||
2382 | * Always checks flags, a caller might be expecting debug support which | ||
2383 | * isn't available. | ||
2384 | */ | ||
2385 | BUG_ON(flags & ~CREATE_MASK); | ||
2386 | 2364 | ||
2387 | /* | 2365 | /* |
2388 | * Check that size is in terms of words. This is needed to avoid | 2366 | * Check that size is in terms of words. This is needed to avoid |
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2394 | size &= ~(BYTES_PER_WORD - 1); | 2372 | size &= ~(BYTES_PER_WORD - 1); |
2395 | } | 2373 | } |
2396 | 2374 | ||
2397 | /* calculate the final buffer alignment: */ | ||
2398 | |||
2399 | /* 1) arch recommendation: can be overridden for debug */ | ||
2400 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
2401 | /* | ||
2402 | * Default alignment: as specified by the arch code. Except if | ||
2403 | * an object is really small, then squeeze multiple objects into | ||
2404 | * one cacheline. | ||
2405 | */ | ||
2406 | ralign = cache_line_size(); | ||
2407 | while (size <= ralign / 2) | ||
2408 | ralign /= 2; | ||
2409 | } else { | ||
2410 | ralign = BYTES_PER_WORD; | ||
2411 | } | ||
2412 | |||
2413 | /* | 2375 | /* |
2414 | * Redzoning and user store require word alignment or possibly larger. | 2376 | * Redzoning and user store require word alignment or possibly larger. |
2415 | * Note this will be overridden by architecture or caller mandated | 2377 | * Note this will be overridden by architecture or caller mandated |
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2426 | size &= ~(REDZONE_ALIGN - 1); | 2388 | size &= ~(REDZONE_ALIGN - 1); |
2427 | } | 2389 | } |
2428 | 2390 | ||
2429 | /* 2) arch mandated alignment */ | ||
2430 | if (ralign < ARCH_SLAB_MINALIGN) { | ||
2431 | ralign = ARCH_SLAB_MINALIGN; | ||
2432 | } | ||
2433 | /* 3) caller mandated alignment */ | 2391 | /* 3) caller mandated alignment */ |
2434 | if (ralign < cachep->align) { | 2392 | if (ralign < cachep->align) { |
2435 | ralign = cachep->align; | 2393 | ralign = cachep->align; |
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2447 | else | 2405 | else |
2448 | gfp = GFP_NOWAIT; | 2406 | gfp = GFP_NOWAIT; |
2449 | 2407 | ||
2450 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | 2408 | setup_nodelists_pointer(cachep); |
2451 | #if DEBUG | 2409 | #if DEBUG |
2452 | 2410 | ||
2453 | /* | 2411 | /* |
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2566 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | 2524 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); |
2567 | 2525 | ||
2568 | slab_set_debugobj_lock_classes(cachep); | 2526 | slab_set_debugobj_lock_classes(cachep); |
2569 | } | 2527 | } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) |
2528 | on_slab_lock_classes(cachep); | ||
2570 | 2529 | ||
2571 | return 0; | 2530 | return 0; |
2572 | } | 2531 | } |
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3530 | if (slab_should_failslab(cachep, flags)) | 3489 | if (slab_should_failslab(cachep, flags)) |
3531 | return NULL; | 3490 | return NULL; |
3532 | 3491 | ||
3492 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3493 | |||
3533 | cache_alloc_debugcheck_before(cachep, flags); | 3494 | cache_alloc_debugcheck_before(cachep, flags); |
3534 | local_irq_save(save_flags); | 3495 | local_irq_save(save_flags); |
3535 | 3496 | ||
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
3615 | if (slab_should_failslab(cachep, flags)) | 3576 | if (slab_should_failslab(cachep, flags)) |
3616 | return NULL; | 3577 | return NULL; |
3617 | 3578 | ||
3579 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3580 | |||
3618 | cache_alloc_debugcheck_before(cachep, flags); | 3581 | cache_alloc_debugcheck_before(cachep, flags); |
3619 | local_irq_save(save_flags); | 3582 | local_irq_save(save_flags); |
3620 | objp = __do_cache_alloc(cachep, flags); | 3583 | objp = __do_cache_alloc(cachep, flags); |
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc); | |||
3928 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) | 3891 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) |
3929 | { | 3892 | { |
3930 | unsigned long flags; | 3893 | unsigned long flags; |
3894 | cachep = cache_from_obj(cachep, objp); | ||
3895 | if (!cachep) | ||
3896 | return; | ||
3931 | 3897 | ||
3932 | local_irq_save(flags); | 3898 | local_irq_save(flags); |
3933 | debug_check_no_locks_freed(objp, cachep->object_size); | 3899 | debug_check_no_locks_freed(objp, cachep->object_size); |
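kmem_cache_free() now resolves the cache from the object first, via the cache_from_obj() helper added to mm/slab.h further down. The main point is kmemcg: an object handed out by a per-memcg child cache can be freed through the root cache pointer and still lands in the child, because page->slab_cache identifies the real owner. As a side effect, a genuinely mismatched free gets reported. A hypothetical module-style sketch (invented cache names, error paths trimmed) of the bug class that is now flagged:

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *cache_a, *cache_b;

static int __init wrongfree_init(void)
{
        void *obj;

        cache_a = kmem_cache_create("demo_a", 128, 0, 0, NULL);
        cache_b = kmem_cache_create("demo_b", 128, 0, 0, NULL);
        if (!cache_a || !cache_b)
                return -ENOMEM;

        obj = kmem_cache_alloc(cache_a, GFP_KERNEL);
        /*
         * Bug: obj belongs to demo_a.  When the check is active (kmemcg
         * enabled, or SLAB_DEBUG_FREE set on the cache), cache_from_obj()
         * logs "Wrong slab cache. demo_b but object is from demo_a" and
         * WARNs once; the free still proceeds against the passed-in cache,
         * but the corruption is no longer silent.
         */
        if (obj)
                kmem_cache_free(cache_b, obj);
        return 0;
}

static void __exit wrongfree_exit(void)
{
        kmem_cache_destroy(cache_b);
        kmem_cache_destroy(cache_a);
}

module_init(wrongfree_init);
module_exit(wrongfree_exit);
MODULE_LICENSE("GPL");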
@@ -3969,12 +3935,6 @@ void kfree(const void *objp) | |||
3969 | } | 3935 | } |
3970 | EXPORT_SYMBOL(kfree); | 3936 | EXPORT_SYMBOL(kfree); |
3971 | 3937 | ||
3972 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | ||
3973 | { | ||
3974 | return cachep->object_size; | ||
3975 | } | ||
3976 | EXPORT_SYMBOL(kmem_cache_size); | ||
3977 | |||
3978 | /* | 3938 | /* |
3979 | * This initializes kmem_list3 or resizes various caches for all nodes. | 3939 | * This initializes kmem_list3 or resizes various caches for all nodes. |
3980 | */ | 3940 | */ |
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info) | |||
4081 | } | 4041 | } |
4082 | 4042 | ||
4083 | /* Always called with the slab_mutex held */ | 4043 | /* Always called with the slab_mutex held */ |
4084 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 4044 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, |
4085 | int batchcount, int shared, gfp_t gfp) | 4045 | int batchcount, int shared, gfp_t gfp) |
4086 | { | 4046 | { |
4087 | struct ccupdate_struct *new; | 4047 | struct ccupdate_struct *new; |
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
4124 | return alloc_kmemlist(cachep, gfp); | 4084 | return alloc_kmemlist(cachep, gfp); |
4125 | } | 4085 | } |
4126 | 4086 | ||
4087 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | ||
4088 | int batchcount, int shared, gfp_t gfp) | ||
4089 | { | ||
4090 | int ret; | ||
4091 | struct kmem_cache *c = NULL; | ||
4092 | int i = 0; | ||
4093 | |||
4094 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | ||
4095 | |||
4096 | if (slab_state < FULL) | ||
4097 | return ret; | ||
4098 | |||
4099 | if ((ret < 0) || !is_root_cache(cachep)) | ||
4100 | return ret; | ||
4101 | |||
4102 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | ||
4103 | for_each_memcg_cache_index(i) { | ||
4104 | c = cache_from_memcg(cachep, i); | ||
4105 | if (c) | ||
4106 | /* return value determined by the parent cache only */ | ||
4107 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | ||
4108 | } | ||
4109 | |||
4110 | return ret; | ||
4111 | } | ||
4112 | |||
4127 | /* Called with slab_mutex held always */ | 4113 | /* Called with slab_mutex held always */ |
4128 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | 4114 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
4129 | { | 4115 | { |
4130 | int err; | 4116 | int err; |
4131 | int limit, shared; | 4117 | int limit = 0; |
4118 | int shared = 0; | ||
4119 | int batchcount = 0; | ||
4120 | |||
4121 | if (!is_root_cache(cachep)) { | ||
4122 | struct kmem_cache *root = memcg_root_cache(cachep); | ||
4123 | limit = root->limit; | ||
4124 | shared = root->shared; | ||
4125 | batchcount = root->batchcount; | ||
4126 | } | ||
4132 | 4127 | ||
4128 | if (limit && shared && batchcount) | ||
4129 | goto skip_setup; | ||
4133 | /* | 4130 | /* |
4134 | * The head array serves three purposes: | 4131 | * The head array serves three purposes: |
4135 | * - create a LIFO ordering, i.e. return objects that are cache-warm | 4132 | * - create a LIFO ordering, i.e. return objects that are cache-warm |
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4171 | if (limit > 32) | 4168 | if (limit > 32) |
4172 | limit = 32; | 4169 | limit = 32; |
4173 | #endif | 4170 | #endif |
4174 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); | 4171 | batchcount = (limit + 1) / 2; |
4172 | skip_setup: | ||
4173 | err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | ||
4175 | if (err) | 4174 | if (err) |
4176 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 4175 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
4177 | cachep->name, -err); | 4176 | cachep->name, -err); |
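enable_cpucache() now reuses the root cache's limit/batchcount/shared for memcg child caches and jumps to skip_setup, while do_tune_cpucache() pushes any retune of a root cache down to every child via cache_from_memcg(). The same path is reached from userspace through slabinfo_write(), which still accepts the classic "name limit batchcount shared" tuning line; a sketch (run as root; the values are arbitrary and "dentry" is just an example cache name):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* the three numbers map to the "tunables" columns of /proc/slabinfo */
        static const char cmd[] = "dentry 256 128 8\n";
        int fd = open("/proc/slabinfo", O_WRONLY);

        if (fd < 0)
                return 1;
        if (write(fd, cmd, strlen(cmd)) != (ssize_t)(sizeof(cmd) - 1))
                return 1;
        close(fd);
        return 0;
}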
@@ -4276,54 +4275,8 @@ out: | |||
4276 | } | 4275 | } |
4277 | 4276 | ||
4278 | #ifdef CONFIG_SLABINFO | 4277 | #ifdef CONFIG_SLABINFO |
4279 | | 4278 | void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) |
4280 | static void print_slabinfo_header(struct seq_file *m) | ||
4281 | { | ||
4282 | /* | ||
4283 | * Output format version, so at least we can change it | ||
4284 | * without _too_ many complaints. | ||
4285 | */ | ||
4286 | #if STATS | ||
4287 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); | ||
4288 | #else | ||
4289 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
4290 | #endif | ||
4291 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | ||
4292 | "<objperslab> <pagesperslab>"); | ||
4293 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
4294 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
4295 | #if STATS | ||
4296 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " | ||
4297 | "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); | ||
4298 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); | ||
4299 | #endif | ||
4300 | seq_putc(m, '\n'); | ||
4301 | } | ||
4302 | |||
4303 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
4304 | { | ||
4305 | loff_t n = *pos; | ||
4306 | |||
4307 | mutex_lock(&slab_mutex); | ||
4308 | if (!n) | ||
4309 | print_slabinfo_header(m); | ||
4310 | |||
4311 | return seq_list_start(&slab_caches, *pos); | ||
4312 | } | ||
4313 | |||
4314 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
4315 | { | 4279 | { |
4316 | return seq_list_next(p, &slab_caches, pos); | ||
4317 | } | ||
4318 | |||
4319 | static void s_stop(struct seq_file *m, void *p) | ||
4320 | { | ||
4321 | mutex_unlock(&slab_mutex); | ||
4322 | } | ||
4323 | |||
4324 | static int s_show(struct seq_file *m, void *p) | ||
4325 | { | ||
4326 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); | ||
4327 | struct slab *slabp; | 4280 | struct slab *slabp; |
4328 | unsigned long active_objs; | 4281 | unsigned long active_objs; |
4329 | unsigned long num_objs; | 4282 | unsigned long num_objs; |
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p) | |||
4378 | if (error) | 4331 | if (error) |
4379 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 4332 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
4380 | 4333 | ||
4381 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 4334 | sinfo->active_objs = active_objs; |
4382 | name, active_objs, num_objs, cachep->size, | 4335 | sinfo->num_objs = num_objs; |
4383 | cachep->num, (1 << cachep->gfporder)); | 4336 | sinfo->active_slabs = active_slabs; |
4384 | seq_printf(m, " : tunables %4u %4u %4u", | 4337 | sinfo->num_slabs = num_slabs; |
4385 | cachep->limit, cachep->batchcount, cachep->shared); | 4338 | sinfo->shared_avail = shared_avail; |
4386 | seq_printf(m, " : slabdata %6lu %6lu %6lu", | 4339 | sinfo->limit = cachep->limit; |
4387 | active_slabs, num_slabs, shared_avail); | 4340 | sinfo->batchcount = cachep->batchcount; |
4341 | sinfo->shared = cachep->shared; | ||
4342 | sinfo->objects_per_slab = cachep->num; | ||
4343 | sinfo->cache_order = cachep->gfporder; | ||
4344 | } | ||
4345 | |||
4346 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) | ||
4347 | { | ||
4388 | #if STATS | 4348 | #if STATS |
4389 | { /* list3 stats */ | 4349 | { /* list3 stats */ |
4390 | unsigned long high = cachep->high_mark; | 4350 | unsigned long high = cachep->high_mark; |
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p) | |||
4414 | allochit, allocmiss, freehit, freemiss); | 4374 | allochit, allocmiss, freehit, freemiss); |
4415 | } | 4375 | } |
4416 | #endif | 4376 | #endif |
4417 | seq_putc(m, '\n'); | ||
4418 | return 0; | ||
4419 | } | 4377 | } |
4420 | 4378 | ||
4421 | /* | ||
4422 | * slabinfo_op - iterator that generates /proc/slabinfo | ||
4423 | * | ||
4424 | * Output layout: | ||
4425 | * cache-name | ||
4426 | * num-active-objs | ||
4427 | * total-objs | ||
4428 | * object size | ||
4429 | * num-active-slabs | ||
4430 | * total-slabs | ||
4431 | * num-pages-per-slab | ||
4432 | * + further values on SMP and with statistics enabled | ||
4433 | */ | ||
4434 | |||
4435 | static const struct seq_operations slabinfo_op = { | ||
4436 | .start = s_start, | ||
4437 | .next = s_next, | ||
4438 | .stop = s_stop, | ||
4439 | .show = s_show, | ||
4440 | }; | ||
4441 | |||
4442 | #define MAX_SLABINFO_WRITE 128 | 4379 | #define MAX_SLABINFO_WRITE 128 |
4443 | /** | 4380 | /** |
4444 | * slabinfo_write - Tuning for the slab allocator | 4381 | * slabinfo_write - Tuning for the slab allocator |
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = { | |||
4447 | * @count: data length | 4384 | * @count: data length |
4448 | * @ppos: unused | 4385 | * @ppos: unused |
4449 | */ | 4386 | */ |
4450 | static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 4387 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
4451 | size_t count, loff_t *ppos) | 4388 | size_t count, loff_t *ppos) |
4452 | { | 4389 | { |
4453 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; | 4390 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4490 | return res; | 4427 | return res; |
4491 | } | 4428 | } |
4492 | 4429 | ||
4493 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
4494 | { | ||
4495 | return seq_open(file, &slabinfo_op); | ||
4496 | } | ||
4497 | |||
4498 | static const struct file_operations proc_slabinfo_operations = { | ||
4499 | .open = slabinfo_open, | ||
4500 | .read = seq_read, | ||
4501 | .write = slabinfo_write, | ||
4502 | .llseek = seq_lseek, | ||
4503 | .release = seq_release, | ||
4504 | }; | ||
4505 | |||
4506 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4430 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4507 | 4431 | ||
4508 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4432 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4631 | return 0; | 4555 | return 0; |
4632 | } | 4556 | } |
4633 | 4557 | ||
4558 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
4559 | { | ||
4560 | return seq_list_next(p, &slab_caches, pos); | ||
4561 | } | ||
4562 | |||
4563 | static void s_stop(struct seq_file *m, void *p) | ||
4564 | { | ||
4565 | mutex_unlock(&slab_mutex); | ||
4566 | } | ||
4567 | |||
4634 | static const struct seq_operations slabstats_op = { | 4568 | static const struct seq_operations slabstats_op = { |
4635 | .start = leaks_start, | 4569 | .start = leaks_start, |
4636 | .next = s_next, | 4570 | .next = s_next, |
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = { | |||
4665 | 4599 | ||
4666 | static int __init slab_proc_init(void) | 4600 | static int __init slab_proc_init(void) |
4667 | { | 4601 | { |
4668 | proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); | ||
4669 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4602 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4670 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); | 4603 | proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); |
4671 | #endif | 4604 | #endif |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -32,19 +32,201 @@ extern struct list_head slab_caches; | |||
32 | /* The slab cache that manages slab cache information */ | 32 | /* The slab cache that manages slab cache information */ |
33 | extern struct kmem_cache *kmem_cache; | 33 | extern struct kmem_cache *kmem_cache; |
34 | 34 | ||
35 | unsigned long calculate_alignment(unsigned long flags, | ||
36 | unsigned long align, unsigned long size); | ||
37 | |||
35 | /* Functions provided by the slab allocators */ | 38 | /* Functions provided by the slab allocators */ |
36 | extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); | 39 | extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); |
37 | 40 | ||
41 | extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, | ||
42 | unsigned long flags); | ||
43 | extern void create_boot_cache(struct kmem_cache *, const char *name, | ||
44 | size_t size, unsigned long flags); | ||
45 | |||
46 | struct mem_cgroup; | ||
38 | #ifdef CONFIG_SLUB | 47 | #ifdef CONFIG_SLUB |
39 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 48 | struct kmem_cache * |
40 | size_t align, unsigned long flags, void (*ctor)(void *)); | 49 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
50 | size_t align, unsigned long flags, void (*ctor)(void *)); | ||
41 | #else | 51 | #else |
42 | static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 52 | static inline struct kmem_cache * |
43 | size_t align, unsigned long flags, void (*ctor)(void *)) | 53 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
54 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
44 | { return NULL; } | 55 | { return NULL; } |
45 | #endif | 56 | #endif |
46 | 57 | ||
47 | 58 | ||
59 | /* Legal flag mask for kmem_cache_create(), for various configurations */ | ||
60 | #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ | ||
61 | SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) | ||
62 | |||
63 | #if defined(CONFIG_DEBUG_SLAB) | ||
64 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | ||
65 | #elif defined(CONFIG_SLUB_DEBUG) | ||
66 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
67 | SLAB_TRACE | SLAB_DEBUG_FREE) | ||
68 | #else | ||
69 | #define SLAB_DEBUG_FLAGS (0) | ||
70 | #endif | ||
71 | |||
72 | #if defined(CONFIG_SLAB) | ||
73 | #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ | ||
74 | SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) | ||
75 | #elif defined(CONFIG_SLUB) | ||
76 | #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ | ||
77 | SLAB_TEMPORARY | SLAB_NOTRACK) | ||
78 | #else | ||
79 | #define SLAB_CACHE_FLAGS (0) | ||
80 | #endif | ||
81 | |||
82 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | ||
83 | |||
48 | int __kmem_cache_shutdown(struct kmem_cache *); | 84 | int __kmem_cache_shutdown(struct kmem_cache *); |
49 | 85 | ||
86 | struct seq_file; | ||
87 | struct file; | ||
88 | |||
89 | struct slabinfo { | ||
90 | unsigned long active_objs; | ||
91 | unsigned long num_objs; | ||
92 | unsigned long active_slabs; | ||
93 | unsigned long num_slabs; | ||
94 | unsigned long shared_avail; | ||
95 | unsigned int limit; | ||
96 | unsigned int batchcount; | ||
97 | unsigned int shared; | ||
98 | unsigned int objects_per_slab; | ||
99 | unsigned int cache_order; | ||
100 | }; | ||
101 | |||
102 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); | ||
103 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); | ||
104 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | ||
105 | size_t count, loff_t *ppos); | ||
106 | |||
107 | #ifdef CONFIG_MEMCG_KMEM | ||
108 | static inline bool is_root_cache(struct kmem_cache *s) | ||
109 | { | ||
110 | return !s->memcg_params || s->memcg_params->is_root_cache; | ||
111 | } | ||
112 | |||
113 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
114 | struct mem_cgroup *memcg) | ||
115 | { | ||
116 | return (is_root_cache(cachep) && !memcg) || | ||
117 | (cachep->memcg_params->memcg == memcg); | ||
118 | } | ||
119 | |||
120 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
121 | { | ||
122 | if (!is_root_cache(s)) | ||
123 | atomic_add(1 << order, &s->memcg_params->nr_pages); | ||
124 | } | ||
125 | |||
126 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
127 | { | ||
128 | if (is_root_cache(s)) | ||
129 | return; | ||
130 | |||
131 | if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) | ||
132 | mem_cgroup_destroy_cache(s); | ||
133 | } | ||
134 | |||
135 | static inline bool slab_equal_or_root(struct kmem_cache *s, | ||
136 | struct kmem_cache *p) | ||
137 | { | ||
138 | return (p == s) || | ||
139 | (s->memcg_params && (p == s->memcg_params->root_cache)); | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * We use suffixes to the name in memcg because we can't have caches | ||
144 | * created in the system with the same name. But when we print them | ||
145 | * locally, better refer to them with the base name | ||
146 | */ | ||
147 | static inline const char *cache_name(struct kmem_cache *s) | ||
148 | { | ||
149 | if (!is_root_cache(s)) | ||
150 | return s->memcg_params->root_cache->name; | ||
151 | return s->name; | ||
152 | } | ||
153 | |||
154 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | ||
155 | { | ||
156 | return s->memcg_params->memcg_caches[idx]; | ||
157 | } | ||
158 | |||
159 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | ||
160 | { | ||
161 | if (is_root_cache(s)) | ||
162 | return s; | ||
163 | return s->memcg_params->root_cache; | ||
164 | } | ||
165 | #else | ||
166 | static inline bool is_root_cache(struct kmem_cache *s) | ||
167 | { | ||
168 | return true; | ||
169 | } | ||
170 | |||
171 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
172 | struct mem_cgroup *memcg) | ||
173 | { | ||
174 | return true; | ||
175 | } | ||
176 | |||
177 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
178 | { | ||
179 | } | ||
180 | |||
181 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
182 | { | ||
183 | } | ||
184 | |||
185 | static inline bool slab_equal_or_root(struct kmem_cache *s, | ||
186 | struct kmem_cache *p) | ||
187 | { | ||
188 | return true; | ||
189 | } | ||
190 | |||
191 | static inline const char *cache_name(struct kmem_cache *s) | ||
192 | { | ||
193 | return s->name; | ||
194 | } | ||
195 | |||
196 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | ||
197 | { | ||
198 | return NULL; | ||
199 | } | ||
200 | |||
201 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | ||
202 | { | ||
203 | return s; | ||
204 | } | ||
205 | #endif | ||
206 | |||
207 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | ||
208 | { | ||
209 | struct kmem_cache *cachep; | ||
210 | struct page *page; | ||
211 | |||
212 | /* | ||
213 | * When kmemcg is not being used, both assignments should return the | ||
214 | * same value, but we don't want to pay the assignment price in that | ||
215 | * case. If it is not compiled in, the compiler should be smart enough | ||
216 | * to not do even the assignment. In that case, slab_equal_or_root | ||
217 | * will also be a constant. | ||
218 | */ | ||
219 | if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) | ||
220 | return s; | ||
221 | |||
222 | page = virt_to_head_page(x); | ||
223 | cachep = page->slab_cache; | ||
224 | if (slab_equal_or_root(cachep, s)) | ||
225 | return cachep; | ||
226 | |||
227 | pr_err("%s: Wrong slab cache. %s but object is from %s\n", | ||
228 | __FUNCTION__, cachep->name, s->name); | ||
229 | WARN_ON_ONCE(1); | ||
230 | return s; | ||
231 | } | ||
50 | #endif | 232 | #endif |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 069a24e64403..3f3cd97d3fdf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -13,9 +13,12 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
16 | #include <linux/seq_file.h> | ||
17 | #include <linux/proc_fs.h> | ||
16 | #include <asm/cacheflush.h> | 18 | #include <asm/cacheflush.h> |
17 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
18 | #include <asm/page.h> | 20 | #include <asm/page.h> |
21 | #include <linux/memcontrol.h> | ||
19 | 22 | ||
20 | #include "slab.h" | 23 | #include "slab.h" |
21 | 24 | ||
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); | |||
25 | struct kmem_cache *kmem_cache; | 28 | struct kmem_cache *kmem_cache; |
26 | 29 | ||
27 | #ifdef CONFIG_DEBUG_VM | 30 | #ifdef CONFIG_DEBUG_VM |
28 | static int kmem_cache_sanity_check(const char *name, size_t size) | 31 | static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, |
32 | size_t size) | ||
29 | { | 33 | { |
30 | struct kmem_cache *s = NULL; | 34 | struct kmem_cache *s = NULL; |
31 | 35 | ||
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) | |||
51 | continue; | 55 | continue; |
52 | } | 56 | } |
53 | 57 | ||
54 | if (!strcmp(s->name, name)) { | 58 | /* |
59 | * For simplicity, we won't check this in the list of memcg | ||
60 | * caches. We have control over memcg naming, and if there | ||
61 | * aren't duplicates in the global list, there won't be any | ||
62 | * duplicates in the memcg lists as well. | ||
63 | */ | ||
64 | if (!memcg && !strcmp(s->name, name)) { | ||
55 | pr_err("%s (%s): Cache name already exists.\n", | 65 | pr_err("%s (%s): Cache name already exists.\n", |
56 | __func__, name); | 66 | __func__, name); |
57 | dump_stack(); | 67 | dump_stack(); |
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size) | |||
64 | return 0; | 74 | return 0; |
65 | } | 75 | } |
66 | #else | 76 | #else |
67 | static inline int kmem_cache_sanity_check(const char *name, size_t size) | 77 | static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, |
78 | const char *name, size_t size) | ||
68 | { | 79 | { |
69 | return 0; | 80 | return 0; |
70 | } | 81 | } |
71 | #endif | 82 | #endif |
72 | 83 | ||
84 | #ifdef CONFIG_MEMCG_KMEM | ||
85 | int memcg_update_all_caches(int num_memcgs) | ||
86 | { | ||
87 | struct kmem_cache *s; | ||
88 | int ret = 0; | ||
89 | mutex_lock(&slab_mutex); | ||
90 | |||
91 | list_for_each_entry(s, &slab_caches, list) { | ||
92 | if (!is_root_cache(s)) | ||
93 | continue; | ||
94 | |||
95 | ret = memcg_update_cache_size(s, num_memcgs); | ||
96 | /* | ||
97 | * See comment in memcontrol.c, memcg_update_cache_size: | ||
98 | * Instead of freeing the memory, we'll just leave the caches | ||
99 | * up to this point in an updated state. | ||
100 | */ | ||
101 | if (ret) | ||
102 | goto out; | ||
103 | } | ||
104 | |||
105 | memcg_update_array_size(num_memcgs); | ||
106 | out: | ||
107 | mutex_unlock(&slab_mutex); | ||
108 | return ret; | ||
109 | } | ||
110 | #endif | ||
111 | |||
112 | /* | ||
113 | * Figure out what the alignment of the objects will be given a set of | ||
114 | * flags, a user specified alignment and the size of the objects. | ||
115 | */ | ||
116 | unsigned long calculate_alignment(unsigned long flags, | ||
117 | unsigned long align, unsigned long size) | ||
118 | { | ||
119 | /* | ||
120 | * If the user wants hardware cache aligned objects then follow that | ||
121 | * suggestion if the object is sufficiently large. | ||
122 | * | ||
123 | * The hardware cache alignment cannot override the specified | ||
124 | * alignment though. If that is greater then use it. | ||
125 | */ | ||
126 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
127 | unsigned long ralign = cache_line_size(); | ||
128 | while (size <= ralign / 2) | ||
129 | ralign /= 2; | ||
130 | align = max(align, ralign); | ||
131 | } | ||
132 | |||
133 | if (align < ARCH_SLAB_MINALIGN) | ||
134 | align = ARCH_SLAB_MINALIGN; | ||
135 | |||
136 | return ALIGN(align, sizeof(void *)); | ||
137 | } | ||
138 | |||
139 | |||
73 | /* | 140 | /* |
74 | * kmem_cache_create - Create a cache. | 141 | * kmem_cache_create - Create a cache. |
75 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 142 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
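calculate_alignment() is now the shared policy for the allocators: SLAB_HWCACHE_ALIGN is only honoured up to the point where several small objects still fit in one cache line, the architecture minimum is enforced, and the result is rounded to a multiple of sizeof(void *). A standalone re-implementation for illustration; the flag value, the 64-byte line and the minimum of 8 are stand-ins for the kernel's SLAB_HWCACHE_ALIGN, cache_line_size() and ARCH_SLAB_MINALIGN:

#include <stdio.h>

#define DEMO_HWCACHE_ALIGN      (1UL << 13)     /* stand-in flag bit */
#define DEMO_MINALIGN           8UL             /* stand-in arch minimum */

static unsigned long calc_align(unsigned long flags, unsigned long align,
                                unsigned long size)
{
        if (flags & DEMO_HWCACHE_ALIGN) {
                unsigned long ralign = 64;      /* assumed cache line */

                while (size <= ralign / 2)
                        ralign /= 2;
                if (ralign > align)
                        align = ralign;
        }
        if (align < DEMO_MINALIGN)
                align = DEMO_MINALIGN;
        return (align + sizeof(void *) - 1) & ~(sizeof(void *) - 1UL);
}

int main(void)
{
        /* a 24-byte object is squeezed into 32-byte slots,
         * a 200-byte object gets full cache-line alignment */
        printf("%lu %lu\n",
               calc_align(DEMO_HWCACHE_ALIGN, 0, 24),
               calc_align(DEMO_HWCACHE_ALIGN, 0, 200));
        return 0;
}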
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
95 | * as davem. | 162 | * as davem. |
96 | */ | 163 | */ |
97 | 164 | ||
98 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, | 165 | struct kmem_cache * |
99 | unsigned long flags, void (*ctor)(void *)) | 166 | kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, |
167 | size_t align, unsigned long flags, void (*ctor)(void *), | ||
168 | struct kmem_cache *parent_cache) | ||
100 | { | 169 | { |
101 | struct kmem_cache *s = NULL; | 170 | struct kmem_cache *s = NULL; |
102 | int err = 0; | 171 | int err = 0; |
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
104 | get_online_cpus(); | 173 | get_online_cpus(); |
105 | mutex_lock(&slab_mutex); | 174 | mutex_lock(&slab_mutex); |
106 | 175 | ||
107 | if (!kmem_cache_sanity_check(name, size) == 0) | 176 | if (!kmem_cache_sanity_check(memcg, name, size) == 0) |
108 | goto out_locked; | 177 | goto out_locked; |
109 | 178 | ||
179 | /* | ||
180 | * Some allocators will constraint the set of valid flags to a subset | ||
181 | * of all flags. We expect them to define CACHE_CREATE_MASK in this | ||
182 | * case, and we'll just provide them with a sanitized version of the | ||
183 | * passed flags. | ||
184 | */ | ||
185 | flags &= CACHE_CREATE_MASK; | ||
110 | 186 | ||
111 | s = __kmem_cache_alias(name, size, align, flags, ctor); | 187 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); |
112 | if (s) | 188 | if (s) |
113 | goto out_locked; | 189 | goto out_locked; |
114 | 190 | ||
115 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); | 191 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); |
116 | if (s) { | 192 | if (s) { |
117 | s->object_size = s->size = size; | 193 | s->object_size = s->size = size; |
118 | s->align = align; | 194 | s->align = calculate_alignment(flags, align, size); |
119 | s->ctor = ctor; | 195 | s->ctor = ctor; |
196 | |||
197 | if (memcg_register_cache(memcg, s, parent_cache)) { | ||
198 | kmem_cache_free(kmem_cache, s); | ||
199 | err = -ENOMEM; | ||
200 | goto out_locked; | ||
201 | } | ||
202 | |||
120 | s->name = kstrdup(name, GFP_KERNEL); | 203 | s->name = kstrdup(name, GFP_KERNEL); |
121 | if (!s->name) { | 204 | if (!s->name) { |
122 | kmem_cache_free(kmem_cache, s); | 205 | kmem_cache_free(kmem_cache, s); |
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
126 | 209 | ||
127 | err = __kmem_cache_create(s, flags); | 210 | err = __kmem_cache_create(s, flags); |
128 | if (!err) { | 211 | if (!err) { |
129 | |||
130 | s->refcount = 1; | 212 | s->refcount = 1; |
131 | list_add(&s->list, &slab_caches); | 213 | list_add(&s->list, &slab_caches); |
132 | | 214 | memcg_cache_list_add(memcg, s); |
133 | } else { | 215 | } else { |
134 | kfree(s->name); | 216 | kfree(s->name); |
135 | kmem_cache_free(kmem_cache, s); | 217 | kmem_cache_free(kmem_cache, s); |
@@ -157,10 +239,20 @@ out_locked: | |||
157 | 239 | ||
158 | return s; | 240 | return s; |
159 | } | 241 | } |
242 | |||
243 | struct kmem_cache * | ||
244 | kmem_cache_create(const char *name, size_t size, size_t align, | ||
245 | unsigned long flags, void (*ctor)(void *)) | ||
246 | { | ||
247 | return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); | ||
248 | } | ||
160 | EXPORT_SYMBOL(kmem_cache_create); | 249 | EXPORT_SYMBOL(kmem_cache_create); |
161 | 250 | ||
162 | void kmem_cache_destroy(struct kmem_cache *s) | 251 | void kmem_cache_destroy(struct kmem_cache *s) |
163 | { | 252 | { |
253 | /* Destroy all the children caches if we aren't a memcg cache */ | ||
254 | kmem_cache_destroy_memcg_children(s); | ||
255 | |||
164 | get_online_cpus(); | 256 | get_online_cpus(); |
165 | mutex_lock(&slab_mutex); | 257 | mutex_lock(&slab_mutex); |
166 | s->refcount--; | 258 | s->refcount--; |
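kmem_cache_create() itself is now just a wrapper that calls kmem_cache_create_memcg() with memcg and parent_cache set to NULL, so existing callers need no change; the memcg plumbing (memcg_register_cache(), memcg_cache_list_add()) and the flag sanitising through CACHE_CREATE_MASK all happen in the common path. A hypothetical driver-style sketch of the unchanged external API (names invented):

#include <linux/module.h>
#include <linux/slab.h>

struct demo_obj {
        int id;
        char payload[60];
};

static struct kmem_cache *demo_cachep;

static int __init demo_init(void)
{
        demo_cachep = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
                                        0, SLAB_HWCACHE_ALIGN, NULL);
        return demo_cachep ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
        kmem_cache_destroy(demo_cachep);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");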
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
172 | if (s->flags & SLAB_DESTROY_BY_RCU) | 264 | if (s->flags & SLAB_DESTROY_BY_RCU) |
173 | rcu_barrier(); | 265 | rcu_barrier(); |
174 | 266 | ||
267 | memcg_release_cache(s); | ||
175 | kfree(s->name); | 268 | kfree(s->name); |
176 | kmem_cache_free(kmem_cache, s); | 269 | kmem_cache_free(kmem_cache, s); |
177 | } else { | 270 | } else { |
@@ -192,3 +285,182 @@ int slab_is_available(void) | |||
192 | { | 285 | { |
193 | return slab_state >= UP; | 286 | return slab_state >= UP; |
194 | } | 287 | } |
288 | |||
289 | #ifndef CONFIG_SLOB | ||
290 | /* Create a cache during boot when no slab services are available yet */ | ||
291 | void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, | ||
292 | unsigned long flags) | ||
293 | { | ||
294 | int err; | ||
295 | |||
296 | s->name = name; | ||
297 | s->size = s->object_size = size; | ||
298 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); | ||
299 | err = __kmem_cache_create(s, flags); | ||
300 | |||
301 | if (err) | ||
302 | panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n", | ||
303 | name, size, err); | ||
304 | |||
305 | s->refcount = -1; /* Exempt from merging for now */ | ||
306 | } | ||
307 | |||
308 | struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, | ||
309 | unsigned long flags) | ||
310 | { | ||
311 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
312 | |||
313 | if (!s) | ||
314 | panic("Out of memory when creating slab %s\n", name); | ||
315 | |||
316 | create_boot_cache(s, name, size, flags); | ||
317 | list_add(&s->list, &slab_caches); | ||
318 | s->refcount = 1; | ||
319 | return s; | ||
320 | } | ||
321 | |||
322 | #endif /* !CONFIG_SLOB */ | ||
323 | |||
324 | |||
325 | #ifdef CONFIG_SLABINFO | ||
326 | void print_slabinfo_header(struct seq_file *m) | ||
327 | { | ||
328 | /* | ||
329 | * Output format version, so at least we can change it | ||
330 | * without _too_ many complaints. | ||
331 | */ | ||
332 | #ifdef CONFIG_DEBUG_SLAB | ||
333 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); | ||
334 | #else | ||
335 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
336 | #endif | ||
337 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | ||
338 | "<objperslab> <pagesperslab>"); | ||
339 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
340 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
341 | #ifdef CONFIG_DEBUG_SLAB | ||
342 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " | ||
343 | "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); | ||
344 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); | ||
345 | #endif | ||
346 | seq_putc(m, '\n'); | ||
347 | } | ||
348 | |||
349 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
350 | { | ||
351 | loff_t n = *pos; | ||
352 | |||
353 | mutex_lock(&slab_mutex); | ||
354 | if (!n) | ||
355 | print_slabinfo_header(m); | ||
356 | |||
357 | return seq_list_start(&slab_caches, *pos); | ||
358 | } | ||
359 | |||
360 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
361 | { | ||
362 | return seq_list_next(p, &slab_caches, pos); | ||
363 | } | ||
364 | |||
365 | static void s_stop(struct seq_file *m, void *p) | ||
366 | { | ||
367 | mutex_unlock(&slab_mutex); | ||
368 | } | ||
369 | |||
370 | static void | ||
371 | memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | ||
372 | { | ||
373 | struct kmem_cache *c; | ||
374 | struct slabinfo sinfo; | ||
375 | int i; | ||
376 | |||
377 | if (!is_root_cache(s)) | ||
378 | return; | ||
379 | |||
380 | for_each_memcg_cache_index(i) { | ||
381 | c = cache_from_memcg(s, i); | ||
382 | if (!c) | ||
383 | continue; | ||
384 | |||
385 | memset(&sinfo, 0, sizeof(sinfo)); | ||
386 | get_slabinfo(c, &sinfo); | ||
387 | |||
388 | info->active_slabs += sinfo.active_slabs; | ||
389 | info->num_slabs += sinfo.num_slabs; | ||
390 | info->shared_avail += sinfo.shared_avail; | ||
391 | info->active_objs += sinfo.active_objs; | ||
392 | info->num_objs += sinfo.num_objs; | ||
393 | } | ||
394 | } | ||
395 | |||
396 | int cache_show(struct kmem_cache *s, struct seq_file *m) | ||
397 | { | ||
398 | struct slabinfo sinfo; | ||
399 | |||
400 | memset(&sinfo, 0, sizeof(sinfo)); | ||
401 | get_slabinfo(s, &sinfo); | ||
402 | |||
403 | memcg_accumulate_slabinfo(s, &sinfo); | ||
404 | |||
405 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | ||
406 | cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, | ||
407 | sinfo.objects_per_slab, (1 << sinfo.cache_order)); | ||
408 | |||
409 | seq_printf(m, " : tunables %4u %4u %4u", | ||
410 | sinfo.limit, sinfo.batchcount, sinfo.shared); | ||
411 | seq_printf(m, " : slabdata %6lu %6lu %6lu", | ||
412 | sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); | ||
413 | slabinfo_show_stats(m, s); | ||
414 | seq_putc(m, '\n'); | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | static int s_show(struct seq_file *m, void *p) | ||
419 | { | ||
420 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | ||
421 | |||
422 | if (!is_root_cache(s)) | ||
423 | return 0; | ||
424 | return cache_show(s, m); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * slabinfo_op - iterator that generates /proc/slabinfo | ||
429 | * | ||
430 | * Output layout: | ||
431 | * cache-name | ||
432 | * num-active-objs | ||
433 | * total-objs | ||
434 | * object size | ||
435 | * num-active-slabs | ||
436 | * total-slabs | ||
437 | * num-pages-per-slab | ||
438 | * + further values on SMP and with statistics enabled | ||
439 | */ | ||
440 | static const struct seq_operations slabinfo_op = { | ||
441 | .start = s_start, | ||
442 | .next = s_next, | ||
443 | .stop = s_stop, | ||
444 | .show = s_show, | ||
445 | }; | ||
446 | |||
447 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
448 | { | ||
449 | return seq_open(file, &slabinfo_op); | ||
450 | } | ||
451 | |||
452 | static const struct file_operations proc_slabinfo_operations = { | ||
453 | .open = slabinfo_open, | ||
454 | .read = seq_read, | ||
455 | .write = slabinfo_write, | ||
456 | .llseek = seq_lseek, | ||
457 | .release = seq_release, | ||
458 | }; | ||
459 | |||
460 | static int __init slab_proc_init(void) | ||
461 | { | ||
462 | proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); | ||
463 | return 0; | ||
464 | } | ||
465 | module_init(slab_proc_init); | ||
466 | #endif /* CONFIG_SLABINFO */ | ||
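
The seq_file plumbing above is what defines the /proc/slabinfo ABI documented by print_slabinfo_header(). As a rough illustration of a consumer of that ABI, here is a minimal user-space sketch (not part of the patch) that parses the first few columns emitted by cache_show(); since the file is created with mode S_IRUSR it has to run as root.

    /* Minimal /proc/slabinfo reader -- illustrative only, not kernel code. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        FILE *f = fopen("/proc/slabinfo", "r");
        char line[512], name[64];
        unsigned long active_objs, num_objs;
        unsigned int objsize, objperslab, pagesperslab;

        if (!f) {
            perror("fopen");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            /* Skip the version line and the "# name ..." header. */
            if (line[0] == '#' || strncmp(line, "slabinfo", 8) == 0)
                continue;
            if (sscanf(line, "%63s %lu %lu %u %u %u", name, &active_objs,
                       &num_objs, &objsize, &objperslab, &pagesperslab) == 6)
                printf("%-20s %8lu/%-8lu objs of %u bytes\n",
                       name, active_objs, num_objs, objsize);
        }
        fclose(f);
        return 0;
    }
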
@@ -28,9 +28,8 @@ | |||
28 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. | 28 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. |
29 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls | 29 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls |
30 | * alloc_pages() directly, allocating compound pages so the page order | 30 | * alloc_pages() directly, allocating compound pages so the page order |
31 | * does not have to be separately tracked, and also stores the exact | 31 | * does not have to be separately tracked. |
32 | * allocation size in page->private so that it can be used to accurately | 32 | * These objects are detected in kfree() because PageSlab() |
33 | * provide ksize(). These objects are detected in kfree() because slob_page() | ||
34 | * is false for them. | 33 | * is false for them. |
35 | * | 34 | * |
36 | * SLAB is emulated on top of SLOB by simply calling constructors and | 35 | * SLAB is emulated on top of SLOB by simply calling constructors and |
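
The comment above describes SLOB's two allocation paths: small kmalloc objects carry a size header in front of the pointer handed back to the caller, while requests of PAGE_SIZE or more go straight to the page allocator as compound pages, which is why the page->private bookkeeping can be dropped in this patch. A toy user-space model of the small-object path, with a fixed 8-byte alignment standing in for max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN), might look like this:

    /* Toy model of SLOB's small-kmalloc size header; not kernel code. */
    #include <stdio.h>
    #include <stdlib.h>

    #define TOY_ALIGN 8    /* assumed minimum alignment */

    static void *toy_kmalloc(size_t size)
    {
        /* Reserve TOY_ALIGN bytes in front of the object for the header. */
        unsigned char *block = malloc(size + TOY_ALIGN);

        if (!block)
            return NULL;
        *(unsigned int *)block = (unsigned int)size;  /* record the request */
        return block + TOY_ALIGN;                     /* caller sees the object */
    }

    static size_t toy_ksize(const void *obj)
    {
        /* Walk back over the header, as ksize() does for slab-backed objects. */
        return *(const unsigned int *)((const unsigned char *)obj - TOY_ALIGN);
    }

    static void toy_kfree(void *obj)
    {
        free((unsigned char *)obj - TOY_ALIGN);
    }

    int main(void)
    {
        void *p = toy_kmalloc(100);

        printf("requested 100, header says %zu\n", toy_ksize(p));
        toy_kfree(p);
        return 0;
    }

The real ksize() additionally rounds the stored value up to whole SLOB_UNITs, as the hunk further down shows.
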
@@ -59,7 +58,6 @@ | |||
59 | 58 | ||
60 | #include <linux/kernel.h> | 59 | #include <linux/kernel.h> |
61 | #include <linux/slab.h> | 60 | #include <linux/slab.h> |
62 | #include "slab.h" | ||
63 | 61 | ||
64 | #include <linux/mm.h> | 62 | #include <linux/mm.h> |
65 | #include <linux/swap.h> /* struct reclaim_state */ | 63 | #include <linux/swap.h> /* struct reclaim_state */ |
@@ -74,6 +72,7 @@ | |||
74 | 72 | ||
75 | #include <linux/atomic.h> | 73 | #include <linux/atomic.h> |
76 | 74 | ||
75 | #include "slab.h" | ||
77 | /* | 76 | /* |
78 | * slob_block has a field 'units', which indicates size of block if +ve, | 77 | * slob_block has a field 'units', which indicates size of block if +ve, |
79 | * or offset of next block if -ve (in SLOB_UNITs). | 78 | * or offset of next block if -ve (in SLOB_UNITs). |
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp) | |||
124 | 123 | ||
125 | #define SLOB_UNIT sizeof(slob_t) | 124 | #define SLOB_UNIT sizeof(slob_t) |
126 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) | 125 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) |
127 | #define SLOB_ALIGN L1_CACHE_BYTES | ||
128 | 126 | ||
129 | /* | 127 | /* |
130 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | 128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which |
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) | |||
455 | if (likely(order)) | 453 | if (likely(order)) |
456 | gfp |= __GFP_COMP; | 454 | gfp |= __GFP_COMP; |
457 | ret = slob_new_pages(gfp, order, node); | 455 | ret = slob_new_pages(gfp, order, node); |
458 | if (ret) { | ||
459 | struct page *page; | ||
460 | page = virt_to_page(ret); | ||
461 | page->private = size; | ||
462 | } | ||
463 | 456 | ||
464 | trace_kmalloc_node(caller, ret, | 457 | trace_kmalloc_node(caller, ret, |
465 | size, PAGE_SIZE << order, gfp, node); | 458 | size, PAGE_SIZE << order, gfp, node); |
@@ -506,7 +499,7 @@ void kfree(const void *block) | |||
506 | unsigned int *m = (unsigned int *)(block - align); | 499 | unsigned int *m = (unsigned int *)(block - align); |
507 | slob_free(m, *m + align); | 500 | slob_free(m, *m + align); |
508 | } else | 501 | } else |
509 | put_page(sp); | 502 | __free_pages(sp, compound_order(sp)); |
510 | } | 503 | } |
511 | EXPORT_SYMBOL(kfree); | 504 | EXPORT_SYMBOL(kfree); |
512 | 505 | ||
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree); | |||
514 | size_t ksize(const void *block) | 507 | size_t ksize(const void *block) |
515 | { | 508 | { |
516 | struct page *sp; | 509 | struct page *sp; |
510 | int align; | ||
511 | unsigned int *m; | ||
517 | 512 | ||
518 | BUG_ON(!block); | 513 | BUG_ON(!block); |
519 | if (unlikely(block == ZERO_SIZE_PTR)) | 514 | if (unlikely(block == ZERO_SIZE_PTR)) |
520 | return 0; | 515 | return 0; |
521 | 516 | ||
522 | sp = virt_to_page(block); | 517 | sp = virt_to_page(block); |
523 | if (PageSlab(sp)) { | 518 | if (unlikely(!PageSlab(sp))) |
524 | int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 519 | return PAGE_SIZE << compound_order(sp); |
525 | unsigned int *m = (unsigned int *)(block - align); | 520 | |
526 | return SLOB_UNITS(*m) * SLOB_UNIT; | 521 | align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
527 | } else | 522 | m = (unsigned int *)(block - align); |
528 | return sp->private; | 523 | return SLOB_UNITS(*m) * SLOB_UNIT; |
529 | } | 524 | } |
530 | EXPORT_SYMBOL(ksize); | 525 | EXPORT_SYMBOL(ksize); |
531 | 526 | ||
532 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) | 527 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) |
533 | { | 528 | { |
534 | size_t align = c->size; | ||
535 | |||
536 | if (flags & SLAB_DESTROY_BY_RCU) { | 529 | if (flags & SLAB_DESTROY_BY_RCU) { |
537 | /* leave room for rcu footer at the end of object */ | 530 | /* leave room for rcu footer at the end of object */ |
538 | c->size += sizeof(struct slob_rcu); | 531 | c->size += sizeof(struct slob_rcu); |
539 | } | 532 | } |
540 | c->flags = flags; | 533 | c->flags = flags; |
541 | /* ignore alignment unless it's forced */ | ||
542 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
543 | if (c->align < ARCH_SLAB_MINALIGN) | ||
544 | c->align = ARCH_SLAB_MINALIGN; | ||
545 | if (c->align < align) | ||
546 | c->align = align; | ||
547 | |||
548 | return 0; | 534 | return 0; |
549 | } | 535 | } |
550 | 536 | ||
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
558 | 544 | ||
559 | if (c->size < PAGE_SIZE) { | 545 | if (c->size < PAGE_SIZE) { |
560 | b = slob_alloc(c->size, flags, c->align, node); | 546 | b = slob_alloc(c->size, flags, c->align, node); |
561 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | 547 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, |
562 | SLOB_UNITS(c->size) * SLOB_UNIT, | 548 | SLOB_UNITS(c->size) * SLOB_UNIT, |
563 | flags, node); | 549 | flags, node); |
564 | } else { | 550 | } else { |
565 | b = slob_new_pages(flags, get_order(c->size), node); | 551 | b = slob_new_pages(flags, get_order(c->size), node); |
566 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | 552 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, |
567 | PAGE_SIZE << get_order(c->size), | 553 | PAGE_SIZE << get_order(c->size), |
568 | flags, node); | 554 | flags, node); |
569 | } | 555 | } |
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) | |||
608 | } | 594 | } |
609 | EXPORT_SYMBOL(kmem_cache_free); | 595 | EXPORT_SYMBOL(kmem_cache_free); |
610 | 596 | ||
611 | unsigned int kmem_cache_size(struct kmem_cache *c) | ||
612 | { | ||
613 | return c->size; | ||
614 | } | ||
615 | EXPORT_SYMBOL(kmem_cache_size); | ||
616 | |||
617 | int __kmem_cache_shutdown(struct kmem_cache *c) | 597 | int __kmem_cache_shutdown(struct kmem_cache *c) |
618 | { | 598 | { |
619 | /* No way to check for remaining objects */ | 599 | /* No way to check for remaining objects */ |
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/fault-inject.h> | 31 | #include <linux/fault-inject.h> |
32 | #include <linux/stacktrace.h> | 32 | #include <linux/stacktrace.h> |
33 | #include <linux/prefetch.h> | 33 | #include <linux/prefetch.h> |
34 | #include <linux/memcontrol.h> | ||
34 | 35 | ||
35 | #include <trace/events/kmem.h> | 36 | #include <trace/events/kmem.h> |
36 | 37 | ||
@@ -112,9 +113,6 @@ | |||
112 | * the fast path and disables lockless freelists. | 113 | * the fast path and disables lockless freelists. |
113 | */ | 114 | */ |
114 | 115 | ||
115 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
116 | SLAB_TRACE | SLAB_DEBUG_FREE) | ||
117 | |||
118 | static inline int kmem_cache_debug(struct kmem_cache *s) | 116 | static inline int kmem_cache_debug(struct kmem_cache *s) |
119 | { | 117 | { |
120 | #ifdef CONFIG_SLUB_DEBUG | 118 | #ifdef CONFIG_SLUB_DEBUG |
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
179 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 177 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ |
180 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | 178 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ |
181 | 179 | ||
182 | static int kmem_size = sizeof(struct kmem_cache); | ||
183 | |||
184 | #ifdef CONFIG_SMP | 180 | #ifdef CONFIG_SMP |
185 | static struct notifier_block slab_notifier; | 181 | static struct notifier_block slab_notifier; |
186 | #endif | 182 | #endif |
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; | |||
205 | static int sysfs_slab_add(struct kmem_cache *); | 201 | static int sysfs_slab_add(struct kmem_cache *); |
206 | static int sysfs_slab_alias(struct kmem_cache *, const char *); | 202 | static int sysfs_slab_alias(struct kmem_cache *, const char *); |
207 | static void sysfs_slab_remove(struct kmem_cache *); | 203 | static void sysfs_slab_remove(struct kmem_cache *); |
208 | 204 | static void memcg_propagate_slab_attrs(struct kmem_cache *s); | |
209 | #else | 205 | #else |
210 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } | 206 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } |
211 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) | 207 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) |
212 | { return 0; } | 208 | { return 0; } |
213 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } | 209 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } |
214 | 210 | ||
211 | static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } | ||
215 | #endif | 212 | #endif |
216 | 213 | ||
217 | static inline void stat(const struct kmem_cache *s, enum stat_item si) | 214 | static inline void stat(const struct kmem_cache *s, enum stat_item si) |
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing( | |||
1092 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1089 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1093 | goto out; | 1090 | goto out; |
1094 | 1091 | ||
1095 | if (unlikely(s != page->slab)) { | 1092 | if (unlikely(s != page->slab_cache)) { |
1096 | if (!PageSlab(page)) { | 1093 | if (!PageSlab(page)) { |
1097 | slab_err(s, page, "Attempt to free object(0x%p) " | 1094 | slab_err(s, page, "Attempt to free object(0x%p) " |
1098 | "outside of slab", object); | 1095 | "outside of slab", object); |
1099 | } else if (!page->slab) { | 1096 | } else if (!page->slab_cache) { |
1100 | printk(KERN_ERR | 1097 | printk(KERN_ERR |
1101 | "SLUB <none>: no slab for object 0x%p.\n", | 1098 | "SLUB <none>: no slab for object 0x%p.\n", |
1102 | object); | 1099 | object); |
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1348 | void *start; | 1345 | void *start; |
1349 | void *last; | 1346 | void *last; |
1350 | void *p; | 1347 | void *p; |
1348 | int order; | ||
1351 | 1349 | ||
1352 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1350 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1353 | 1351 | ||
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1356 | if (!page) | 1354 | if (!page) |
1357 | goto out; | 1355 | goto out; |
1358 | 1356 | ||
1357 | order = compound_order(page); | ||
1359 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1358 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1360 | page->slab = s; | 1359 | memcg_bind_pages(s, order); |
1360 | page->slab_cache = s; | ||
1361 | __SetPageSlab(page); | 1361 | __SetPageSlab(page); |
1362 | if (page->pfmemalloc) | 1362 | if (page->pfmemalloc) |
1363 | SetPageSlabPfmemalloc(page); | 1363 | SetPageSlabPfmemalloc(page); |
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1365 | start = page_address(page); | 1365 | start = page_address(page); |
1366 | 1366 | ||
1367 | if (unlikely(s->flags & SLAB_POISON)) | 1367 | if (unlikely(s->flags & SLAB_POISON)) |
1368 | memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); | 1368 | memset(start, POISON_INUSE, PAGE_SIZE << order); |
1369 | 1369 | ||
1370 | last = start; | 1370 | last = start; |
1371 | for_each_object(p, s, start, page->objects) { | 1371 | for_each_object(p, s, start, page->objects) { |
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1406 | 1406 | ||
1407 | __ClearPageSlabPfmemalloc(page); | 1407 | __ClearPageSlabPfmemalloc(page); |
1408 | __ClearPageSlab(page); | 1408 | __ClearPageSlab(page); |
1409 | |||
1410 | memcg_release_pages(s, order); | ||
1409 | reset_page_mapcount(page); | 1411 | reset_page_mapcount(page); |
1410 | if (current->reclaim_state) | 1412 | if (current->reclaim_state) |
1411 | current->reclaim_state->reclaimed_slab += pages; | 1413 | current->reclaim_state->reclaimed_slab += pages; |
1412 | __free_pages(page, order); | 1414 | __free_memcg_kmem_pages(page, order); |
1413 | } | 1415 | } |
1414 | 1416 | ||
1415 | #define need_reserve_slab_rcu \ | 1417 | #define need_reserve_slab_rcu \ |
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h) | |||
1424 | else | 1426 | else |
1425 | page = container_of((struct list_head *)h, struct page, lru); | 1427 | page = container_of((struct list_head *)h, struct page, lru); |
1426 | 1428 | ||
1427 | __free_slab(page->slab, page); | 1429 | __free_slab(page->slab_cache, page); |
1428 | } | 1430 | } |
1429 | 1431 | ||
1430 | static void free_slab(struct kmem_cache *s, struct page *page) | 1432 | static void free_slab(struct kmem_cache *s, struct page *page) |
@@ -1872,12 +1874,14 @@ redo: | |||
1872 | /* | 1874 | /* |
1873 | * Unfreeze all the cpu partial slabs. | 1875 | * Unfreeze all the cpu partial slabs. |
1874 | * | 1876 | * |
1875 | * This function must be called with interrupt disabled. | 1877 | * This function must be called with interrupts disabled |
1878 | * for the cpu using c (or some other mechanism must be in place | ||
1879 | * to guarantee no concurrent accesses). | ||
1876 | */ | 1880 | */ |
1877 | static void unfreeze_partials(struct kmem_cache *s) | 1881 | static void unfreeze_partials(struct kmem_cache *s, |
1882 | struct kmem_cache_cpu *c) | ||
1878 | { | 1883 | { |
1879 | struct kmem_cache_node *n = NULL, *n2 = NULL; | 1884 | struct kmem_cache_node *n = NULL, *n2 = NULL; |
1880 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); | ||
1881 | struct page *page, *discard_page = NULL; | 1885 | struct page *page, *discard_page = NULL; |
1882 | 1886 | ||
1883 | while ((page = c->partial)) { | 1887 | while ((page = c->partial)) { |
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
1963 | * set to the per node partial list. | 1967 | * set to the per node partial list. |
1964 | */ | 1968 | */ |
1965 | local_irq_save(flags); | 1969 | local_irq_save(flags); |
1966 | unfreeze_partials(s); | 1970 | unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); |
1967 | local_irq_restore(flags); | 1971 | local_irq_restore(flags); |
1968 | oldpage = NULL; | 1972 | oldpage = NULL; |
1969 | pobjects = 0; | 1973 | pobjects = 0; |
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | |||
2006 | if (c->page) | 2010 | if (c->page) |
2007 | flush_slab(s, c); | 2011 | flush_slab(s, c); |
2008 | 2012 | ||
2009 | unfreeze_partials(s); | 2013 | unfreeze_partials(s, c); |
2010 | } | 2014 | } |
2011 | } | 2015 | } |
2012 | 2016 | ||
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, | |||
2325 | if (slab_pre_alloc_hook(s, gfpflags)) | 2329 | if (slab_pre_alloc_hook(s, gfpflags)) |
2326 | return NULL; | 2330 | return NULL; |
2327 | 2331 | ||
2332 | s = memcg_kmem_get_cache(s, gfpflags); | ||
2328 | redo: | 2333 | redo: |
2329 | 2334 | ||
2330 | /* | 2335 | /* |
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2459 | void *prior; | 2464 | void *prior; |
2460 | void **object = (void *)x; | 2465 | void **object = (void *)x; |
2461 | int was_frozen; | 2466 | int was_frozen; |
2462 | int inuse; | ||
2463 | struct page new; | 2467 | struct page new; |
2464 | unsigned long counters; | 2468 | unsigned long counters; |
2465 | struct kmem_cache_node *n = NULL; | 2469 | struct kmem_cache_node *n = NULL; |
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2472 | return; | 2476 | return; |
2473 | 2477 | ||
2474 | do { | 2478 | do { |
2479 | if (unlikely(n)) { | ||
2480 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2481 | n = NULL; | ||
2482 | } | ||
2475 | prior = page->freelist; | 2483 | prior = page->freelist; |
2476 | counters = page->counters; | 2484 | counters = page->counters; |
2477 | set_freepointer(s, object, prior); | 2485 | set_freepointer(s, object, prior); |
2478 | new.counters = counters; | 2486 | new.counters = counters; |
2479 | was_frozen = new.frozen; | 2487 | was_frozen = new.frozen; |
2480 | new.inuse--; | 2488 | new.inuse--; |
2481 | if ((!new.inuse || !prior) && !was_frozen && !n) { | 2489 | if ((!new.inuse || !prior) && !was_frozen) { |
2482 | 2490 | ||
2483 | if (!kmem_cache_debug(s) && !prior) | 2491 | if (!kmem_cache_debug(s) && !prior) |
2484 | 2492 | ||
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2503 | 2511 | ||
2504 | } | 2512 | } |
2505 | } | 2513 | } |
2506 | inuse = new.inuse; | ||
2507 | 2514 | ||
2508 | } while (!cmpxchg_double_slab(s, page, | 2515 | } while (!cmpxchg_double_slab(s, page, |
2509 | prior, counters, | 2516 | prior, counters, |
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2529 | return; | 2536 | return; |
2530 | } | 2537 | } |
2531 | 2538 | ||
2539 | if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) | ||
2540 | goto slab_empty; | ||
2541 | |||
2532 | /* | 2542 | /* |
2533 | * was_frozen may have been set after we acquired the list_lock in | 2543 | * Objects left in the slab. If it was not on the partial list before |
2534 | * an earlier loop. So we need to check it here again. | 2544 | * then add it. |
2535 | */ | 2545 | */ |
2536 | if (was_frozen) | 2546 | if (kmem_cache_debug(s) && unlikely(!prior)) { |
2537 | stat(s, FREE_FROZEN); | 2547 | remove_full(s, page); |
2538 | else { | 2548 | add_partial(n, page, DEACTIVATE_TO_TAIL); |
2539 | if (unlikely(!inuse && n->nr_partial > s->min_partial)) | 2549 | stat(s, FREE_ADD_PARTIAL); |
2540 | goto slab_empty; | ||
2541 | |||
2542 | /* | ||
2543 | * Objects left in the slab. If it was not on the partial list before | ||
2544 | * then add it. | ||
2545 | */ | ||
2546 | if (unlikely(!prior)) { | ||
2547 | remove_full(s, page); | ||
2548 | add_partial(n, page, DEACTIVATE_TO_TAIL); | ||
2549 | stat(s, FREE_ADD_PARTIAL); | ||
2550 | } | ||
2551 | } | 2550 | } |
2552 | spin_unlock_irqrestore(&n->list_lock, flags); | 2551 | spin_unlock_irqrestore(&n->list_lock, flags); |
2553 | return; | 2552 | return; |
@@ -2619,19 +2618,10 @@ redo: | |||
2619 | 2618 | ||
2620 | void kmem_cache_free(struct kmem_cache *s, void *x) | 2619 | void kmem_cache_free(struct kmem_cache *s, void *x) |
2621 | { | 2620 | { |
2622 | struct page *page; | 2621 | s = cache_from_obj(s, x); |
2623 | 2622 | if (!s) | |
2624 | page = virt_to_head_page(x); | ||
2625 | |||
2626 | if (kmem_cache_debug(s) && page->slab != s) { | ||
2627 | pr_err("kmem_cache_free: Wrong slab cache. %s but object" | ||
2628 | " is from %s\n", page->slab->name, s->name); | ||
2629 | WARN_ON_ONCE(1); | ||
2630 | return; | 2623 | return; |
2631 | } | 2624 | slab_free(s, virt_to_head_page(x), x, _RET_IP_); |
2632 | |||
2633 | slab_free(s, page, x, _RET_IP_); | ||
2634 | |||
2635 | trace_kmem_cache_free(_RET_IP_, x); | 2625 | trace_kmem_cache_free(_RET_IP_, x); |
2636 | } | 2626 | } |
2637 | EXPORT_SYMBOL(kmem_cache_free); | 2627 | EXPORT_SYMBOL(kmem_cache_free); |
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved) | |||
2769 | return -ENOSYS; | 2759 | return -ENOSYS; |
2770 | } | 2760 | } |
2771 | 2761 | ||
2772 | /* | ||
2773 | * Figure out what the alignment of the objects will be. | ||
2774 | */ | ||
2775 | static unsigned long calculate_alignment(unsigned long flags, | ||
2776 | unsigned long align, unsigned long size) | ||
2777 | { | ||
2778 | /* | ||
2779 | * If the user wants hardware cache aligned objects then follow that | ||
2780 | * suggestion if the object is sufficiently large. | ||
2781 | * | ||
2782 | * The hardware cache alignment cannot override the specified | ||
2783 | * alignment though. If that is greater then use it. | ||
2784 | */ | ||
2785 | if (flags & SLAB_HWCACHE_ALIGN) { | ||
2786 | unsigned long ralign = cache_line_size(); | ||
2787 | while (size <= ralign / 2) | ||
2788 | ralign /= 2; | ||
2789 | align = max(align, ralign); | ||
2790 | } | ||
2791 | |||
2792 | if (align < ARCH_SLAB_MINALIGN) | ||
2793 | align = ARCH_SLAB_MINALIGN; | ||
2794 | |||
2795 | return ALIGN(align, sizeof(void *)); | ||
2796 | } | ||
2797 | |||
2798 | static void | 2762 | static void |
2799 | init_kmem_cache_node(struct kmem_cache_node *n) | 2763 | init_kmem_cache_node(struct kmem_cache_node *n) |
2800 | { | 2764 | { |
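
The calculate_alignment() helper removed above encoded a small but subtle policy: SLAB_HWCACHE_ALIGN is only a hint, honoured when the object is large enough to make cache-line alignment worthwhile, and it never weakens an explicitly requested alignment. A standalone sketch of that rule is shown below, with an assumed 64-byte cache line and 8-byte minimum alignment standing in for cache_line_size() and ARCH_SLAB_MINALIGN; the policy itself is presumably just centralised in the common slab code rather than dropped.

    /* Sketch of the alignment policy removed from SLUB here. */
    #include <stdio.h>

    #define CACHE_LINE    64UL   /* assumed cache_line_size() */
    #define MINALIGN      8UL    /* assumed ARCH_SLAB_MINALIGN */
    #define HWCACHE_ALIGN 1UL    /* stand-in for SLAB_HWCACHE_ALIGN */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    static unsigned long calc_align(unsigned long flags, unsigned long align,
                                    unsigned long size)
    {
        if (flags & HWCACHE_ALIGN) {
            /* Shrink the cache-line hint until the object roughly fills it. */
            unsigned long ralign = CACHE_LINE;

            while (size <= ralign / 2)
                ralign /= 2;
            if (ralign > align)
                align = ralign;
        }
        if (align < MINALIGN)
            align = MINALIGN;
        return ALIGN_UP(align, sizeof(void *));
    }

    int main(void)
    {
        /* A 24-byte object only earns 32-byte alignment from the hint... */
        printf("24 bytes  -> align %lu\n", calc_align(HWCACHE_ALIGN, 0, 24));
        /* ...while a 100-byte object gets the full cache line. */
        printf("100 bytes -> align %lu\n", calc_align(HWCACHE_ALIGN, 0, 100));
        return 0;
    }
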
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2928 | { | 2892 | { |
2929 | unsigned long flags = s->flags; | 2893 | unsigned long flags = s->flags; |
2930 | unsigned long size = s->object_size; | 2894 | unsigned long size = s->object_size; |
2931 | unsigned long align = s->align; | ||
2932 | int order; | 2895 | int order; |
2933 | 2896 | ||
2934 | /* | 2897 | /* |
@@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3000 | #endif | 2963 | #endif |
3001 | 2964 | ||
3002 | /* | 2965 | /* |
3003 | * Determine the alignment based on various parameters that the | ||
3004 | * user specified and the dynamic determination of cache line size | ||
3005 | * on bootup. | ||
3006 | */ | ||
3007 | align = calculate_alignment(flags, align, s->object_size); | ||
3008 | s->align = align; | ||
3009 | |||
3010 | /* | ||
3011 | * SLUB stores one object immediately after another beginning from | 2966 | * SLUB stores one object immediately after another beginning from |
3012 | * offset 0. In order to align the objects we have to simply size | 2967 | * offset 0. In order to align the objects we have to simply size |
3013 | * each object to conform to the alignment. | 2968 | * each object to conform to the alignment. |
3014 | */ | 2969 | */ |
3015 | size = ALIGN(size, align); | 2970 | size = ALIGN(size, s->align); |
3016 | s->size = size; | 2971 | s->size = size; |
3017 | if (forced_order >= 0) | 2972 | if (forced_order >= 0) |
3018 | order = forced_order; | 2973 | order = forced_order; |
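
The layout rule itself is unchanged by this hunk: because SLUB packs objects back to back starting at offset 0, alignment is achieved purely by rounding each object's size up to s->align, now taken as precomputed instead of being recalculated here. A couple of illustrative (made-up) numbers, checked at compile time:

    /* Worked examples for size = ALIGN(size, s->align); numbers are made up. */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    /* A 52-byte object with 64-byte alignment is padded to one cache line. */
    _Static_assert(ALIGN_UP(52, 64) == 64, "52 -> 64");
    /* A 130-byte object with 64-byte alignment occupies three 64-byte units. */
    _Static_assert(ALIGN_UP(130, 64) == 192, "130 -> 192");
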
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3041 | s->max = s->oo; | 2996 | s->max = s->oo; |
3042 | 2997 | ||
3043 | return !!oo_objects(s->oo); | 2998 | return !!oo_objects(s->oo); |
3044 | |||
3045 | } | 2999 | } |
3046 | 3000 | ||
3047 | static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | 3001 | static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) |
@@ -3127,15 +3081,6 @@ error: | |||
3127 | return -EINVAL; | 3081 | return -EINVAL; |
3128 | } | 3082 | } |
3129 | 3083 | ||
3130 | /* | ||
3131 | * Determine the size of a slab object | ||
3132 | */ | ||
3133 | unsigned int kmem_cache_size(struct kmem_cache *s) | ||
3134 | { | ||
3135 | return s->object_size; | ||
3136 | } | ||
3137 | EXPORT_SYMBOL(kmem_cache_size); | ||
3138 | |||
3139 | static void list_slab_objects(struct kmem_cache *s, struct page *page, | 3084 | static void list_slab_objects(struct kmem_cache *s, struct page *page, |
3140 | const char *text) | 3085 | const char *text) |
3141 | { | 3086 | { |
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) | |||
3208 | { | 3153 | { |
3209 | int rc = kmem_cache_close(s); | 3154 | int rc = kmem_cache_close(s); |
3210 | 3155 | ||
3211 | if (!rc) | 3156 | if (!rc) { |
3157 | /* | ||
3158 | * We use the same locking strategy around sysfs_slab_add; see | ||
3159 | * __kmem_cache_create. Because this is pretty much the last | ||
3160 | * operation we do and the lock will be released shortly after | ||
3161 | * that in slab_common.c, we could just move sysfs_slab_remove | ||
3162 | * to a later point in common code. We should do that when we | ||
3163 | * have a common sysfs framework for all allocators. | ||
3164 | */ | ||
3165 | mutex_unlock(&slab_mutex); | ||
3212 | sysfs_slab_remove(s); | 3166 | sysfs_slab_remove(s); |
3167 | mutex_lock(&slab_mutex); | ||
3168 | } | ||
3213 | 3169 | ||
3214 | return rc; | 3170 | return rc; |
3215 | } | 3171 | } |
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str) | |||
3261 | 3217 | ||
3262 | __setup("slub_nomerge", setup_slub_nomerge); | 3218 | __setup("slub_nomerge", setup_slub_nomerge); |
3263 | 3219 | ||
3264 | static struct kmem_cache *__init create_kmalloc_cache(const char *name, | ||
3265 | int size, unsigned int flags) | ||
3266 | { | ||
3267 | struct kmem_cache *s; | ||
3268 | |||
3269 | s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
3270 | |||
3271 | s->name = name; | ||
3272 | s->size = s->object_size = size; | ||
3273 | s->align = ARCH_KMALLOC_MINALIGN; | ||
3274 | |||
3275 | /* | ||
3276 | * This function is called with IRQs disabled during early-boot on | ||
3277 | * single CPU so there's no need to take slab_mutex here. | ||
3278 | */ | ||
3279 | if (kmem_cache_open(s, flags)) | ||
3280 | goto panic; | ||
3281 | |||
3282 | list_add(&s->list, &slab_caches); | ||
3283 | return s; | ||
3284 | |||
3285 | panic: | ||
3286 | panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); | ||
3287 | return NULL; | ||
3288 | } | ||
3289 | |||
3290 | /* | 3220 | /* |
3291 | * Conversion table for small slabs sizes / 8 to the index in the | 3221 | * Conversion table for small slabs sizes / 8 to the index in the |
3292 | * kmalloc array. This is necessary for slabs < 192 since we have non power | 3222 | * kmalloc array. This is necessary for slabs < 192 since we have non power |
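
The conversion table mentioned in that comment exists because the small kmalloc caches are not all powers of two: 96- and 192-byte caches sit between the 64/128 and 128/256 ones. A simplified user-space model of the size-to-index mapping is sketched below; the exact table in slub.c may be laid out differently, this only illustrates the idea.

    /* Simplified model of mapping a small kmalloc size to a cache index. */
    #include <stdio.h>

    /* One slot per 8 bytes for sizes 1..192, filled so that 96- and 192-byte
     * requests land in their own caches instead of the 128/256 ones. */
    static int size_index[24];    /* slot = (size - 1) / 8 */

    static void build_table(void)
    {
        int i;

        for (i = 0; i < 24; i++) {
            int size = (i + 1) * 8;
            int idx, cache = 8;

            /* smallest power-of-two cache >= size is 2^idx */
            for (idx = 3; cache < size; idx++)
                cache <<= 1;
            if (size > 64 && size <= 96)
                idx = 1;          /* the 96-byte cache */
            else if (size > 128 && size <= 192)
                idx = 2;          /* the 192-byte cache */
            size_index[i] = idx;
        }
    }

    int main(void)
    {
        build_table();
        printf("kmalloc(72)  -> index %d\n", size_index[(72 - 1) / 8]);   /* 1 */
        printf("kmalloc(100) -> index %d\n", size_index[(100 - 1) / 8]);  /* 7 */
        printf("kmalloc(150) -> index %d\n", size_index[(150 - 1) / 8]);  /* 2 */
        return 0;
    }
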
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3372 | struct page *page; | 3302 | struct page *page; |
3373 | void *ptr = NULL; | 3303 | void *ptr = NULL; |
3374 | 3304 | ||
3375 | flags |= __GFP_COMP | __GFP_NOTRACK; | 3305 | flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; |
3376 | page = alloc_pages_node(node, flags, get_order(size)); | 3306 | page = alloc_pages_node(node, flags, get_order(size)); |
3377 | if (page) | 3307 | if (page) |
3378 | ptr = page_address(page); | 3308 | ptr = page_address(page); |
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object) | |||
3424 | return PAGE_SIZE << compound_order(page); | 3354 | return PAGE_SIZE << compound_order(page); |
3425 | } | 3355 | } |
3426 | 3356 | ||
3427 | return slab_ksize(page->slab); | 3357 | return slab_ksize(page->slab_cache); |
3428 | } | 3358 | } |
3429 | EXPORT_SYMBOL(ksize); | 3359 | EXPORT_SYMBOL(ksize); |
3430 | 3360 | ||
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x) | |||
3449 | } | 3379 | } |
3450 | 3380 | ||
3451 | slab_lock(page); | 3381 | slab_lock(page); |
3452 | if (on_freelist(page->slab, page, object)) { | 3382 | if (on_freelist(page->slab_cache, page, object)) { |
3453 | object_err(page->slab, page, object, "Object is on free-list"); | 3383 | object_err(page->slab_cache, page, object, "Object is on free-list"); |
3454 | rv = false; | 3384 | rv = false; |
3455 | } else { | 3385 | } else { |
3456 | rv = true; | 3386 | rv = true; |
@@ -3478,10 +3408,10 @@ void kfree(const void *x) | |||
3478 | if (unlikely(!PageSlab(page))) { | 3408 | if (unlikely(!PageSlab(page))) { |
3479 | BUG_ON(!PageCompound(page)); | 3409 | BUG_ON(!PageCompound(page)); |
3480 | kmemleak_free(x); | 3410 | kmemleak_free(x); |
3481 | __free_pages(page, compound_order(page)); | 3411 | __free_memcg_kmem_pages(page, compound_order(page)); |
3482 | return; | 3412 | return; |
3483 | } | 3413 | } |
3484 | slab_free(page->slab, page, object, _RET_IP_); | 3414 | slab_free(page->slab_cache, page, object, _RET_IP_); |
3485 | } | 3415 | } |
3486 | EXPORT_SYMBOL(kfree); | 3416 | EXPORT_SYMBOL(kfree); |
3487 | 3417 | ||
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3573 | struct memory_notify *marg = arg; | 3503 | struct memory_notify *marg = arg; |
3574 | int offline_node; | 3504 | int offline_node; |
3575 | 3505 | ||
3576 | offline_node = marg->status_change_nid; | 3506 | offline_node = marg->status_change_nid_normal; |
3577 | 3507 | ||
3578 | /* | 3508 | /* |
3579 | * If the node still has available memory. we need kmem_cache_node | 3509 | * If the node still has available memory. we need kmem_cache_node |
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3606 | struct kmem_cache_node *n; | 3536 | struct kmem_cache_node *n; |
3607 | struct kmem_cache *s; | 3537 | struct kmem_cache *s; |
3608 | struct memory_notify *marg = arg; | 3538 | struct memory_notify *marg = arg; |
3609 | int nid = marg->status_change_nid; | 3539 | int nid = marg->status_change_nid_normal; |
3610 | int ret = 0; | 3540 | int ret = 0; |
3611 | 3541 | ||
3612 | /* | 3542 | /* |
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self, | |||
3676 | 3606 | ||
3677 | /* | 3607 | /* |
3678 | * Used for early kmem_cache structures that were allocated using | 3608 | * Used for early kmem_cache structures that were allocated using |
3679 | * the page allocator | 3609 | * the page allocator. Allocate them properly then fix up the pointers |
3610 | * that may be pointing to the wrong kmem_cache structure. | ||
3680 | */ | 3611 | */ |
3681 | 3612 | ||
3682 | static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) | 3613 | static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) |
3683 | { | 3614 | { |
3684 | int node; | 3615 | int node; |
3616 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | ||
3685 | 3617 | ||
3686 | list_add(&s->list, &slab_caches); | 3618 | memcpy(s, static_cache, kmem_cache->object_size); |
3687 | s->refcount = -1; | ||
3688 | 3619 | ||
3689 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3620 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3690 | struct kmem_cache_node *n = get_node(s, node); | 3621 | struct kmem_cache_node *n = get_node(s, node); |
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) | |||
3692 | 3623 | ||
3693 | if (n) { | 3624 | if (n) { |
3694 | list_for_each_entry(p, &n->partial, lru) | 3625 | list_for_each_entry(p, &n->partial, lru) |
3695 | p->slab = s; | 3626 | p->slab_cache = s; |
3696 | 3627 | ||
3697 | #ifdef CONFIG_SLUB_DEBUG | 3628 | #ifdef CONFIG_SLUB_DEBUG |
3698 | list_for_each_entry(p, &n->full, lru) | 3629 | list_for_each_entry(p, &n->full, lru) |
3699 | p->slab = s; | 3630 | p->slab_cache = s; |
3700 | #endif | 3631 | #endif |
3701 | } | 3632 | } |
3702 | } | 3633 | } |
3634 | list_add(&s->list, &slab_caches); | ||
3635 | return s; | ||
3703 | } | 3636 | } |
3704 | 3637 | ||
3705 | void __init kmem_cache_init(void) | 3638 | void __init kmem_cache_init(void) |
3706 | { | 3639 | { |
3640 | static __initdata struct kmem_cache boot_kmem_cache, | ||
3641 | boot_kmem_cache_node; | ||
3707 | int i; | 3642 | int i; |
3708 | int caches = 0; | 3643 | int caches = 2; |
3709 | struct kmem_cache *temp_kmem_cache; | ||
3710 | int order; | ||
3711 | struct kmem_cache *temp_kmem_cache_node; | ||
3712 | unsigned long kmalloc_size; | ||
3713 | 3644 | ||
3714 | if (debug_guardpage_minorder()) | 3645 | if (debug_guardpage_minorder()) |
3715 | slub_max_order = 0; | 3646 | slub_max_order = 0; |
3716 | 3647 | ||
3717 | kmem_size = offsetof(struct kmem_cache, node) + | 3648 | kmem_cache_node = &boot_kmem_cache_node; |
3718 | nr_node_ids * sizeof(struct kmem_cache_node *); | 3649 | kmem_cache = &boot_kmem_cache; |
3719 | |||
3720 | /* Allocate two kmem_caches from the page allocator */ | ||
3721 | kmalloc_size = ALIGN(kmem_size, cache_line_size()); | ||
3722 | order = get_order(2 * kmalloc_size); | ||
3723 | kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); | ||
3724 | |||
3725 | /* | ||
3726 | * Must first have the slab cache available for the allocations of the | ||
3727 | * struct kmem_cache_node's. There is special bootstrap code in | ||
3728 | * kmem_cache_open for slab_state == DOWN. | ||
3729 | */ | ||
3730 | kmem_cache_node = (void *)kmem_cache + kmalloc_size; | ||
3731 | 3650 | ||
3732 | kmem_cache_node->name = "kmem_cache_node"; | 3651 | create_boot_cache(kmem_cache_node, "kmem_cache_node", |
3733 | kmem_cache_node->size = kmem_cache_node->object_size = | 3652 | sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); |
3734 | sizeof(struct kmem_cache_node); | ||
3735 | kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | ||
3736 | 3653 | ||
3737 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); | 3654 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); |
3738 | 3655 | ||
3739 | /* Able to allocate the per node structures */ | 3656 | /* Able to allocate the per node structures */ |
3740 | slab_state = PARTIAL; | 3657 | slab_state = PARTIAL; |
3741 | 3658 | ||
3742 | temp_kmem_cache = kmem_cache; | 3659 | create_boot_cache(kmem_cache, "kmem_cache", |
3743 | kmem_cache->name = "kmem_cache"; | 3660 | offsetof(struct kmem_cache, node) + |
3744 | kmem_cache->size = kmem_cache->object_size = kmem_size; | 3661 | nr_node_ids * sizeof(struct kmem_cache_node *), |
3745 | kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 3662 | SLAB_HWCACHE_ALIGN); |
3746 | 3663 | ||
3747 | kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | 3664 | kmem_cache = bootstrap(&boot_kmem_cache); |
3748 | memcpy(kmem_cache, temp_kmem_cache, kmem_size); | ||
3749 | 3665 | ||
3750 | /* | 3666 | /* |
3751 | * Allocate kmem_cache_node properly from the kmem_cache slab. | 3667 | * Allocate kmem_cache_node properly from the kmem_cache slab. |
3752 | * kmem_cache_node is separately allocated so no need to | 3668 | * kmem_cache_node is separately allocated so no need to |
3753 | * update any list pointers. | 3669 | * update any list pointers. |
3754 | */ | 3670 | */ |
3755 | temp_kmem_cache_node = kmem_cache_node; | 3671 | kmem_cache_node = bootstrap(&boot_kmem_cache_node); |
3756 | |||
3757 | kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | ||
3758 | memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); | ||
3759 | |||
3760 | kmem_cache_bootstrap_fixup(kmem_cache_node); | ||
3761 | |||
3762 | caches++; | ||
3763 | kmem_cache_bootstrap_fixup(kmem_cache); | ||
3764 | caches++; | ||
3765 | /* Free temporary boot structure */ | ||
3766 | free_pages((unsigned long)temp_kmem_cache, order); | ||
3767 | 3672 | ||
3768 | /* Now we can use the kmem_cache to allocate kmalloc slabs */ | 3673 | /* Now we can use the kmem_cache to allocate kmalloc slabs */ |
3769 | 3674 | ||
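
kmem_cache_init() now follows a two-phase pattern: the two core caches are first built inside static __initdata structures, and once the allocator can serve requests, bootstrap() copies them into properly allocated objects and patches the per-page slab_cache back-pointers. The generic shape of that chicken-and-egg bootstrap, stripped of all slab details and purely illustrative, is:

    /* Generic "static copy first, then self-hosted" bootstrap. Illustrative only. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct allocator {
        const char *name;
        size_t object_size;
    };

    /* Pretend this is "allocate one object from allocator a". */
    static void *alloc_from(struct allocator *a)
    {
        return malloc(a->object_size);
    }

    int main(void)
    {
        /* Phase 1: a static boot copy, usable before any allocation works. */
        struct allocator boot = { "allocator", sizeof(struct allocator) };
        struct allocator *real;

        /* Phase 2: allocate the real structure from itself and copy the boot
         * state over; from here on the static copy is never used again. */
        real = alloc_from(&boot);
        if (!real)
            return 1;
        memcpy(real, &boot, sizeof(*real));

        printf("'%s' is now self-hosted at %p\n", real->name, (void *)real);
        free(real);
        return 0;
    }
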
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3891 | return 0; | 3796 | return 0; |
3892 | } | 3797 | } |
3893 | 3798 | ||
3894 | static struct kmem_cache *find_mergeable(size_t size, | 3799 | static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, |
3895 | size_t align, unsigned long flags, const char *name, | 3800 | size_t align, unsigned long flags, const char *name, |
3896 | void (*ctor)(void *)) | 3801 | void (*ctor)(void *)) |
3897 | { | 3802 | { |
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3927 | if (s->size - size >= sizeof(void *)) | 3832 | if (s->size - size >= sizeof(void *)) |
3928 | continue; | 3833 | continue; |
3929 | 3834 | ||
3835 | if (!cache_match_memcg(s, memcg)) | ||
3836 | continue; | ||
3837 | |||
3930 | return s; | 3838 | return s; |
3931 | } | 3839 | } |
3932 | return NULL; | 3840 | return NULL; |
3933 | } | 3841 | } |
3934 | 3842 | ||
3935 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 3843 | struct kmem_cache * |
3936 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3844 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
3845 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
3937 | { | 3846 | { |
3938 | struct kmem_cache *s; | 3847 | struct kmem_cache *s; |
3939 | 3848 | ||
3940 | s = find_mergeable(size, align, flags, name, ctor); | 3849 | s = find_mergeable(memcg, size, align, flags, name, ctor); |
3941 | if (s) { | 3850 | if (s) { |
3942 | s->refcount++; | 3851 | s->refcount++; |
3943 | /* | 3852 | /* |
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) | |||
3964 | if (err) | 3873 | if (err) |
3965 | return err; | 3874 | return err; |
3966 | 3875 | ||
3876 | /* Mutex is not taken during early boot */ | ||
3877 | if (slab_state <= UP) | ||
3878 | return 0; | ||
3879 | |||
3880 | memcg_propagate_slab_attrs(s); | ||
3967 | mutex_unlock(&slab_mutex); | 3881 | mutex_unlock(&slab_mutex); |
3968 | err = sysfs_slab_add(s); | 3882 | err = sysfs_slab_add(s); |
3969 | mutex_lock(&slab_mutex); | 3883 | mutex_lock(&slab_mutex); |
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
5197 | return -EIO; | 5111 | return -EIO; |
5198 | 5112 | ||
5199 | err = attribute->store(s, buf, len); | 5113 | err = attribute->store(s, buf, len); |
5114 | #ifdef CONFIG_MEMCG_KMEM | ||
5115 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { | ||
5116 | int i; | ||
5117 | |||
5118 | mutex_lock(&slab_mutex); | ||
5119 | if (s->max_attr_size < len) | ||
5120 | s->max_attr_size = len; | ||
5200 | 5121 | ||
5122 | /* | ||
5123 | * This is a best effort propagation, so this function's return | ||
5124 | * value will be determined by the parent cache only. This is | ||
5125 | * basically because not all attributes will have a well | ||
5126 | * defined semantics for rollbacks - most of the actions will | ||
5127 | * have permanent effects. | ||
5128 | * | ||
5129 | * Returning the error value of any of the children that fail | ||
5130 | * is not 100% well defined, in the sense that users seeing the | ||
5131 | * error code won't be able to know anything about the state of | ||
5132 | * the cache. | ||
5133 | * | ||
5134 | * Only returning the error code for the parent cache at least | ||
5135 | * has well defined semantics. The cache being written to | ||
5136 | * directly either failed or succeeded, in which case we loop | ||
5137 | * through the descendants with best-effort propagation. | ||
5138 | */ | ||
5139 | for_each_memcg_cache_index(i) { | ||
5140 | struct kmem_cache *c = cache_from_memcg(s, i); | ||
5141 | if (c) | ||
5142 | attribute->store(c, buf, len); | ||
5143 | } | ||
5144 | mutex_unlock(&slab_mutex); | ||
5145 | } | ||
5146 | #endif | ||
5201 | return err; | 5147 | return err; |
5202 | } | 5148 | } |
5203 | 5149 | ||
5150 | static void memcg_propagate_slab_attrs(struct kmem_cache *s) | ||
5151 | { | ||
5152 | #ifdef CONFIG_MEMCG_KMEM | ||
5153 | int i; | ||
5154 | char *buffer = NULL; | ||
5155 | |||
5156 | if (!is_root_cache(s)) | ||
5157 | return; | ||
5158 | |||
5159 | /* | ||
5160 | * This mean this cache had no attribute written. Therefore, no point | ||
5161 | * in copying default values around | ||
5162 | */ | ||
5163 | if (!s->max_attr_size) | ||
5164 | return; | ||
5165 | |||
5166 | for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { | ||
5167 | char mbuf[64]; | ||
5168 | char *buf; | ||
5169 | struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); | ||
5170 | |||
5171 | if (!attr || !attr->store || !attr->show) | ||
5172 | continue; | ||
5173 | |||
5174 | /* | ||
5175 | * It is really bad that we have to allocate here, so we will | ||
5176 | * do it only as a fallback. If we actually allocate, though, | ||
5177 | * we can just use the allocated buffer until the end. | ||
5178 | * | ||
5179 | * Most of the slub attributes will tend to be very small in | ||
5180 | * size, but sysfs allows buffers up to a page, so they can | ||
5181 | * theoretically happen. | ||
5182 | */ | ||
5183 | if (buffer) | ||
5184 | buf = buffer; | ||
5185 | else if (s->max_attr_size < ARRAY_SIZE(mbuf)) | ||
5186 | buf = mbuf; | ||
5187 | else { | ||
5188 | buffer = (char *) get_zeroed_page(GFP_KERNEL); | ||
5189 | if (WARN_ON(!buffer)) | ||
5190 | continue; | ||
5191 | buf = buffer; | ||
5192 | } | ||
5193 | |||
5194 | attr->show(s->memcg_params->root_cache, buf); | ||
5195 | attr->store(s, buf, strlen(buf)); | ||
5196 | } | ||
5197 | |||
5198 | if (buffer) | ||
5199 | free_page((unsigned long)buffer); | ||
5200 | #endif | ||
5201 | } | ||
5202 | |||
5204 | static const struct sysfs_ops slab_sysfs_ops = { | 5203 | static const struct sysfs_ops slab_sysfs_ops = { |
5205 | .show = slab_attr_show, | 5204 | .show = slab_attr_show, |
5206 | .store = slab_attr_store, | 5205 | .store = slab_attr_store, |
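
memcg_propagate_slab_attrs() above uses a classic buffer strategy: a small on-stack buffer covers the common case of short attribute values, and only values larger than that fall back to allocating a whole page. A generic sketch of the idiom (nothing here is slab-specific):

    /* Small stack buffer with heap fallback, as in memcg_propagate_slab_attrs(). */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void handle_value(const char *value)
    {
        char mbuf[64];        /* covers the common, short case */
        char *heap = NULL;
        char *buf;
        size_t len = strlen(value) + 1;

        if (len <= sizeof(mbuf)) {
            buf = mbuf;
        } else {
            heap = malloc(len);    /* fallback for the rare large value */
            if (!heap)
                return;
            buf = heap;
        }

        memcpy(buf, value, len);
        printf("using %s buffer for %zu bytes\n", heap ? "heap" : "stack", len);

        free(heap);    /* free(NULL) is a no-op */
    }

    int main(void)
    {
        handle_value("1");
        handle_value("a much longer attribute value that will not fit in a 64 byte stack buffer");
        return 0;
    }
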
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s) | |||
5257 | if (p != name + 1) | 5256 | if (p != name + 1) |
5258 | *p++ = '-'; | 5257 | *p++ = '-'; |
5259 | p += sprintf(p, "%07d", s->size); | 5258 | p += sprintf(p, "%07d", s->size); |
5259 | |||
5260 | #ifdef CONFIG_MEMCG_KMEM | ||
5261 | if (!is_root_cache(s)) | ||
5262 | p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); | ||
5263 | #endif | ||
5264 | |||
5260 | BUG_ON(p > name + ID_STR_LENGTH - 1); | 5265 | BUG_ON(p > name + ID_STR_LENGTH - 1); |
5261 | return name; | 5266 | return name; |
5262 | } | 5267 | } |
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5265 | { | 5270 | { |
5266 | int err; | 5271 | int err; |
5267 | const char *name; | 5272 | const char *name; |
5268 | int unmergeable; | 5273 | int unmergeable = slab_unmergeable(s); |
5269 | |||
5270 | if (slab_state < FULL) | ||
5271 | /* Defer until later */ | ||
5272 | return 0; | ||
5273 | 5274 | ||
5274 | unmergeable = slab_unmergeable(s); | ||
5275 | if (unmergeable) { | 5275 | if (unmergeable) { |
5276 | /* | 5276 | /* |
5277 | * Slabcache can never be merged so we can use the name proper. | 5277 | * Slabcache can never be merged so we can use the name proper. |
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init); | |||
5405 | * The /proc/slabinfo ABI | 5405 | * The /proc/slabinfo ABI |
5406 | */ | 5406 | */ |
5407 | #ifdef CONFIG_SLABINFO | 5407 | #ifdef CONFIG_SLABINFO |
5408 | static void print_slabinfo_header(struct seq_file *m) | 5408 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) |
5409 | { | ||
5410 | seq_puts(m, "slabinfo - version: 2.1\n"); | ||
5411 | seq_puts(m, "# name <active_objs> <num_objs> <object_size> " | ||
5412 | "<objperslab> <pagesperslab>"); | ||
5413 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | ||
5414 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
5415 | seq_putc(m, '\n'); | ||
5416 | } | ||
5417 | |||
5418 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
5419 | { | ||
5420 | loff_t n = *pos; | ||
5421 | |||
5422 | mutex_lock(&slab_mutex); | ||
5423 | if (!n) | ||
5424 | print_slabinfo_header(m); | ||
5425 | |||
5426 | return seq_list_start(&slab_caches, *pos); | ||
5427 | } | ||
5428 | |||
5429 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
5430 | { | ||
5431 | return seq_list_next(p, &slab_caches, pos); | ||
5432 | } | ||
5433 | |||
5434 | static void s_stop(struct seq_file *m, void *p) | ||
5435 | { | ||
5436 | mutex_unlock(&slab_mutex); | ||
5437 | } | ||
5438 | |||
5439 | static int s_show(struct seq_file *m, void *p) | ||
5440 | { | 5409 | { |
5441 | unsigned long nr_partials = 0; | 5410 | unsigned long nr_partials = 0; |
5442 | unsigned long nr_slabs = 0; | 5411 | unsigned long nr_slabs = 0; |
5443 | unsigned long nr_inuse = 0; | ||
5444 | unsigned long nr_objs = 0; | 5412 | unsigned long nr_objs = 0; |
5445 | unsigned long nr_free = 0; | 5413 | unsigned long nr_free = 0; |
5446 | struct kmem_cache *s; | ||
5447 | int node; | 5414 | int node; |
5448 | 5415 | ||
5449 | s = list_entry(p, struct kmem_cache, list); | ||
5450 | |||
5451 | for_each_online_node(node) { | 5416 | for_each_online_node(node) { |
5452 | struct kmem_cache_node *n = get_node(s, node); | 5417 | struct kmem_cache_node *n = get_node(s, node); |
5453 | 5418 | ||
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p) | |||
5460 | nr_free += count_partial(n, count_free); | 5425 | nr_free += count_partial(n, count_free); |
5461 | } | 5426 | } |
5462 | 5427 | ||
5463 | nr_inuse = nr_objs - nr_free; | 5428 | sinfo->active_objs = nr_objs - nr_free; |
5464 | 5429 | sinfo->num_objs = nr_objs; | |
5465 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, | 5430 | sinfo->active_slabs = nr_slabs; |
5466 | nr_objs, s->size, oo_objects(s->oo), | 5431 | sinfo->num_slabs = nr_slabs; |
5467 | (1 << oo_order(s->oo))); | 5432 | sinfo->objects_per_slab = oo_objects(s->oo); |
5468 | seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); | 5433 | sinfo->cache_order = oo_order(s->oo); |
5469 | seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, | ||
5470 | 0UL); | ||
5471 | seq_putc(m, '\n'); | ||
5472 | return 0; | ||
5473 | } | 5434 | } |
5474 | 5435 | ||
5475 | static const struct seq_operations slabinfo_op = { | 5436 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) |
5476 | .start = s_start, | ||
5477 | .next = s_next, | ||
5478 | .stop = s_stop, | ||
5479 | .show = s_show, | ||
5480 | }; | ||
5481 | |||
5482 | static int slabinfo_open(struct inode *inode, struct file *file) | ||
5483 | { | 5437 | { |
5484 | return seq_open(file, &slabinfo_op); | ||
5485 | } | 5438 | } |
5486 | 5439 | ||
5487 | static const struct file_operations proc_slabinfo_operations = { | 5440 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
5488 | .open = slabinfo_open, | 5441 | size_t count, loff_t *ppos) |
5489 | .read = seq_read, | ||
5490 | .llseek = seq_lseek, | ||
5491 | .release = seq_release, | ||
5492 | }; | ||
5493 | |||
5494 | static int __init slab_proc_init(void) | ||
5495 | { | 5442 | { |
5496 | proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); | 5443 | return -EIO; |
5497 | return 0; | ||
5498 | } | 5444 | } |
5499 | module_init(slab_proc_init); | ||
5500 | #endif /* CONFIG_SLABINFO */ | 5445 | #endif /* CONFIG_SLABINFO */ |
diff --git a/mm/sparse.c b/mm/sparse.c index a83de2f72b30..6b5fb762e2ca 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | |||
638 | got_map_page: | 638 | got_map_page: |
639 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); | 639 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); |
640 | got_map_ptr: | 640 | got_map_ptr: |
641 | memset(ret, 0, memmap_size); | ||
642 | 641 | ||
643 | return ret; | 642 | return ret; |
644 | } | 643 | } |
@@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
758 | goto out; | 757 | goto out; |
759 | } | 758 | } |
760 | 759 | ||
760 | memset(memmap, 0, sizeof(struct page) * nr_pages); | ||
761 | |||
761 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 762 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
762 | 763 | ||
763 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); | 764 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); |
@@ -771,6 +772,27 @@ out: | |||
771 | return ret; | 772 | return ret; |
772 | } | 773 | } |
773 | 774 | ||
775 | #ifdef CONFIG_MEMORY_FAILURE | ||
776 | static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | ||
777 | { | ||
778 | int i; | ||
779 | |||
780 | if (!memmap) | ||
781 | return; | ||
782 | |||
783 | for (i = 0; i < PAGES_PER_SECTION; i++) { | ||
784 | if (PageHWPoison(&memmap[i])) { | ||
785 | atomic_long_sub(1, &mce_bad_pages); | ||
786 | ClearPageHWPoison(&memmap[i]); | ||
787 | } | ||
788 | } | ||
789 | } | ||
790 | #else | ||
791 | static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | ||
792 | { | ||
793 | } | ||
794 | #endif | ||
795 | |||
774 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 796 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
775 | { | 797 | { |
776 | struct page *memmap = NULL; | 798 | struct page *memmap = NULL; |
@@ -784,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
784 | ms->pageblock_flags = NULL; | 806 | ms->pageblock_flags = NULL; |
785 | } | 807 | } |
786 | 808 | ||
809 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | ||
787 | free_section_usemap(memmap, usemap); | 810 | free_section_usemap(memmap, usemap); |
788 | } | 811 | } |
789 | #endif | 812 | #endif |
diff --git a/mm/swapfile.c b/mm/swapfile.c index f91a25547ffe..e97a0e5aea91 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1443 | return generic_swapfile_activate(sis, swap_file, span); | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1444 | } | 1444 | } |
1445 | 1445 | ||
1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void _enable_swap_info(struct swap_info_struct *p, int prio, |
1447 | unsigned char *swap_map, | 1447 | unsigned char *swap_map, |
1448 | unsigned long *frontswap_map) | 1448 | unsigned long *frontswap_map) |
1449 | { | 1449 | { |
1450 | int i, prev; | 1450 | int i, prev; |
1451 | 1451 | ||
1452 | spin_lock(&swap_lock); | ||
1453 | if (prio >= 0) | 1452 | if (prio >= 0) |
1454 | p->prio = prio; | 1453 | p->prio = prio; |
1455 | else | 1454 | else |
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1472 | swap_list.head = swap_list.next = p->type; | 1471 | swap_list.head = swap_list.next = p->type; |
1473 | else | 1472 | else |
1474 | swap_info[prev]->next = p->type; | 1473 | swap_info[prev]->next = p->type; |
1474 | } | ||
1475 | |||
1476 | static void enable_swap_info(struct swap_info_struct *p, int prio, | ||
1477 | unsigned char *swap_map, | ||
1478 | unsigned long *frontswap_map) | ||
1479 | { | ||
1480 | spin_lock(&swap_lock); | ||
1481 | _enable_swap_info(p, prio, swap_map, frontswap_map); | ||
1475 | frontswap_init(p->type); | 1482 | frontswap_init(p->type); |
1476 | spin_unlock(&swap_lock); | 1483 | spin_unlock(&swap_lock); |
1477 | } | 1484 | } |
1478 | 1485 | ||
1486 | static void reinsert_swap_info(struct swap_info_struct *p) | ||
1487 | { | ||
1488 | spin_lock(&swap_lock); | ||
1489 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | ||
1490 | spin_unlock(&swap_lock); | ||
1491 | } | ||
1492 | |||
1479 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1493 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
1480 | { | 1494 | { |
1481 | struct swap_info_struct *p = NULL; | 1495 | struct swap_info_struct *p = NULL; |
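
The refactor above pulls the body of enable_swap_info() into a helper that assumes swap_lock is already held, so that swapoff's error path can re-insert the swap area under the same lock without re-running frontswap_init(). The general shape of that locked-wrapper/unlocked-helper split, sketched with pthreads purely for illustration:

    /* Locked-wrapper / unlocked-helper split, illustrated with pthreads. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int enabled_count;

    /* Caller must hold 'lock'; this mirrors _enable_swap_info(). */
    static void _enable(int prio)
    {
        enabled_count++;
        printf("enabled with prio %d (count %d)\n", prio, enabled_count);
    }

    /* First-time enable: common work plus the one-off initialisation that
     * must not be repeated (frontswap_init() in the patch). */
    static void enable(int prio)
    {
        pthread_mutex_lock(&lock);
        _enable(prio);
        printf("one-time init\n");
        pthread_mutex_unlock(&lock);
    }

    /* Error-path re-insert: same common work, no one-time init. */
    static void reinsert(int prio)
    {
        pthread_mutex_lock(&lock);
        _enable(prio);
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        enable(10);
        reinsert(10);
        return 0;
    }
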
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1484 | struct address_space *mapping; | 1498 | struct address_space *mapping; |
1485 | struct inode *inode; | 1499 | struct inode *inode; |
1486 | struct filename *pathname; | 1500 | struct filename *pathname; |
1487 | int oom_score_adj; | ||
1488 | int i, type, prev; | 1501 | int i, type, prev; |
1489 | int err; | 1502 | int err; |
1490 | 1503 | ||
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1543 | p->flags &= ~SWP_WRITEOK; | 1556 | p->flags &= ~SWP_WRITEOK; |
1544 | spin_unlock(&swap_lock); | 1557 | spin_unlock(&swap_lock); |
1545 | 1558 | ||
1546 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1559 | set_current_oom_origin(); |
1547 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1560 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
1548 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1561 | clear_current_oom_origin(); |
1549 | 1562 | ||
1550 | if (err) { | 1563 | if (err) { |
1551 | /* | ||
1552 | * reading p->prio and p->swap_map outside the lock is | ||
1553 | * safe here because only sys_swapon and sys_swapoff | ||
1554 | * change them, and there can be no other sys_swapon or | ||
1555 | * sys_swapoff for this swap_info_struct at this point. | ||
1556 | */ | ||
1557 | /* re-insert swap space back into swap_list */ | 1564 | /* re-insert swap space back into swap_list */ |
1558 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | 1565 | reinsert_swap_info(p); |
1559 | goto out_dput; | 1566 | goto out_dput; |
1560 | } | 1567 | } |
1561 | 1568 | ||
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc); | |||
152 | * | 152 | * |
153 | * The contents of the object pointed to are preserved up to the | 153 | * The contents of the object pointed to are preserved up to the |
154 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | 154 | * lesser of the new and old sizes. If @p is %NULL, krealloc() |
155 | * behaves exactly like kmalloc(). If @size is 0 and @p is not a | 155 | * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a |
156 | * %NULL pointer, the object pointed to is freed. | 156 | * %NULL pointer, the object pointed to is freed. |
157 | */ | 157 | */ |
158 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 158 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
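
The corrected kerneldoc describes semantics analogous to user-space realloc(): on failure the old buffer is left untouched, so callers must not overwrite their only pointer with the return value. A typical usage sketch (struct my_ctx and grow_table() are hypothetical, shown only to illustrate the contract):

    /* Hypothetical caller illustrating the krealloc() contract. */
    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct my_ctx {
        u32 *table;
        size_t nr_entries;
    };

    static int grow_table(struct my_ctx *ctx, size_t new_entries)
    {
        u32 *tmp;

        /* Keep ctx->table valid until the reallocation is known to succeed. */
        tmp = krealloc(ctx->table, new_entries * sizeof(*tmp), GFP_KERNEL);
        if (!tmp)
            return -ENOMEM;    /* on failure the old buffer is untouched */

        ctx->table = tmp;
        ctx->nr_entries = new_entries;
        return 0;
    }
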
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78e08300db21..5123a169ab7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p) | |||
2550 | 2550 | ||
2551 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) | 2551 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) |
2552 | { | 2552 | { |
2553 | if (NUMA_BUILD) { | 2553 | if (IS_ENABLED(CONFIG_NUMA)) { |
2554 | unsigned int nr, *counters = m->private; | 2554 | unsigned int nr, *counters = m->private; |
2555 | 2555 | ||
2556 | if (!counters) | 2556 | if (!counters) |
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file) | |||
2615 | unsigned int *ptr = NULL; | 2615 | unsigned int *ptr = NULL; |
2616 | int ret; | 2616 | int ret; |
2617 | 2617 | ||
2618 | if (NUMA_BUILD) { | 2618 | if (IS_ENABLED(CONFIG_NUMA)) { |
2619 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | 2619 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); |
2620 | if (ptr == NULL) | 2620 | if (ptr == NULL) |
2621 | return -ENOMEM; | 2621 | return -ENOMEM; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index b7ed37675644..adc7e9058181 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) | |||
1177 | } | 1177 | } |
1178 | 1178 | ||
1179 | /* | 1179 | /* |
1180 | * Are there way too many processes in the direct reclaim path already? | 1180 | * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and |
1181 | * then get rescheduled. When there is a massive number of tasks doing page | ||
1182 | * allocation, such sleeping direct reclaimers may keep piling up on each CPU, | ||
1183 | * the LRU list will shrink and be scanned faster than necessary, leading to | ||
1184 | * unnecessary swapping, thrashing and OOM. | ||
1181 | */ | 1185 | */ |
1182 | static int too_many_isolated(struct zone *zone, int file, | 1186 | static int too_many_isolated(struct zone *zone, int file, |
1183 | struct scan_control *sc) | 1187 | struct scan_control *sc) |
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1198 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); | 1202 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); |
1199 | } | 1203 | } |
1200 | 1204 | ||
1205 | /* | ||
1206 | * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so that | ||
1207 | * they do not get blocked behind normal direct reclaimers and end up in | ||
1208 | * a circular deadlock. | ||
1209 | */ | ||
1210 | if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) | ||
1211 | inactive >>= 3; | ||
1212 | |||
1201 | return isolated > inactive; | 1213 | return isolated > inactive; |
1202 | } | 1214 | } |
1203 | 1215 | ||
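To make the new check above easier to read: callers whose gfp_mask allows both I/O and filesystem re-entry are held to an eightfold tighter isolation limit (inactive is shifted down by 3), which leaves GFP_NOIO/GFP_NOFS reclaimers room to proceed without being throttled behind them. A hedged helper restating the test; the helper name is illustrative, GFP_IOFS is the real __GFP_IO | __GFP_FS shorthand in this tree:

#include <linux/gfp.h>
#include <linux/types.h>

static bool throttled_on_isolation_early(gfp_t gfp_mask)
{
	/* true for GFP_KERNEL-style callers; false for GFP_NOIO/GFP_NOFS */
	return (gfp_mask & GFP_IOFS) == GFP_IOFS;
}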
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1679 | 1691 | ||
1680 | if (global_reclaim(sc)) { | 1692 | if (global_reclaim(sc)) { |
1681 | free = zone_page_state(zone, NR_FREE_PAGES); | 1693 | free = zone_page_state(zone, NR_FREE_PAGES); |
1682 | /* If we have very few page cache pages, | ||
1683 | force-scan anon pages. */ | ||
1684 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1694 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1695 | /* | ||
1696 | * If we have very few page cache pages, force-scan | ||
1697 | * anon pages. | ||
1698 | */ | ||
1685 | fraction[0] = 1; | 1699 | fraction[0] = 1; |
1686 | fraction[1] = 0; | 1700 | fraction[1] = 0; |
1687 | denominator = 1; | 1701 | denominator = 1; |
1688 | goto out; | 1702 | goto out; |
1703 | } else if (!inactive_file_is_low_global(zone)) { | ||
1704 | /* | ||
1705 | * There is enough inactive page cache; do not | ||
1706 | * reclaim anything from the working set right now. | ||
1707 | */ | ||
1708 | fraction[0] = 0; | ||
1709 | fraction[1] = 1; | ||
1710 | denominator = 1; | ||
1711 | goto out; | ||
1689 | } | 1712 | } |
1690 | } | 1713 | } |
1691 | 1714 | ||
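The two early exits above encode their decision in the fraction[]/denominator pair that the rest of get_scan_count() applies to each LRU: fraction[0] weights anon, fraction[1] weights file. A hedged sketch of how such a pair becomes a per-LRU scan target; the helper name is illustrative, div64_u64() is the real helper get_scan_count() uses:

#include <linux/math64.h>

static u64 scaled_scan_target(u64 scan, u64 frac, u64 denominator)
{
	/* {1,0}/1 scans only anon; {0,1}/1 leaves the anon working set alone */
	return div64_u64(scan * frac, denominator);
}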
@@ -1752,7 +1775,7 @@ out: | |||
1752 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | 1775 | /* Use reclaim/compaction for costly allocs or under memory pressure */ |
1753 | static bool in_reclaim_compaction(struct scan_control *sc) | 1776 | static bool in_reclaim_compaction(struct scan_control *sc) |
1754 | { | 1777 | { |
1755 | if (COMPACTION_BUILD && sc->order && | 1778 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && |
1756 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || | 1779 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || |
1757 | sc->priority < DEF_PRIORITY - 2)) | 1780 | sc->priority < DEF_PRIORITY - 2)) |
1758 | return true; | 1781 | return true; |
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2005 | if (zone->all_unreclaimable && | 2028 | if (zone->all_unreclaimable && |
2006 | sc->priority != DEF_PRIORITY) | 2029 | sc->priority != DEF_PRIORITY) |
2007 | continue; /* Let kswapd poll it */ | 2030 | continue; /* Let kswapd poll it */ |
2008 | if (COMPACTION_BUILD) { | 2031 | if (IS_ENABLED(CONFIG_COMPACTION)) { |
2009 | /* | 2032 | /* |
2010 | * If we already have plenty of memory free for | 2033 | * If we already have plenty of memory free for |
2011 | * compaction in this zone, don't free any more. | 2034 | * compaction in this zone, don't free any more. |
@@ -2421,7 +2444,8 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2421 | balance_gap, classzone_idx, 0)) | 2444 | balance_gap, classzone_idx, 0)) |
2422 | return false; | 2445 | return false; |
2423 | 2446 | ||
2424 | if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) | 2447 | if (IS_ENABLED(CONFIG_COMPACTION) && order && |
2448 | !compaction_suitable(zone, order)) | ||
2425 | return false; | 2449 | return false; |
2426 | 2450 | ||
2427 | return true; | 2451 | return true; |
@@ -2546,7 +2570,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2546 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2570 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2547 | int *classzone_idx) | 2571 | int *classzone_idx) |
2548 | { | 2572 | { |
2549 | int all_zones_ok; | 2573 | struct zone *unbalanced_zone; |
2550 | unsigned long balanced; | 2574 | unsigned long balanced; |
2551 | int i; | 2575 | int i; |
2552 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2576 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
@@ -2580,7 +2604,7 @@ loop_again: | |||
2580 | unsigned long lru_pages = 0; | 2604 | unsigned long lru_pages = 0; |
2581 | int has_under_min_watermark_zone = 0; | 2605 | int has_under_min_watermark_zone = 0; |
2582 | 2606 | ||
2583 | all_zones_ok = 1; | 2607 | unbalanced_zone = NULL; |
2584 | balanced = 0; | 2608 | balanced = 0; |
2585 | 2609 | ||
2586 | /* | 2610 | /* |
@@ -2684,7 +2708,7 @@ loop_again: | |||
2684 | * Do not reclaim more than needed for compaction. | 2708 | * Do not reclaim more than needed for compaction. |
2685 | */ | 2709 | */ |
2686 | testorder = order; | 2710 | testorder = order; |
2687 | if (COMPACTION_BUILD && order && | 2711 | if (IS_ENABLED(CONFIG_COMPACTION) && order && |
2688 | compaction_suitable(zone, order) != | 2712 | compaction_suitable(zone, order) != |
2689 | COMPACT_SKIPPED) | 2713 | COMPACT_SKIPPED) |
2690 | testorder = 0; | 2714 | testorder = 0; |
@@ -2719,7 +2743,7 @@ loop_again: | |||
2719 | } | 2743 | } |
2720 | 2744 | ||
2721 | if (!zone_balanced(zone, testorder, 0, end_zone)) { | 2745 | if (!zone_balanced(zone, testorder, 0, end_zone)) { |
2722 | all_zones_ok = 0; | 2746 | unbalanced_zone = zone; |
2723 | /* | 2747 | /* |
2724 | * We are still under min water mark. This | 2748 | * We are still under min water mark. This |
2725 | * means that we have a GFP_ATOMIC allocation | 2749 | * means that we have a GFP_ATOMIC allocation |
@@ -2752,7 +2776,7 @@ loop_again: | |||
2752 | pfmemalloc_watermark_ok(pgdat)) | 2776 | pfmemalloc_watermark_ok(pgdat)) |
2753 | wake_up(&pgdat->pfmemalloc_wait); | 2777 | wake_up(&pgdat->pfmemalloc_wait); |
2754 | 2778 | ||
2755 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2779 | if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2756 | break; /* kswapd: all done */ | 2780 | break; /* kswapd: all done */ |
2757 | /* | 2781 | /* |
2758 | * OK, kswapd is getting into trouble. Take a nap, then take | 2782 | * OK, kswapd is getting into trouble. Take a nap, then take |
@@ -2762,7 +2786,7 @@ loop_again: | |||
2762 | if (has_under_min_watermark_zone) | 2786 | if (has_under_min_watermark_zone) |
2763 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | 2787 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); |
2764 | else | 2788 | else |
2765 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2789 | wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); |
2766 | } | 2790 | } |
2767 | 2791 | ||
2768 | /* | 2792 | /* |
@@ -2781,7 +2805,7 @@ out: | |||
2781 | * high-order: Balanced zones must make up at least 25% of the node | 2805 | * high-order: Balanced zones must make up at least 25% of the node |
2782 | * for the node to be balanced | 2806 | * for the node to be balanced |
2783 | */ | 2807 | */ |
2784 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | 2808 | if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) { |
2785 | cond_resched(); | 2809 | cond_resched(); |
2786 | 2810 | ||
2787 | try_to_freeze(); | 2811 | try_to_freeze(); |
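The balance_pgdat() hunks above replace the all_zones_ok flag with a pointer to the zone that failed the balance check, which is what lets kswapd back off with wait_iff_congested() on that specific zone instead of an unconditional congestion_wait(). A simplified, hedged sketch of the pointer-instead-of-flag pattern; zone_watermark_ok() stands in for the file's static zone_balanced() helper, and the function name is illustrative:

#include <linux/backing-dev.h>
#include <linux/jiffies.h>
#include <linux/mmzone.h>

static void kswapd_backoff_sketch(struct zone **zones, int nr_zones, int order,
				  int classzone_idx)
{
	struct zone *unbalanced_zone = NULL;
	int i;

	for (i = 0; i < nr_zones; i++)
		if (!zone_watermark_ok(zones[i], order,
				       high_wmark_pages(zones[i]),
				       classzone_idx, 0))
			unbalanced_zone = zones[i];	/* remember which zone is short */

	if (unbalanced_zone)
		/* sleeps only if that zone is actually marked congested */
		wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ / 10);
}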
@@ -2951,7 +2975,7 @@ static int kswapd(void *p) | |||
2951 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2975 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2952 | balanced_classzone_idx = classzone_idx; | 2976 | balanced_classzone_idx = classzone_idx; |
2953 | for ( ; ; ) { | 2977 | for ( ; ; ) { |
2954 | int ret; | 2978 | bool ret; |
2955 | 2979 | ||
2956 | /* | 2980 | /* |
2957 | * If the last balance_pgdat was unsuccessful it's unlikely a | 2981 | * If the last balance_pgdat was unsuccessful it's unlikely a |
@@ -3119,7 +3143,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
3119 | int nid; | 3143 | int nid; |
3120 | 3144 | ||
3121 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 3145 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
3122 | for_each_node_state(nid, N_HIGH_MEMORY) { | 3146 | for_each_node_state(nid, N_MEMORY) { |
3123 | pg_data_t *pgdat = NODE_DATA(nid); | 3147 | pg_data_t *pgdat = NODE_DATA(nid); |
3124 | const struct cpumask *mask; | 3148 | const struct cpumask *mask; |
3125 | 3149 | ||
@@ -3175,7 +3199,7 @@ static int __init kswapd_init(void) | |||
3175 | int nid; | 3199 | int nid; |
3176 | 3200 | ||
3177 | swap_setup(); | 3201 | swap_setup(); |
3178 | for_each_node_state(nid, N_HIGH_MEMORY) | 3202 | for_each_node_state(nid, N_MEMORY) |
3179 | kswapd_run(nid); | 3203 | kswapd_run(nid); |
3180 | hotcpu_notifier(cpu_callback, 0); | 3204 | hotcpu_notifier(cpu_callback, 0); |
3181 | return 0; | 3205 | return 0; |
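The N_HIGH_MEMORY to N_MEMORY switches in the CPU-hotplug callback and kswapd_init() are part of the wider series that makes callers iterate every node that has any memory at all, including nodes whose memory sits entirely in ZONE_MOVABLE, while still skipping memoryless nodes. A hedged sketch of the iterator idiom, with an illustrative body:

#include <linux/nodemask.h>
#include <linux/printk.h>

static void walk_memory_nodes(void)
{
	int nid;

	/* visits nodes with normal, high or movable memory; skips memoryless nodes */
	for_each_node_state(nid, N_MEMORY)
		pr_info("node %d contributes memory\n", nid);
}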
diff --git a/mm/vmstat.c b/mm/vmstat.c index c7370579111b..9800306c8195 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = { | |||
774 | 774 | ||
775 | "pgrotated", | 775 | "pgrotated", |
776 | 776 | ||
777 | #ifdef CONFIG_NUMA_BALANCING | ||
778 | "numa_pte_updates", | ||
779 | "numa_hint_faults", | ||
780 | "numa_hint_faults_local", | ||
781 | "numa_pages_migrated", | ||
782 | #endif | ||
783 | #ifdef CONFIG_MIGRATION | ||
784 | "pgmigrate_success", | ||
785 | "pgmigrate_fail", | ||
786 | #endif | ||
777 | #ifdef CONFIG_COMPACTION | 787 | #ifdef CONFIG_COMPACTION |
778 | "compact_blocks_moved", | 788 | "compact_migrate_scanned", |
779 | "compact_pages_moved", | 789 | "compact_free_scanned", |
780 | "compact_pagemigrate_failed", | 790 | "compact_isolated", |
781 | "compact_stall", | 791 | "compact_stall", |
782 | "compact_fail", | 792 | "compact_fail", |
783 | "compact_success", | 793 | "compact_success", |
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = { | |||
801 | "thp_collapse_alloc", | 811 | "thp_collapse_alloc", |
802 | "thp_collapse_alloc_failed", | 812 | "thp_collapse_alloc_failed", |
803 | "thp_split", | 813 | "thp_split", |
814 | "thp_zero_page_alloc", | ||
815 | "thp_zero_page_alloc_failed", | ||
804 | #endif | 816 | #endif |
805 | 817 | ||
806 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 818 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
930 | pg_data_t *pgdat = (pg_data_t *)arg; | 942 | pg_data_t *pgdat = (pg_data_t *)arg; |
931 | 943 | ||
932 | /* check memoryless node */ | 944 | /* check memoryless node */ |
933 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 945 | if (!node_state(pgdat->node_id, N_MEMORY)) |
934 | return 0; | 946 | return 0; |
935 | 947 | ||
936 | seq_printf(m, "Page block order: %d\n", pageblock_order); | 948 | seq_printf(m, "Page block order: %d\n", pageblock_order); |
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
992 | "\n high %lu" | 1004 | "\n high %lu" |
993 | "\n scanned %lu" | 1005 | "\n scanned %lu" |
994 | "\n spanned %lu" | 1006 | "\n spanned %lu" |
995 | "\n present %lu", | 1007 | "\n present %lu" |
1008 | "\n managed %lu", | ||
996 | zone_page_state(zone, NR_FREE_PAGES), | 1009 | zone_page_state(zone, NR_FREE_PAGES), |
997 | min_wmark_pages(zone), | 1010 | min_wmark_pages(zone), |
998 | low_wmark_pages(zone), | 1011 | low_wmark_pages(zone), |
999 | high_wmark_pages(zone), | 1012 | high_wmark_pages(zone), |
1000 | zone->pages_scanned, | 1013 | zone->pages_scanned, |
1001 | zone->spanned_pages, | 1014 | zone->spanned_pages, |
1002 | zone->present_pages); | 1015 | zone->present_pages, |
1016 | zone->managed_pages); | ||
1003 | 1017 | ||
1004 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 1018 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
1005 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 1019 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
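The extra "managed" field reported above distinguishes the pages a zone hands to the buddy allocator from the pages it merely spans or has physically present. A hedged dump helper showing the three struct zone fields involved; the helper name is illustrative, the fields are the real ones in this tree:

#include <linux/mmzone.h>
#include <linux/printk.h>

static void dump_zone_sizes(struct zone *zone)
{
	pr_info("%s: spanned=%lu present=%lu managed=%lu\n",
		zone->name,
		zone->spanned_pages,	/* pages covered by the zone's PFN range, holes included */
		zone->present_pages,	/* physical pages actually present */
		zone->managed_pages);	/* pages managed by the buddy allocator */
}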
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg) | |||
1292 | pg_data_t *pgdat = (pg_data_t *)arg; | 1306 | pg_data_t *pgdat = (pg_data_t *)arg; |
1293 | 1307 | ||
1294 | /* check memoryless node */ | 1308 | /* check memoryless node */ |
1295 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 1309 | if (!node_state(pgdat->node_id, N_MEMORY)) |
1296 | return 0; | 1310 | return 0; |
1297 | 1311 | ||
1298 | walk_zones_in_node(m, pgdat, unusable_show_print); | 1312 | walk_zones_in_node(m, pgdat, unusable_show_print); |