Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 10
-rw-r--r--  mm/Makefile | 5
-rw-r--r--  mm/backing-dev.c | 6
-rw-r--r--  mm/balloon_compaction.c | 123
-rw-r--r--  mm/bootmem.c | 4
-rw-r--r--  mm/cma.c | 21
-rw-r--r--  mm/compaction.c | 674
-rw-r--r--  mm/debug.c | 237
-rw-r--r--  mm/dmapool.c | 58
-rw-r--r--  mm/filemap.c | 27
-rw-r--r--  mm/gup.c | 358
-rw-r--r--  mm/huge_memory.c | 35
-rw-r--r--  mm/hugetlb.c | 14
-rw-r--r--  mm/hugetlb_cgroup.c | 2
-rw-r--r--  mm/internal.h | 26
-rw-r--r--  mm/interval_tree.c | 2
-rw-r--r--  mm/iov_iter.c | 14
-rw-r--r--  mm/kmemcheck.c | 1
-rw-r--r--  mm/ksm.c | 4
-rw-r--r--  mm/memblock.c | 7
-rw-r--r--  mm/memcontrol.c | 421
-rw-r--r--  mm/memory-failure.c | 2
-rw-r--r--  mm/memory.c | 11
-rw-r--r--  mm/memory_hotplug.c | 2
-rw-r--r--  mm/mempolicy.c | 134
-rw-r--r--  mm/migrate.c | 21
-rw-r--r--  mm/mlock.c | 6
-rw-r--r--  mm/mmap.c | 84
-rw-r--r--  mm/mmu_notifier.c | 5
-rw-r--r--  mm/mremap.c | 5
-rw-r--r--  mm/nobootmem.c | 2
-rw-r--r--  mm/nommu.c | 2
-rw-r--r--  mm/oom_kill.c | 6
-rw-r--r--  mm/page-writeback.c | 10
-rw-r--r--  mm/page_alloc.c | 355
-rw-r--r--  mm/pagewalk.c | 2
-rw-r--r--  mm/percpu-km.c | 16
-rw-r--r--  mm/percpu-vm.c | 184
-rw-r--r--  mm/percpu.c | 524
-rw-r--r--  mm/pgtable-generic.c | 2
-rw-r--r--  mm/rmap.c | 14
-rw-r--r--  mm/shmem.c | 8
-rw-r--r--  mm/slab.c | 364
-rw-r--r--  mm/slab.h | 57
-rw-r--r--  mm/slab_common.c | 178
-rw-r--r--  mm/slob.c | 2
-rw-r--r--  mm/slub.c | 126
-rw-r--r--  mm/swap.c | 30
-rw-r--r--  mm/swap_state.c | 16
-rw-r--r--  mm/util.c | 23
-rw-r--r--  mm/vmalloc.c | 20
-rw-r--r--  mm/vmscan.c | 112
-rw-r--r--  mm/vmstat.c | 153
-rw-r--r--  mm/zbud.c | 14
-rw-r--r--  mm/zpool.c | 2
-rw-r--r--  mm/zsmalloc.c | 47
56 files changed, 2793 insertions, 1795 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..1d1ae6b078fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP | |||
137 | config HAVE_MEMBLOCK_PHYS_MAP | 137 | config HAVE_MEMBLOCK_PHYS_MAP |
138 | boolean | 138 | boolean |
139 | 139 | ||
140 | config HAVE_GENERIC_RCU_GUP | ||
141 | boolean | ||
142 | |||
140 | config ARCH_DISCARD_MEMBLOCK | 143 | config ARCH_DISCARD_MEMBLOCK |
141 | boolean | 144 | boolean |
142 | 145 | ||
@@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK | |||
228 | boolean | 231 | boolean |
229 | 232 | ||
230 | # | 233 | # |
234 | # support for memory balloon | ||
235 | config MEMORY_BALLOON | ||
236 | boolean | ||
237 | |||
238 | # | ||
231 | # support for memory balloon compaction | 239 | # support for memory balloon compaction |
232 | config BALLOON_COMPACTION | 240 | config BALLOON_COMPACTION |
233 | bool "Allow for balloon memory compaction/migration" | 241 | bool "Allow for balloon memory compaction/migration" |
234 | def_bool y | 242 | def_bool y |
235 | depends on COMPACTION && VIRTIO_BALLOON | 243 | depends on COMPACTION && MEMORY_BALLOON |
236 | help | 244 | help |
237 | Memory fragmentation introduced by ballooning might reduce | 245 | Memory fragmentation introduced by ballooning might reduce |
238 | significantly the number of 2MB contiguous memory blocks that can be | 246 | significantly the number of 2MB contiguous memory blocks that can be |
diff --git a/mm/Makefile b/mm/Makefile
index 2ad574d1d12d..8405eb0023a9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,9 +16,9 @@ obj-y := filemap.o mempool.o oom_kill.o \ | |||
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o balloon_compaction.o vmacache.o \ | 19 | compaction.o vmacache.o \ |
20 | interval_tree.o list_lru.o workingset.o \ | 20 | interval_tree.o list_lru.o workingset.o \ |
21 | iov_iter.o $(mmu-y) | 21 | iov_iter.o debug.o $(mmu-y) |
22 | 22 | ||
23 | obj-y += init-mm.o | 23 | obj-y += init-mm.o |
24 | 24 | ||
@@ -68,3 +68,4 @@ obj-$(CONFIG_ZBUD) += zbud.o | |||
68 | obj-$(CONFIG_ZSMALLOC) += zsmalloc.o | 68 | obj-$(CONFIG_ZSMALLOC) += zsmalloc.o |
69 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o | 69 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o |
70 | obj-$(CONFIG_CMA) += cma.o | 70 | obj-$(CONFIG_CMA) += cma.o |
71 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1706cbbdf5f0..12a992b62576 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
455 | bdi_wb_init(&bdi->wb, bdi); | 455 | bdi_wb_init(&bdi->wb, bdi); |
456 | 456 | ||
457 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | 457 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { |
458 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); | 458 | err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); |
459 | if (err) | 459 | if (err) |
460 | goto err; | 460 | goto err; |
461 | } | 461 | } |
@@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
470 | bdi->write_bandwidth = INIT_BW; | 470 | bdi->write_bandwidth = INIT_BW; |
471 | bdi->avg_write_bandwidth = INIT_BW; | 471 | bdi->avg_write_bandwidth = INIT_BW; |
472 | 472 | ||
473 | err = fprop_local_init_percpu(&bdi->completions); | 473 | err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); |
474 | 474 | ||
475 | if (err) { | 475 | if (err) { |
476 | err: | 476 | err: |
@@ -631,7 +631,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) | |||
631 | * of sleeping on the congestion queue | 631 | * of sleeping on the congestion queue |
632 | */ | 632 | */ |
633 | if (atomic_read(&nr_bdi_congested[sync]) == 0 || | 633 | if (atomic_read(&nr_bdi_congested[sync]) == 0 || |
634 | !zone_is_reclaim_congested(zone)) { | 634 | !test_bit(ZONE_CONGESTED, &zone->flags)) { |
635 | cond_resched(); | 635 | cond_resched(); |
636 | 636 | ||
637 | /* In case we scheduled, work out time remaining */ | 637 | /* In case we scheduled, work out time remaining */ |
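[Editor's note] The two bdi hunks above track an API change elsewhere in this series: percpu_counter_init() and fprop_local_init_percpu() now take the GFP flags to use for their internal per-CPU allocation. A minimal, hypothetical caller outside this patch would look like the sketch below (the demo_* names are illustrative, not from the diff):

	#include <linux/gfp.h>
	#include <linux/percpu_counter.h>

	static struct percpu_counter demo_counter;

	static int demo_counter_setup(void)
	{
		/* third argument is new: GFP context for the per-CPU allocation */
		return percpu_counter_init(&demo_counter, 0, GFP_KERNEL);
	}

	static void demo_counter_teardown(void)
	{
		percpu_counter_destroy(&demo_counter);
	}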
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a5074bf0..b3cbe19f71b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -11,32 +11,6 @@ | |||
11 | #include <linux/balloon_compaction.h> | 11 | #include <linux/balloon_compaction.h> |
12 | 12 | ||
13 | /* | 13 | /* |
14 | * balloon_devinfo_alloc - allocates a balloon device information descriptor. | ||
15 | * @balloon_dev_descriptor: pointer to reference the balloon device which | ||
16 | * this struct balloon_dev_info will be servicing. | ||
17 | * | ||
18 | * Driver must call it to properly allocate and initialize an instance of | ||
19 | * struct balloon_dev_info which will be used to reference a balloon device | ||
20 | * as well as to keep track of the balloon device page list. | ||
21 | */ | ||
22 | struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) | ||
23 | { | ||
24 | struct balloon_dev_info *b_dev_info; | ||
25 | b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); | ||
26 | if (!b_dev_info) | ||
27 | return ERR_PTR(-ENOMEM); | ||
28 | |||
29 | b_dev_info->balloon_device = balloon_dev_descriptor; | ||
30 | b_dev_info->mapping = NULL; | ||
31 | b_dev_info->isolated_pages = 0; | ||
32 | spin_lock_init(&b_dev_info->pages_lock); | ||
33 | INIT_LIST_HEAD(&b_dev_info->pages); | ||
34 | |||
35 | return b_dev_info; | ||
36 | } | ||
37 | EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); | ||
38 | |||
39 | /* | ||
40 | * balloon_page_enqueue - allocates a new page and inserts it into the balloon | 14 | * balloon_page_enqueue - allocates a new page and inserts it into the balloon |
41 | * page list. | 15 | * page list. |
42 | * @b_dev_info: balloon device decriptor where we will insert a new page to | 16 | * @b_dev_info: balloon device decriptor where we will insert a new page to |
@@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) | |||
61 | */ | 35 | */ |
62 | BUG_ON(!trylock_page(page)); | 36 | BUG_ON(!trylock_page(page)); |
63 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 37 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
64 | balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); | 38 | balloon_page_insert(b_dev_info, page); |
39 | __count_vm_event(BALLOON_INFLATE); | ||
65 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 40 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
66 | unlock_page(page); | 41 | unlock_page(page); |
67 | return page; | 42 | return page; |
@@ -93,18 +68,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | |||
93 | * to be released by the balloon driver. | 68 | * to be released by the balloon driver. |
94 | */ | 69 | */ |
95 | if (trylock_page(page)) { | 70 | if (trylock_page(page)) { |
71 | if (!PagePrivate(page)) { | ||
72 | /* raced with isolation */ | ||
73 | unlock_page(page); | ||
74 | continue; | ||
75 | } | ||
96 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 76 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
97 | /* | ||
98 | * Raise the page refcount here to prevent any wrong | ||
99 | * attempt to isolate this page, in case of coliding | ||
100 | * with balloon_page_isolate() just after we release | ||
101 | * the page lock. | ||
102 | * | ||
103 | * balloon_page_free() will take care of dropping | ||
104 | * this extra refcount later. | ||
105 | */ | ||
106 | get_page(page); | ||
107 | balloon_page_delete(page); | 77 | balloon_page_delete(page); |
78 | __count_vm_event(BALLOON_DEFLATE); | ||
108 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 79 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
109 | unlock_page(page); | 80 | unlock_page(page); |
110 | dequeued_page = true; | 81 | dequeued_page = true; |
@@ -132,62 +103,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | |||
132 | EXPORT_SYMBOL_GPL(balloon_page_dequeue); | 103 | EXPORT_SYMBOL_GPL(balloon_page_dequeue); |
133 | 104 | ||
134 | #ifdef CONFIG_BALLOON_COMPACTION | 105 | #ifdef CONFIG_BALLOON_COMPACTION |
135 | /* | ||
136 | * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. | ||
137 | * @b_dev_info: holds the balloon device information descriptor. | ||
138 | * @a_ops: balloon_mapping address_space_operations descriptor. | ||
139 | * | ||
140 | * Driver must call it to properly allocate and initialize an instance of | ||
141 | * struct address_space which will be used as the special page->mapping for | ||
142 | * balloon device enlisted page instances. | ||
143 | */ | ||
144 | struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, | ||
145 | const struct address_space_operations *a_ops) | ||
146 | { | ||
147 | struct address_space *mapping; | ||
148 | |||
149 | mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); | ||
150 | if (!mapping) | ||
151 | return ERR_PTR(-ENOMEM); | ||
152 | |||
153 | /* | ||
154 | * Give a clean 'zeroed' status to all elements of this special | ||
155 | * balloon page->mapping struct address_space instance. | ||
156 | */ | ||
157 | address_space_init_once(mapping); | ||
158 | |||
159 | /* | ||
160 | * Set mapping->flags appropriately, to allow balloon pages | ||
161 | * ->mapping identification. | ||
162 | */ | ||
163 | mapping_set_balloon(mapping); | ||
164 | mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); | ||
165 | |||
166 | /* balloon's page->mapping->a_ops callback descriptor */ | ||
167 | mapping->a_ops = a_ops; | ||
168 | |||
169 | /* | ||
170 | * Establish a pointer reference back to the balloon device descriptor | ||
171 | * this particular page->mapping will be servicing. | ||
172 | * This is used by compaction / migration procedures to identify and | ||
173 | * access the balloon device pageset while isolating / migrating pages. | ||
174 | * | ||
175 | * As some balloon drivers can register multiple balloon devices | ||
176 | * for a single guest, this also helps compaction / migration to | ||
177 | * properly deal with multiple balloon pagesets, when required. | ||
178 | */ | ||
179 | mapping->private_data = b_dev_info; | ||
180 | b_dev_info->mapping = mapping; | ||
181 | |||
182 | return mapping; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(balloon_mapping_alloc); | ||
185 | 106 | ||
186 | static inline void __isolate_balloon_page(struct page *page) | 107 | static inline void __isolate_balloon_page(struct page *page) |
187 | { | 108 | { |
188 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | 109 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); |
189 | unsigned long flags; | 110 | unsigned long flags; |
111 | |||
190 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 112 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
113 | ClearPagePrivate(page); | ||
191 | list_del(&page->lru); | 114 | list_del(&page->lru); |
192 | b_dev_info->isolated_pages++; | 115 | b_dev_info->isolated_pages++; |
193 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 116 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
@@ -195,20 +118,16 @@ static inline void __isolate_balloon_page(struct page *page) | |||
195 | 118 | ||
196 | static inline void __putback_balloon_page(struct page *page) | 119 | static inline void __putback_balloon_page(struct page *page) |
197 | { | 120 | { |
198 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | 121 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); |
199 | unsigned long flags; | 122 | unsigned long flags; |
123 | |||
200 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 124 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
125 | SetPagePrivate(page); | ||
201 | list_add(&page->lru, &b_dev_info->pages); | 126 | list_add(&page->lru, &b_dev_info->pages); |
202 | b_dev_info->isolated_pages--; | 127 | b_dev_info->isolated_pages--; |
203 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 128 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
204 | } | 129 | } |
205 | 130 | ||
206 | static inline int __migrate_balloon_page(struct address_space *mapping, | ||
207 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
208 | { | ||
209 | return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); | ||
210 | } | ||
211 | |||
212 | /* __isolate_lru_page() counterpart for a ballooned page */ | 131 | /* __isolate_lru_page() counterpart for a ballooned page */ |
213 | bool balloon_page_isolate(struct page *page) | 132 | bool balloon_page_isolate(struct page *page) |
214 | { | 133 | { |
@@ -235,12 +154,11 @@ bool balloon_page_isolate(struct page *page) | |||
235 | */ | 154 | */ |
236 | if (likely(trylock_page(page))) { | 155 | if (likely(trylock_page(page))) { |
237 | /* | 156 | /* |
238 | * A ballooned page, by default, has just one refcount. | 157 | * A ballooned page, by default, has PagePrivate set. |
239 | * Prevent concurrent compaction threads from isolating | 158 | * Prevent concurrent compaction threads from isolating |
240 | * an already isolated balloon page by refcount check. | 159 | * an already isolated balloon page by clearing it. |
241 | */ | 160 | */ |
242 | if (__is_movable_balloon_page(page) && | 161 | if (balloon_page_movable(page)) { |
243 | page_count(page) == 2) { | ||
244 | __isolate_balloon_page(page); | 162 | __isolate_balloon_page(page); |
245 | unlock_page(page); | 163 | unlock_page(page); |
246 | return true; | 164 | return true; |
@@ -276,7 +194,7 @@ void balloon_page_putback(struct page *page) | |||
276 | int balloon_page_migrate(struct page *newpage, | 194 | int balloon_page_migrate(struct page *newpage, |
277 | struct page *page, enum migrate_mode mode) | 195 | struct page *page, enum migrate_mode mode) |
278 | { | 196 | { |
279 | struct address_space *mapping; | 197 | struct balloon_dev_info *balloon = balloon_page_device(page); |
280 | int rc = -EAGAIN; | 198 | int rc = -EAGAIN; |
281 | 199 | ||
282 | /* | 200 | /* |
@@ -292,9 +210,8 @@ int balloon_page_migrate(struct page *newpage, | |||
292 | return rc; | 210 | return rc; |
293 | } | 211 | } |
294 | 212 | ||
295 | mapping = page->mapping; | 213 | if (balloon && balloon->migratepage) |
296 | if (mapping) | 214 | rc = balloon->migratepage(balloon, newpage, page, mode); |
297 | rc = __migrate_balloon_page(mapping, newpage, page, mode); | ||
298 | 215 | ||
299 | unlock_page(newpage); | 216 | unlock_page(newpage); |
300 | return rc; | 217 | return rc; |
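[Editor's note] With balloon_devinfo_alloc() and balloon_mapping_alloc() removed, a balloon driver is expected to embed struct balloon_dev_info and register its migration callback directly, and ballooned pages are now tracked via PagePrivate instead of a special ->mapping. The sketch below shows a plausible driver-side conversion; it assumes the companion header change in this series provides balloon_devinfo_init() and a migratepage function pointer, and all demo_* names are hypothetical:

	#include <linux/balloon_compaction.h>
	#include <linux/migrate.h>

	struct demo_balloon {
		/* the descriptor is now embedded rather than kmalloc'ed */
		struct balloon_dev_info b_dev_info;
		/* ... device-specific state ... */
	};

	static int demo_migratepage(struct balloon_dev_info *b_dev_info,
				    struct page *newpage, struct page *page,
				    enum migrate_mode mode)
	{
		/* tell the device to use newpage and release the old page here */
		return MIGRATEPAGE_SUCCESS;
	}

	static void demo_balloon_setup(struct demo_balloon *db)
	{
		balloon_devinfo_init(&db->b_dev_info);
	#ifdef CONFIG_BALLOON_COMPACTION
		db->b_dev_info.migratepage = demo_migratepage;
	#endif
	}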
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd3507b413..8a000cebb0d7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -16,9 +16,9 @@ | |||
16 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
17 | #include <linux/range.h> | 17 | #include <linux/range.h> |
18 | #include <linux/memblock.h> | 18 | #include <linux/memblock.h> |
19 | #include <linux/bug.h> | ||
20 | #include <linux/io.h> | ||
19 | 21 | ||
20 | #include <asm/bug.h> | ||
21 | #include <asm/io.h> | ||
22 | #include <asm/processor.h> | 22 | #include <asm/processor.h> |
23 | 23 | ||
24 | #include "internal.h" | 24 | #include "internal.h" |
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/log2.h> | 33 | #include <linux/log2.h> |
34 | #include <linux/cma.h> | 34 | #include <linux/cma.h> |
35 | #include <linux/highmem.h> | ||
35 | 36 | ||
36 | struct cma { | 37 | struct cma { |
37 | unsigned long base_pfn; | 38 | unsigned long base_pfn; |
@@ -163,6 +164,8 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
163 | bool fixed, struct cma **res_cma) | 164 | bool fixed, struct cma **res_cma) |
164 | { | 165 | { |
165 | struct cma *cma; | 166 | struct cma *cma; |
167 | phys_addr_t memblock_end = memblock_end_of_DRAM(); | ||
168 | phys_addr_t highmem_start = __pa(high_memory); | ||
166 | int ret = 0; | 169 | int ret = 0; |
167 | 170 | ||
168 | pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", | 171 | pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", |
@@ -196,6 +199,24 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
196 | if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) | 199 | if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) |
197 | return -EINVAL; | 200 | return -EINVAL; |
198 | 201 | ||
202 | /* | ||
203 | * adjust limit to avoid crossing low/high memory boundary for | ||
204 | * automatically allocated regions | ||
205 | */ | ||
206 | if (((limit == 0 || limit > memblock_end) && | ||
207 | (memblock_end - size < highmem_start && | ||
208 | memblock_end > highmem_start)) || | ||
209 | (!fixed && limit > highmem_start && limit - size < highmem_start)) { | ||
210 | limit = highmem_start; | ||
211 | } | ||
212 | |||
213 | if (fixed && base < highmem_start && base+size > highmem_start) { | ||
214 | ret = -EINVAL; | ||
215 | pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", | ||
216 | (unsigned long)base, (unsigned long)highmem_start); | ||
217 | goto err; | ||
218 | } | ||
219 | |||
199 | /* Reserve memory */ | 220 | /* Reserve memory */ |
200 | if (base && fixed) { | 221 | if (base && fixed) { |
201 | if (memblock_is_region_reserved(base, size) || | 222 | if (memblock_is_region_reserved(base, size) || |
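[Editor's note] The clamping logic added above keeps an automatically placed CMA region from straddling the lowmem/highmem boundary, and rejects fixed regions that would. As a hedged, standalone illustration (plain C, not kernel code; the numbers are invented): with 1 GiB of RAM, lowmem ending at 896 MiB and a 256 MiB request with no explicit limit, top-down placement would span the boundary, so the limit is pulled down to highmem_start:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t memblock_end  = 0x40000000ULL;  /* 1 GiB of RAM        */
		uint64_t highmem_start = 0x38000000ULL;  /* lowmem ends, 896 MiB */
		uint64_t size          = 0x10000000ULL;  /* 256 MiB request     */
		uint64_t limit         = 0;              /* no limit given      */
		bool fixed             = false;

		/* same condition as the mm/cma.c hunk above */
		if (((limit == 0 || limit > memblock_end) &&
		     (memblock_end - size < highmem_start &&
		      memblock_end > highmem_start)) ||
		    (!fixed && limit > highmem_start && limit - size < highmem_start))
			limit = highmem_start;

		/* prints 0x38000000: the region stays entirely in lowmem */
		printf("adjusted limit = %#llx\n", (unsigned long long)limit);
		return 0;
	}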
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b642a..edba18aed173 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype) | |||
67 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 67 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
68 | } | 68 | } |
69 | 69 | ||
70 | /* | ||
71 | * Check that the whole (or subset of) a pageblock given by the interval of | ||
72 | * [start_pfn, end_pfn) is valid and within the same zone, before scanning it | ||
73 | * with the migration of free compaction scanner. The scanners then need to | ||
74 | * use only pfn_valid_within() check for arches that allow holes within | ||
75 | * pageblocks. | ||
76 | * | ||
77 | * Return struct page pointer of start_pfn, or NULL if checks were not passed. | ||
78 | * | ||
79 | * It's possible on some configurations to have a setup like node0 node1 node0 | ||
80 | * i.e. it's possible that all pages within a zones range of pages do not | ||
81 | * belong to a single zone. We assume that a border between node0 and node1 | ||
82 | * can occur within a single pageblock, but not a node0 node1 node0 | ||
83 | * interleaving within a single pageblock. It is therefore sufficient to check | ||
84 | * the first and last page of a pageblock and avoid checking each individual | ||
85 | * page in a pageblock. | ||
86 | */ | ||
87 | static struct page *pageblock_pfn_to_page(unsigned long start_pfn, | ||
88 | unsigned long end_pfn, struct zone *zone) | ||
89 | { | ||
90 | struct page *start_page; | ||
91 | struct page *end_page; | ||
92 | |||
93 | /* end_pfn is one past the range we are checking */ | ||
94 | end_pfn--; | ||
95 | |||
96 | if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) | ||
97 | return NULL; | ||
98 | |||
99 | start_page = pfn_to_page(start_pfn); | ||
100 | |||
101 | if (page_zone(start_page) != zone) | ||
102 | return NULL; | ||
103 | |||
104 | end_page = pfn_to_page(end_pfn); | ||
105 | |||
106 | /* This gives a shorter code than deriving page_zone(end_page) */ | ||
107 | if (page_zone_id(start_page) != page_zone_id(end_page)) | ||
108 | return NULL; | ||
109 | |||
110 | return start_page; | ||
111 | } | ||
112 | |||
70 | #ifdef CONFIG_COMPACTION | 113 | #ifdef CONFIG_COMPACTION |
71 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | 114 | /* Returns true if the pageblock should be scanned for pages to isolate. */ |
72 | static inline bool isolation_suitable(struct compact_control *cc, | 115 | static inline bool isolation_suitable(struct compact_control *cc, |
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat) | |||
132 | */ | 175 | */ |
133 | static void update_pageblock_skip(struct compact_control *cc, | 176 | static void update_pageblock_skip(struct compact_control *cc, |
134 | struct page *page, unsigned long nr_isolated, | 177 | struct page *page, unsigned long nr_isolated, |
135 | bool set_unsuitable, bool migrate_scanner) | 178 | bool migrate_scanner) |
136 | { | 179 | { |
137 | struct zone *zone = cc->zone; | 180 | struct zone *zone = cc->zone; |
138 | unsigned long pfn; | 181 | unsigned long pfn; |
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
146 | if (nr_isolated) | 189 | if (nr_isolated) |
147 | return; | 190 | return; |
148 | 191 | ||
149 | /* | 192 | set_pageblock_skip(page); |
150 | * Only skip pageblocks when all forms of compaction will be known to | ||
151 | * fail in the near future. | ||
152 | */ | ||
153 | if (set_unsuitable) | ||
154 | set_pageblock_skip(page); | ||
155 | 193 | ||
156 | pfn = page_to_pfn(page); | 194 | pfn = page_to_pfn(page); |
157 | 195 | ||
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
180 | 218 | ||
181 | static void update_pageblock_skip(struct compact_control *cc, | 219 | static void update_pageblock_skip(struct compact_control *cc, |
182 | struct page *page, unsigned long nr_isolated, | 220 | struct page *page, unsigned long nr_isolated, |
183 | bool set_unsuitable, bool migrate_scanner) | 221 | bool migrate_scanner) |
184 | { | 222 | { |
185 | } | 223 | } |
186 | #endif /* CONFIG_COMPACTION */ | 224 | #endif /* CONFIG_COMPACTION */ |
187 | 225 | ||
188 | static inline bool should_release_lock(spinlock_t *lock) | 226 | /* |
227 | * Compaction requires the taking of some coarse locks that are potentially | ||
228 | * very heavily contended. For async compaction, back out if the lock cannot | ||
229 | * be taken immediately. For sync compaction, spin on the lock if needed. | ||
230 | * | ||
231 | * Returns true if the lock is held | ||
232 | * Returns false if the lock is not held and compaction should abort | ||
233 | */ | ||
234 | static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, | ||
235 | struct compact_control *cc) | ||
189 | { | 236 | { |
190 | return need_resched() || spin_is_contended(lock); | 237 | if (cc->mode == MIGRATE_ASYNC) { |
238 | if (!spin_trylock_irqsave(lock, *flags)) { | ||
239 | cc->contended = COMPACT_CONTENDED_LOCK; | ||
240 | return false; | ||
241 | } | ||
242 | } else { | ||
243 | spin_lock_irqsave(lock, *flags); | ||
244 | } | ||
245 | |||
246 | return true; | ||
191 | } | 247 | } |
192 | 248 | ||
193 | /* | 249 | /* |
194 | * Compaction requires the taking of some coarse locks that are potentially | 250 | * Compaction requires the taking of some coarse locks that are potentially |
195 | * very heavily contended. Check if the process needs to be scheduled or | 251 | * very heavily contended. The lock should be periodically unlocked to avoid |
196 | * if the lock is contended. For async compaction, back out in the event | 252 | * having disabled IRQs for a long time, even when there is nobody waiting on |
197 | * if contention is severe. For sync compaction, schedule. | 253 | * the lock. It might also be that allowing the IRQs will result in |
254 | * need_resched() becoming true. If scheduling is needed, async compaction | ||
255 | * aborts. Sync compaction schedules. | ||
256 | * Either compaction type will also abort if a fatal signal is pending. | ||
257 | * In either case if the lock was locked, it is dropped and not regained. | ||
198 | * | 258 | * |
199 | * Returns true if the lock is held. | 259 | * Returns true if compaction should abort due to fatal signal pending, or |
200 | * Returns false if the lock is released and compaction should abort | 260 | * async compaction due to need_resched() |
261 | * Returns false when compaction can continue (sync compaction might have | ||
262 | * scheduled) | ||
201 | */ | 263 | */ |
202 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | 264 | static bool compact_unlock_should_abort(spinlock_t *lock, |
203 | bool locked, struct compact_control *cc) | 265 | unsigned long flags, bool *locked, struct compact_control *cc) |
204 | { | 266 | { |
205 | if (should_release_lock(lock)) { | 267 | if (*locked) { |
206 | if (locked) { | 268 | spin_unlock_irqrestore(lock, flags); |
207 | spin_unlock_irqrestore(lock, *flags); | 269 | *locked = false; |
208 | locked = false; | 270 | } |
209 | } | 271 | |
272 | if (fatal_signal_pending(current)) { | ||
273 | cc->contended = COMPACT_CONTENDED_SCHED; | ||
274 | return true; | ||
275 | } | ||
210 | 276 | ||
211 | /* async aborts if taking too long or contended */ | 277 | if (need_resched()) { |
212 | if (cc->mode == MIGRATE_ASYNC) { | 278 | if (cc->mode == MIGRATE_ASYNC) { |
213 | cc->contended = true; | 279 | cc->contended = COMPACT_CONTENDED_SCHED; |
214 | return false; | 280 | return true; |
215 | } | 281 | } |
216 | |||
217 | cond_resched(); | 282 | cond_resched(); |
218 | } | 283 | } |
219 | 284 | ||
220 | if (!locked) | 285 | return false; |
221 | spin_lock_irqsave(lock, *flags); | ||
222 | return true; | ||
223 | } | 286 | } |
224 | 287 | ||
225 | /* | 288 | /* |
226 | * Aside from avoiding lock contention, compaction also periodically checks | 289 | * Aside from avoiding lock contention, compaction also periodically checks |
227 | * need_resched() and either schedules in sync compaction or aborts async | 290 | * need_resched() and either schedules in sync compaction or aborts async |
228 | * compaction. This is similar to what compact_checklock_irqsave() does, but | 291 | * compaction. This is similar to what compact_unlock_should_abort() does, but |
229 | * is used where no lock is concerned. | 292 | * is used where no lock is concerned. |
230 | * | 293 | * |
231 | * Returns false when no scheduling was needed, or sync compaction scheduled. | 294 | * Returns false when no scheduling was needed, or sync compaction scheduled. |
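[Editor's note] Taken together, compact_trylock_irqsave() and compact_unlock_should_abort() invert the old scheme: the scanners now take the lock lazily and drop it at a fixed cadence rather than polling for contention before every lock operation. The later hunks in this file use the pair roughly as in this condensed, illustrative loop (not literal code from the patch; the scan body is elided):

	#include <linux/spinlock.h>
	#include <linux/swap.h>		/* SWAP_CLUSTER_MAX */

	/* illustration only: scan [pfn, end_pfn) under cc->zone->lock, politely */
	static void demo_scan_block(struct compact_control *cc, unsigned long pfn,
				    unsigned long end_pfn)
	{
		unsigned long flags = 0;
		bool locked = false;

		for (; pfn < end_pfn; pfn++) {
			/* every SWAP_CLUSTER_MAX pfns: drop the lock, maybe abort */
			if (!(pfn % SWAP_CLUSTER_MAX) &&
			    compact_unlock_should_abort(&cc->zone->lock, flags,
							&locked, cc))
				break;

			/* ... lockless checks that may skip this pfn go here ... */

			if (!locked) {
				/* async compaction gives up if the trylock fails */
				locked = compact_trylock_irqsave(&cc->zone->lock,
								 &flags, cc);
				if (!locked)
					break;
				/* ... recheck anything that may have changed ... */
			}

			/* ... isolate the page while the lock is held ... */
		}

		if (locked)
			spin_unlock_irqrestore(&cc->zone->lock, flags);
	}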
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc) | |||
236 | /* async compaction aborts if contended */ | 299 | /* async compaction aborts if contended */ |
237 | if (need_resched()) { | 300 | if (need_resched()) { |
238 | if (cc->mode == MIGRATE_ASYNC) { | 301 | if (cc->mode == MIGRATE_ASYNC) { |
239 | cc->contended = true; | 302 | cc->contended = COMPACT_CONTENDED_SCHED; |
240 | return true; | 303 | return true; |
241 | } | 304 | } |
242 | 305 | ||
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc) | |||
250 | static bool suitable_migration_target(struct page *page) | 313 | static bool suitable_migration_target(struct page *page) |
251 | { | 314 | { |
252 | /* If the page is a large free page, then disallow migration */ | 315 | /* If the page is a large free page, then disallow migration */ |
253 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 316 | if (PageBuddy(page)) { |
254 | return false; | 317 | /* |
318 | * We are checking page_order without zone->lock taken. But | ||
319 | * the only small danger is that we skip a potentially suitable | ||
320 | * pageblock, so it's not worth to check order for valid range. | ||
321 | */ | ||
322 | if (page_order_unsafe(page) >= pageblock_order) | ||
323 | return false; | ||
324 | } | ||
255 | 325 | ||
256 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | 326 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
257 | if (migrate_async_suitable(get_pageblock_migratetype(page))) | 327 | if (migrate_async_suitable(get_pageblock_migratetype(page))) |
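[Editor's note] suitable_migration_target() now calls page_order_unsafe(), which is not visible in this excerpt (the mm/internal.h hunk is not shown). Presumably it is a lockless read of the buddy order stored in page_private(), along the lines of the sketch below; the _sketch suffix marks it as an assumption rather than the actual helper:

	#include <linux/compiler.h>
	#include <linux/mm.h>

	/*
	 * Assumed shape of the helper added to mm/internal.h: read the buddy
	 * order without zone->lock. The value can be stale or garbage, so
	 * callers must treat it as a hint and range-check it (as the migrate
	 * scanner hunk later in this diff does against MAX_ORDER).
	 */
	#define page_order_unsafe_sketch(page)	ACCESS_ONCE(page_private(page))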
@@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page) | |||
267 | * (even though it may still end up isolating some pages). | 337 | * (even though it may still end up isolating some pages). |
268 | */ | 338 | */ |
269 | static unsigned long isolate_freepages_block(struct compact_control *cc, | 339 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
270 | unsigned long blockpfn, | 340 | unsigned long *start_pfn, |
271 | unsigned long end_pfn, | 341 | unsigned long end_pfn, |
272 | struct list_head *freelist, | 342 | struct list_head *freelist, |
273 | bool strict) | 343 | bool strict) |
274 | { | 344 | { |
275 | int nr_scanned = 0, total_isolated = 0; | 345 | int nr_scanned = 0, total_isolated = 0; |
276 | struct page *cursor, *valid_page = NULL; | 346 | struct page *cursor, *valid_page = NULL; |
277 | unsigned long flags; | 347 | unsigned long flags = 0; |
278 | bool locked = false; | 348 | bool locked = false; |
279 | bool checked_pageblock = false; | 349 | unsigned long blockpfn = *start_pfn; |
280 | 350 | ||
281 | cursor = pfn_to_page(blockpfn); | 351 | cursor = pfn_to_page(blockpfn); |
282 | 352 | ||
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
285 | int isolated, i; | 355 | int isolated, i; |
286 | struct page *page = cursor; | 356 | struct page *page = cursor; |
287 | 357 | ||
358 | /* | ||
359 | * Periodically drop the lock (if held) regardless of its | ||
360 | * contention, to give chance to IRQs. Abort if fatal signal | ||
361 | * pending or async compaction detects need_resched() | ||
362 | */ | ||
363 | if (!(blockpfn % SWAP_CLUSTER_MAX) | ||
364 | && compact_unlock_should_abort(&cc->zone->lock, flags, | ||
365 | &locked, cc)) | ||
366 | break; | ||
367 | |||
288 | nr_scanned++; | 368 | nr_scanned++; |
289 | if (!pfn_valid_within(blockpfn)) | 369 | if (!pfn_valid_within(blockpfn)) |
290 | goto isolate_fail; | 370 | goto isolate_fail; |
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
295 | goto isolate_fail; | 375 | goto isolate_fail; |
296 | 376 | ||
297 | /* | 377 | /* |
298 | * The zone lock must be held to isolate freepages. | 378 | * If we already hold the lock, we can skip some rechecking. |
299 | * Unfortunately this is a very coarse lock and can be | 379 | * Note that if we hold the lock now, checked_pageblock was |
300 | * heavily contended if there are parallel allocations | 380 | * already set in some previous iteration (or strict is true), |
301 | * or parallel compactions. For async compaction do not | 381 | * so it is correct to skip the suitable migration target |
302 | * spin on the lock and we acquire the lock as late as | 382 | * recheck as well. |
303 | * possible. | ||
304 | */ | 383 | */ |
305 | locked = compact_checklock_irqsave(&cc->zone->lock, &flags, | 384 | if (!locked) { |
306 | locked, cc); | ||
307 | if (!locked) | ||
308 | break; | ||
309 | |||
310 | /* Recheck this is a suitable migration target under lock */ | ||
311 | if (!strict && !checked_pageblock) { | ||
312 | /* | 385 | /* |
313 | * We need to check suitability of pageblock only once | 386 | * The zone lock must be held to isolate freepages. |
314 | * and this isolate_freepages_block() is called with | 387 | * Unfortunately this is a very coarse lock and can be |
315 | * pageblock range, so just check once is sufficient. | 388 | * heavily contended if there are parallel allocations |
389 | * or parallel compactions. For async compaction do not | ||
390 | * spin on the lock and we acquire the lock as late as | ||
391 | * possible. | ||
316 | */ | 392 | */ |
317 | checked_pageblock = true; | 393 | locked = compact_trylock_irqsave(&cc->zone->lock, |
318 | if (!suitable_migration_target(page)) | 394 | &flags, cc); |
395 | if (!locked) | ||
319 | break; | 396 | break; |
320 | } | ||
321 | 397 | ||
322 | /* Recheck this is a buddy page under lock */ | 398 | /* Recheck this is a buddy page under lock */ |
323 | if (!PageBuddy(page)) | 399 | if (!PageBuddy(page)) |
324 | goto isolate_fail; | 400 | goto isolate_fail; |
401 | } | ||
325 | 402 | ||
326 | /* Found a free page, break it into order-0 pages */ | 403 | /* Found a free page, break it into order-0 pages */ |
327 | isolated = split_free_page(page); | 404 | isolated = split_free_page(page); |
@@ -346,6 +423,9 @@ isolate_fail: | |||
346 | 423 | ||
347 | } | 424 | } |
348 | 425 | ||
426 | /* Record how far we have got within the block */ | ||
427 | *start_pfn = blockpfn; | ||
428 | |||
349 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | 429 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); |
350 | 430 | ||
351 | /* | 431 | /* |
@@ -361,8 +441,7 @@ isolate_fail: | |||
361 | 441 | ||
362 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 442 | /* Update the pageblock-skip if the whole pageblock was scanned */ |
363 | if (blockpfn == end_pfn) | 443 | if (blockpfn == end_pfn) |
364 | update_pageblock_skip(cc, valid_page, total_isolated, true, | 444 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
365 | false); | ||
366 | 445 | ||
367 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); | 446 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); |
368 | if (total_isolated) | 447 | if (total_isolated) |
@@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc, | |||
390 | unsigned long isolated, pfn, block_end_pfn; | 469 | unsigned long isolated, pfn, block_end_pfn; |
391 | LIST_HEAD(freelist); | 470 | LIST_HEAD(freelist); |
392 | 471 | ||
393 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { | 472 | pfn = start_pfn; |
394 | if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) | 473 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
395 | break; | 474 | |
475 | for (; pfn < end_pfn; pfn += isolated, | ||
476 | block_end_pfn += pageblock_nr_pages) { | ||
477 | /* Protect pfn from changing by isolate_freepages_block */ | ||
478 | unsigned long isolate_start_pfn = pfn; | ||
396 | 479 | ||
397 | /* | ||
398 | * On subsequent iterations ALIGN() is actually not needed, | ||
399 | * but we keep it that we not to complicate the code. | ||
400 | */ | ||
401 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
402 | block_end_pfn = min(block_end_pfn, end_pfn); | 480 | block_end_pfn = min(block_end_pfn, end_pfn); |
403 | 481 | ||
404 | isolated = isolate_freepages_block(cc, pfn, block_end_pfn, | 482 | if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) |
405 | &freelist, true); | 483 | break; |
484 | |||
485 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, | ||
486 | block_end_pfn, &freelist, true); | ||
406 | 487 | ||
407 | /* | 488 | /* |
408 | * In strict mode, isolate_freepages_block() returns 0 if | 489 | * In strict mode, isolate_freepages_block() returns 0 if |
@@ -433,22 +514,19 @@ isolate_freepages_range(struct compact_control *cc, | |||
433 | } | 514 | } |
434 | 515 | ||
435 | /* Update the number of anon and file isolated pages in the zone */ | 516 | /* Update the number of anon and file isolated pages in the zone */ |
436 | static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) | 517 | static void acct_isolated(struct zone *zone, struct compact_control *cc) |
437 | { | 518 | { |
438 | struct page *page; | 519 | struct page *page; |
439 | unsigned int count[2] = { 0, }; | 520 | unsigned int count[2] = { 0, }; |
440 | 521 | ||
522 | if (list_empty(&cc->migratepages)) | ||
523 | return; | ||
524 | |||
441 | list_for_each_entry(page, &cc->migratepages, lru) | 525 | list_for_each_entry(page, &cc->migratepages, lru) |
442 | count[!!page_is_file_cache(page)]++; | 526 | count[!!page_is_file_cache(page)]++; |
443 | 527 | ||
444 | /* If locked we can use the interrupt unsafe versions */ | 528 | mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); |
445 | if (locked) { | 529 | mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); |
446 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
447 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
448 | } else { | ||
449 | mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
450 | mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
451 | } | ||
452 | } | 530 | } |
453 | 531 | ||
454 | /* Similar to reclaim, but different enough that they don't share logic */ | 532 | /* Similar to reclaim, but different enough that they don't share logic */ |
@@ -467,40 +545,34 @@ static bool too_many_isolated(struct zone *zone) | |||
467 | } | 545 | } |
468 | 546 | ||
469 | /** | 547 | /** |
470 | * isolate_migratepages_range() - isolate all migrate-able pages in range. | 548 | * isolate_migratepages_block() - isolate all migrate-able pages within |
471 | * @zone: Zone pages are in. | 549 | * a single pageblock |
472 | * @cc: Compaction control structure. | 550 | * @cc: Compaction control structure. |
473 | * @low_pfn: The first PFN of the range. | 551 | * @low_pfn: The first PFN to isolate |
474 | * @end_pfn: The one-past-the-last PFN of the range. | 552 | * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock |
475 | * @unevictable: true if it allows to isolate unevictable pages | 553 | * @isolate_mode: Isolation mode to be used. |
476 | * | 554 | * |
477 | * Isolate all pages that can be migrated from the range specified by | 555 | * Isolate all pages that can be migrated from the range specified by |
478 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal | 556 | * [low_pfn, end_pfn). The range is expected to be within same pageblock. |
479 | * pending), otherwise PFN of the first page that was not scanned | 557 | * Returns zero if there is a fatal signal pending, otherwise PFN of the |
480 | * (which may be both less, equal to or more then end_pfn). | 558 | * first page that was not scanned (which may be both less, equal to or more |
559 | * than end_pfn). | ||
481 | * | 560 | * |
482 | * Assumes that cc->migratepages is empty and cc->nr_migratepages is | 561 | * The pages are isolated on cc->migratepages list (not required to be empty), |
483 | * zero. | 562 | * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field |
484 | * | 563 | * is neither read nor updated. |
485 | * Apart from cc->migratepages and cc->nr_migratetypes this function | ||
486 | * does not modify any cc's fields, in particular it does not modify | ||
487 | * (or read for that matter) cc->migrate_pfn. | ||
488 | */ | 564 | */ |
489 | unsigned long | 565 | static unsigned long |
490 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 566 | isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, |
491 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable) | 567 | unsigned long end_pfn, isolate_mode_t isolate_mode) |
492 | { | 568 | { |
493 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 569 | struct zone *zone = cc->zone; |
494 | unsigned long nr_scanned = 0, nr_isolated = 0; | 570 | unsigned long nr_scanned = 0, nr_isolated = 0; |
495 | struct list_head *migratelist = &cc->migratepages; | 571 | struct list_head *migratelist = &cc->migratepages; |
496 | struct lruvec *lruvec; | 572 | struct lruvec *lruvec; |
497 | unsigned long flags; | 573 | unsigned long flags = 0; |
498 | bool locked = false; | 574 | bool locked = false; |
499 | struct page *page = NULL, *valid_page = NULL; | 575 | struct page *page = NULL, *valid_page = NULL; |
500 | bool set_unsuitable = true; | ||
501 | const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? | ||
502 | ISOLATE_ASYNC_MIGRATE : 0) | | ||
503 | (unevictable ? ISOLATE_UNEVICTABLE : 0); | ||
504 | 576 | ||
505 | /* | 577 | /* |
506 | * Ensure that there are not too many pages isolated from the LRU | 578 | * Ensure that there are not too many pages isolated from the LRU |
@@ -523,72 +595,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
523 | 595 | ||
524 | /* Time to isolate some pages for migration */ | 596 | /* Time to isolate some pages for migration */ |
525 | for (; low_pfn < end_pfn; low_pfn++) { | 597 | for (; low_pfn < end_pfn; low_pfn++) { |
526 | /* give a chance to irqs before checking need_resched() */ | ||
527 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { | ||
528 | if (should_release_lock(&zone->lru_lock)) { | ||
529 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
530 | locked = false; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | /* | 598 | /* |
535 | * migrate_pfn does not necessarily start aligned to a | 599 | * Periodically drop the lock (if held) regardless of its |
536 | * pageblock. Ensure that pfn_valid is called when moving | 600 | * contention, to give chance to IRQs. Abort async compaction |
537 | * into a new MAX_ORDER_NR_PAGES range in case of large | 601 | * if contended. |
538 | * memory holes within the zone | ||
539 | */ | 602 | */ |
540 | if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { | 603 | if (!(low_pfn % SWAP_CLUSTER_MAX) |
541 | if (!pfn_valid(low_pfn)) { | 604 | && compact_unlock_should_abort(&zone->lru_lock, flags, |
542 | low_pfn += MAX_ORDER_NR_PAGES - 1; | 605 | &locked, cc)) |
543 | continue; | 606 | break; |
544 | } | ||
545 | } | ||
546 | 607 | ||
547 | if (!pfn_valid_within(low_pfn)) | 608 | if (!pfn_valid_within(low_pfn)) |
548 | continue; | 609 | continue; |
549 | nr_scanned++; | 610 | nr_scanned++; |
550 | 611 | ||
551 | /* | ||
552 | * Get the page and ensure the page is within the same zone. | ||
553 | * See the comment in isolate_freepages about overlapping | ||
554 | * nodes. It is deliberate that the new zone lock is not taken | ||
555 | * as memory compaction should not move pages between nodes. | ||
556 | */ | ||
557 | page = pfn_to_page(low_pfn); | 612 | page = pfn_to_page(low_pfn); |
558 | if (page_zone(page) != zone) | ||
559 | continue; | ||
560 | 613 | ||
561 | if (!valid_page) | 614 | if (!valid_page) |
562 | valid_page = page; | 615 | valid_page = page; |
563 | 616 | ||
564 | /* If isolation recently failed, do not retry */ | 617 | /* |
565 | pageblock_nr = low_pfn >> pageblock_order; | 618 | * Skip if free. We read page order here without zone lock |
566 | if (last_pageblock_nr != pageblock_nr) { | 619 | * which is generally unsafe, but the race window is small and |
567 | int mt; | 620 | * the worst thing that can happen is that we skip some |
568 | 621 | * potential isolation targets. | |
569 | last_pageblock_nr = pageblock_nr; | 622 | */ |
570 | if (!isolation_suitable(cc, page)) | 623 | if (PageBuddy(page)) { |
571 | goto next_pageblock; | 624 | unsigned long freepage_order = page_order_unsafe(page); |
572 | 625 | ||
573 | /* | 626 | /* |
574 | * For async migration, also only scan in MOVABLE | 627 | * Without lock, we cannot be sure that what we got is |
575 | * blocks. Async migration is optimistic to see if | 628 | * a valid page order. Consider only values in the |
576 | * the minimum amount of work satisfies the allocation | 629 | * valid order range to prevent low_pfn overflow. |
577 | */ | 630 | */ |
578 | mt = get_pageblock_migratetype(page); | 631 | if (freepage_order > 0 && freepage_order < MAX_ORDER) |
579 | if (cc->mode == MIGRATE_ASYNC && | 632 | low_pfn += (1UL << freepage_order) - 1; |
580 | !migrate_async_suitable(mt)) { | ||
581 | set_unsuitable = false; | ||
582 | goto next_pageblock; | ||
583 | } | ||
584 | } | ||
585 | |||
586 | /* | ||
587 | * Skip if free. page_order cannot be used without zone->lock | ||
588 | * as nothing prevents parallel allocations or buddy merging. | ||
589 | */ | ||
590 | if (PageBuddy(page)) | ||
591 | continue; | 633 | continue; |
634 | } | ||
592 | 635 | ||
593 | /* | 636 | /* |
594 | * Check may be lockless but that's ok as we recheck later. | 637 | * Check may be lockless but that's ok as we recheck later. |
@@ -597,7 +640,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
597 | */ | 640 | */ |
598 | if (!PageLRU(page)) { | 641 | if (!PageLRU(page)) { |
599 | if (unlikely(balloon_page_movable(page))) { | 642 | if (unlikely(balloon_page_movable(page))) { |
600 | if (locked && balloon_page_isolate(page)) { | 643 | if (balloon_page_isolate(page)) { |
601 | /* Successfully isolated */ | 644 | /* Successfully isolated */ |
602 | goto isolate_success; | 645 | goto isolate_success; |
603 | } | 646 | } |
@@ -617,8 +660,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
617 | */ | 660 | */ |
618 | if (PageTransHuge(page)) { | 661 | if (PageTransHuge(page)) { |
619 | if (!locked) | 662 | if (!locked) |
620 | goto next_pageblock; | 663 | low_pfn = ALIGN(low_pfn + 1, |
621 | low_pfn += (1 << compound_order(page)) - 1; | 664 | pageblock_nr_pages) - 1; |
665 | else | ||
666 | low_pfn += (1 << compound_order(page)) - 1; | ||
667 | |||
622 | continue; | 668 | continue; |
623 | } | 669 | } |
624 | 670 | ||
@@ -631,24 +677,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
631 | page_count(page) > page_mapcount(page)) | 677 | page_count(page) > page_mapcount(page)) |
632 | continue; | 678 | continue; |
633 | 679 | ||
634 | /* Check if it is ok to still hold the lock */ | 680 | /* If we already hold the lock, we can skip some rechecking */ |
635 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | 681 | if (!locked) { |
636 | locked, cc); | 682 | locked = compact_trylock_irqsave(&zone->lru_lock, |
637 | if (!locked || fatal_signal_pending(current)) | 683 | &flags, cc); |
638 | break; | 684 | if (!locked) |
685 | break; | ||
639 | 686 | ||
640 | /* Recheck PageLRU and PageTransHuge under lock */ | 687 | /* Recheck PageLRU and PageTransHuge under lock */ |
641 | if (!PageLRU(page)) | 688 | if (!PageLRU(page)) |
642 | continue; | 689 | continue; |
643 | if (PageTransHuge(page)) { | 690 | if (PageTransHuge(page)) { |
644 | low_pfn += (1 << compound_order(page)) - 1; | 691 | low_pfn += (1 << compound_order(page)) - 1; |
645 | continue; | 692 | continue; |
693 | } | ||
646 | } | 694 | } |
647 | 695 | ||
648 | lruvec = mem_cgroup_page_lruvec(page, zone); | 696 | lruvec = mem_cgroup_page_lruvec(page, zone); |
649 | 697 | ||
650 | /* Try isolate the page */ | 698 | /* Try isolate the page */ |
651 | if (__isolate_lru_page(page, mode) != 0) | 699 | if (__isolate_lru_page(page, isolate_mode) != 0) |
652 | continue; | 700 | continue; |
653 | 701 | ||
654 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | 702 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
@@ -667,14 +715,14 @@ isolate_success: | |||
667 | ++low_pfn; | 715 | ++low_pfn; |
668 | break; | 716 | break; |
669 | } | 717 | } |
670 | |||
671 | continue; | ||
672 | |||
673 | next_pageblock: | ||
674 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; | ||
675 | } | 718 | } |
676 | 719 | ||
677 | acct_isolated(zone, locked, cc); | 720 | /* |
721 | * The PageBuddy() check could have potentially brought us outside | ||
722 | * the range to be scanned. | ||
723 | */ | ||
724 | if (unlikely(low_pfn > end_pfn)) | ||
725 | low_pfn = end_pfn; | ||
678 | 726 | ||
679 | if (locked) | 727 | if (locked) |
680 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 728 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -684,8 +732,7 @@ next_pageblock: | |||
684 | * if the whole pageblock was scanned without isolating any page. | 732 | * if the whole pageblock was scanned without isolating any page. |
685 | */ | 733 | */ |
686 | if (low_pfn == end_pfn) | 734 | if (low_pfn == end_pfn) |
687 | update_pageblock_skip(cc, valid_page, nr_isolated, | 735 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
688 | set_unsuitable, true); | ||
689 | 736 | ||
690 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 737 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
691 | 738 | ||
@@ -696,17 +743,65 @@ next_pageblock: | |||
696 | return low_pfn; | 743 | return low_pfn; |
697 | } | 744 | } |
698 | 745 | ||
746 | /** | ||
747 | * isolate_migratepages_range() - isolate migrate-able pages in a PFN range | ||
748 | * @cc: Compaction control structure. | ||
749 | * @start_pfn: The first PFN to start isolating. | ||
750 | * @end_pfn: The one-past-last PFN. | ||
751 | * | ||
752 | * Returns zero if isolation fails fatally due to e.g. pending signal. | ||
753 | * Otherwise, function returns one-past-the-last PFN of isolated page | ||
754 | * (which may be greater than end_pfn if end fell in a middle of a THP page). | ||
755 | */ | ||
756 | unsigned long | ||
757 | isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, | ||
758 | unsigned long end_pfn) | ||
759 | { | ||
760 | unsigned long pfn, block_end_pfn; | ||
761 | |||
762 | /* Scan block by block. First and last block may be incomplete */ | ||
763 | pfn = start_pfn; | ||
764 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
765 | |||
766 | for (; pfn < end_pfn; pfn = block_end_pfn, | ||
767 | block_end_pfn += pageblock_nr_pages) { | ||
768 | |||
769 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
770 | |||
771 | if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) | ||
772 | continue; | ||
773 | |||
774 | pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, | ||
775 | ISOLATE_UNEVICTABLE); | ||
776 | |||
777 | /* | ||
778 | * In case of fatal failure, release everything that might | ||
779 | * have been isolated in the previous iteration, and signal | ||
780 | * the failure back to caller. | ||
781 | */ | ||
782 | if (!pfn) { | ||
783 | putback_movable_pages(&cc->migratepages); | ||
784 | cc->nr_migratepages = 0; | ||
785 | break; | ||
786 | } | ||
787 | } | ||
788 | acct_isolated(cc->zone, cc); | ||
789 | |||
790 | return pfn; | ||
791 | } | ||
792 | |||
699 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 793 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
700 | #ifdef CONFIG_COMPACTION | 794 | #ifdef CONFIG_COMPACTION |
701 | /* | 795 | /* |
702 | * Based on information in the current compact_control, find blocks | 796 | * Based on information in the current compact_control, find blocks |
703 | * suitable for isolating free pages from and then isolate them. | 797 | * suitable for isolating free pages from and then isolate them. |
704 | */ | 798 | */ |
705 | static void isolate_freepages(struct zone *zone, | 799 | static void isolate_freepages(struct compact_control *cc) |
706 | struct compact_control *cc) | ||
707 | { | 800 | { |
801 | struct zone *zone = cc->zone; | ||
708 | struct page *page; | 802 | struct page *page; |
709 | unsigned long block_start_pfn; /* start of current pageblock */ | 803 | unsigned long block_start_pfn; /* start of current pageblock */ |
804 | unsigned long isolate_start_pfn; /* exact pfn we start at */ | ||
710 | unsigned long block_end_pfn; /* end of current pageblock */ | 805 | unsigned long block_end_pfn; /* end of current pageblock */ |
711 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | 806 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ |
712 | int nr_freepages = cc->nr_freepages; | 807 | int nr_freepages = cc->nr_freepages; |
@@ -715,14 +810,15 @@ static void isolate_freepages(struct zone *zone, | |||
715 | /* | 810 | /* |
716 | * Initialise the free scanner. The starting point is where we last | 811 | * Initialise the free scanner. The starting point is where we last |
717 | * successfully isolated from, zone-cached value, or the end of the | 812 | * successfully isolated from, zone-cached value, or the end of the |
718 | * zone when isolating for the first time. We need this aligned to | 813 | * zone when isolating for the first time. For looping we also need |
719 | * the pageblock boundary, because we do | 814 | * this pfn aligned down to the pageblock boundary, because we do |
720 | * block_start_pfn -= pageblock_nr_pages in the for loop. | 815 | * block_start_pfn -= pageblock_nr_pages in the for loop. |
721 | * For ending point, take care when isolating in last pageblock of a | 816 | * For ending point, take care when isolating in last pageblock of a |
722 | * a zone which ends in the middle of a pageblock. | 817 | * a zone which ends in the middle of a pageblock. |
723 | * The low boundary is the end of the pageblock the migration scanner | 818 | * The low boundary is the end of the pageblock the migration scanner |
724 | * is using. | 819 | * is using. |
725 | */ | 820 | */ |
821 | isolate_start_pfn = cc->free_pfn; | ||
726 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); | 822 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); |
727 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, | 823 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, |
728 | zone_end_pfn(zone)); | 824 | zone_end_pfn(zone)); |
@@ -735,7 +831,8 @@ static void isolate_freepages(struct zone *zone, | |||
735 | */ | 831 | */ |
736 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 832 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
737 | block_end_pfn = block_start_pfn, | 833 | block_end_pfn = block_start_pfn, |
738 | block_start_pfn -= pageblock_nr_pages) { | 834 | block_start_pfn -= pageblock_nr_pages, |
835 | isolate_start_pfn = block_start_pfn) { | ||
739 | unsigned long isolated; | 836 | unsigned long isolated; |
740 | 837 | ||
741 | /* | 838 | /* |
@@ -747,18 +844,9 @@ static void isolate_freepages(struct zone *zone, | |||
747 | && compact_should_abort(cc)) | 844 | && compact_should_abort(cc)) |
748 | break; | 845 | break; |
749 | 846 | ||
750 | if (!pfn_valid(block_start_pfn)) | 847 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, |
751 | continue; | 848 | zone); |
752 | 849 | if (!page) | |
753 | /* | ||
754 | * Check for overlapping nodes/zones. It's possible on some | ||
755 | * configurations to have a setup like | ||
756 | * node0 node1 node0 | ||
757 | * i.e. it's possible that all pages within a zones range of | ||
758 | * pages do not belong to a single zone. | ||
759 | */ | ||
760 | page = pfn_to_page(block_start_pfn); | ||
761 | if (page_zone(page) != zone) | ||
762 | continue; | 850 | continue; |
763 | 851 | ||
764 | /* Check the block is suitable for migration */ | 852 | /* Check the block is suitable for migration */ |
@@ -769,13 +857,25 @@ static void isolate_freepages(struct zone *zone, | |||
769 | if (!isolation_suitable(cc, page)) | 857 | if (!isolation_suitable(cc, page)) |
770 | continue; | 858 | continue; |
771 | 859 | ||
772 | /* Found a block suitable for isolating free pages from */ | 860 | /* Found a block suitable for isolating free pages from. */ |
773 | cc->free_pfn = block_start_pfn; | 861 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, |
774 | isolated = isolate_freepages_block(cc, block_start_pfn, | ||
775 | block_end_pfn, freelist, false); | 862 | block_end_pfn, freelist, false); |
776 | nr_freepages += isolated; | 863 | nr_freepages += isolated; |
777 | 864 | ||
778 | /* | 865 | /* |
866 | * Remember where the free scanner should restart next time, | ||
867 | * which is where isolate_freepages_block() left off. | ||
868 | * But if it scanned the whole pageblock, isolate_start_pfn | ||
869 | * now points at block_end_pfn, which is the start of the next | ||
870 | * pageblock. | ||
871 | * In that case we will however want to restart at the start | ||
872 | * of the previous pageblock. | ||
873 | */ | ||
874 | cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? | ||
875 | isolate_start_pfn : | ||
876 | block_start_pfn - pageblock_nr_pages; | ||
877 | |||
878 | /* | ||
779 | * Set a flag that we successfully isolated in this pageblock. | 879 | * Set a flag that we successfully isolated in this pageblock. |
780 | * In the next loop iteration, zone->compact_cached_free_pfn | 880 | * In the next loop iteration, zone->compact_cached_free_pfn |
781 | * will not be updated and thus it will effectively contain the | 881 | * will not be updated and thus it will effectively contain the |
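Editor's note: the restart bookkeeping in the hunk above is compact but easy to misread. Below is a minimal standalone sketch of just that decision in plain userspace C; pageblock_nr_pages is replaced by an illustrative stand-in constant (512), so the numbers are for demonstration only.

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* stand-in; the real value is per-arch */

/*
 * If isolate_freepages_block() stopped inside the pageblock, resume there
 * next time; if it consumed the whole block, resume one pageblock lower,
 * since the free scanner walks the zone downwards.
 */
static unsigned long next_free_pfn(unsigned long isolate_start_pfn,
				   unsigned long block_start_pfn,
				   unsigned long block_end_pfn)
{
	return (isolate_start_pfn < block_end_pfn) ?
		isolate_start_pfn :
		block_start_pfn - PAGEBLOCK_NR_PAGES;
}

int main(void)
{
	unsigned long start = 4096, end = 4096 + PAGEBLOCK_NR_PAGES;

	printf("%lu\n", next_free_pfn(4300, start, end)); /* stopped early: 4300 */
	printf("%lu\n", next_free_pfn(end, start, end));  /* whole block: 3584 */
	return 0;
}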
@@ -822,7 +922,7 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
822 | */ | 922 | */ |
823 | if (list_empty(&cc->freepages)) { | 923 | if (list_empty(&cc->freepages)) { |
824 | if (!cc->contended) | 924 | if (!cc->contended) |
825 | isolate_freepages(cc->zone, cc); | 925 | isolate_freepages(cc); |
826 | 926 | ||
827 | if (list_empty(&cc->freepages)) | 927 | if (list_empty(&cc->freepages)) |
828 | return NULL; | 928 | return NULL; |
@@ -856,38 +956,84 @@ typedef enum { | |||
856 | } isolate_migrate_t; | 956 | } isolate_migrate_t; |
857 | 957 | ||
858 | /* | 958 | /* |
859 | * Isolate all pages that can be migrated from the block pointed to by | 959 | * Isolate all pages that can be migrated from the first suitable block, |
860 | * the migrate scanner within compact_control. | 960 | * starting at the block pointed to by the migrate scanner pfn within |
961 | * compact_control. | ||
861 | */ | 962 | */ |
862 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | 963 | static isolate_migrate_t isolate_migratepages(struct zone *zone, |
863 | struct compact_control *cc) | 964 | struct compact_control *cc) |
864 | { | 965 | { |
865 | unsigned long low_pfn, end_pfn; | 966 | unsigned long low_pfn, end_pfn; |
967 | struct page *page; | ||
968 | const isolate_mode_t isolate_mode = | ||
969 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); | ||
866 | 970 | ||
867 | /* Do not scan outside zone boundaries */ | 971 | /* |
868 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 972 | * Start at where we last stopped, or beginning of the zone as |
973 | * initialized by compact_zone() | ||
974 | */ | ||
975 | low_pfn = cc->migrate_pfn; | ||
869 | 976 | ||
870 | /* Only scan within a pageblock boundary */ | 977 | /* Only scan within a pageblock boundary */ |
871 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); | 978 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); |
872 | 979 | ||
873 | /* Do not cross the free scanner or scan within a memory hole */ | 980 | /* |
874 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 981 | * Iterate over whole pageblocks until we find the first suitable. |
875 | cc->migrate_pfn = end_pfn; | 982 | * Do not cross the free scanner. |
876 | return ISOLATE_NONE; | 983 | */ |
877 | } | 984 | for (; end_pfn <= cc->free_pfn; |
985 | low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { | ||
878 | 986 | ||
879 | /* Perform the isolation */ | 987 | /* |
880 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); | 988 | * This can potentially iterate a massively long zone with |
881 | if (!low_pfn || cc->contended) | 989 | * many pageblocks unsuitable, so periodically check if we |
882 | return ISOLATE_ABORT; | 990 | * need to schedule, or even abort async compaction. |
991 | */ | ||
992 | if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) | ||
993 | && compact_should_abort(cc)) | ||
994 | break; | ||
995 | |||
996 | page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); | ||
997 | if (!page) | ||
998 | continue; | ||
999 | |||
1000 | /* If isolation recently failed, do not retry */ | ||
1001 | if (!isolation_suitable(cc, page)) | ||
1002 | continue; | ||
1003 | |||
1004 | /* | ||
1005 | * For async compaction, also only scan in MOVABLE blocks. | ||
1006 | * Async compaction is optimistic to see if the minimum amount | ||
1007 | * of work satisfies the allocation. | ||
1008 | */ | ||
1009 | if (cc->mode == MIGRATE_ASYNC && | ||
1010 | !migrate_async_suitable(get_pageblock_migratetype(page))) | ||
1011 | continue; | ||
1012 | |||
1013 | /* Perform the isolation */ | ||
1014 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, | ||
1015 | isolate_mode); | ||
883 | 1016 | ||
1017 | if (!low_pfn || cc->contended) | ||
1018 | return ISOLATE_ABORT; | ||
1019 | |||
1020 | /* | ||
1021 | * Either we isolated something and proceed with migration. Or | ||
1022 | * we failed and compact_zone should decide if we should | ||
1023 | * continue or not. | ||
1024 | */ | ||
1025 | break; | ||
1026 | } | ||
1027 | |||
1028 | acct_isolated(zone, cc); | ||
1029 | /* Record where migration scanner will be restarted */ | ||
884 | cc->migrate_pfn = low_pfn; | 1030 | cc->migrate_pfn = low_pfn; |
885 | 1031 | ||
886 | return ISOLATE_SUCCESS; | 1032 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; |
887 | } | 1033 | } |
888 | 1034 | ||
889 | static int compact_finished(struct zone *zone, | 1035 | static int compact_finished(struct zone *zone, struct compact_control *cc, |
890 | struct compact_control *cc) | 1036 | const int migratetype) |
891 | { | 1037 | { |
892 | unsigned int order; | 1038 | unsigned int order; |
893 | unsigned long watermark; | 1039 | unsigned long watermark; |
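Editor's note: the reworked migrate scanner can walk an entire zone of unsuitable pageblocks, so the abort check above is rate-limited by a modulo test instead of running on every block. A small userspace sketch of that throttling pattern, with 32 and 512 standing in for SWAP_CLUSTER_MAX and pageblock_nr_pages:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL
#define CHECK_INTERVAL		(32UL * PAGEBLOCK_NR_PAGES)

static bool should_abort(void)
{
	return false;	/* stands in for compact_should_abort() */
}

int main(void)
{
	unsigned long pfn, polls = 0;

	for (pfn = 0; pfn < (1UL << 20); pfn += PAGEBLOCK_NR_PAGES) {
		/* Poll the expensive condition only every CHECK_INTERVAL pfns,
		 * exactly like the modulo test in the hunk above. */
		if (!(pfn % CHECK_INTERVAL)) {
			polls++;
			if (should_abort())
				break;
		}
		/* ... scan one pageblock ... */
	}
	printf("scanned %lu pfns, polled %lu times\n", pfn, polls);
	return 0;
}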
@@ -933,7 +1079,7 @@ static int compact_finished(struct zone *zone, | |||
933 | struct free_area *area = &zone->free_area[order]; | 1079 | struct free_area *area = &zone->free_area[order]; |
934 | 1080 | ||
935 | /* Job done if page is free of the right migratetype */ | 1081 | /* Job done if page is free of the right migratetype */ |
936 | if (!list_empty(&area->free_list[cc->migratetype])) | 1082 | if (!list_empty(&area->free_list[migratetype])) |
937 | return COMPACT_PARTIAL; | 1083 | return COMPACT_PARTIAL; |
938 | 1084 | ||
939 | /* Job done if allocation would set block type */ | 1085 | /* Job done if allocation would set block type */ |
@@ -999,6 +1145,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
999 | int ret; | 1145 | int ret; |
1000 | unsigned long start_pfn = zone->zone_start_pfn; | 1146 | unsigned long start_pfn = zone->zone_start_pfn; |
1001 | unsigned long end_pfn = zone_end_pfn(zone); | 1147 | unsigned long end_pfn = zone_end_pfn(zone); |
1148 | const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); | ||
1002 | const bool sync = cc->mode != MIGRATE_ASYNC; | 1149 | const bool sync = cc->mode != MIGRATE_ASYNC; |
1003 | 1150 | ||
1004 | ret = compaction_suitable(zone, cc->order); | 1151 | ret = compaction_suitable(zone, cc->order); |
@@ -1041,7 +1188,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1041 | 1188 | ||
1042 | migrate_prep_local(); | 1189 | migrate_prep_local(); |
1043 | 1190 | ||
1044 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 1191 | while ((ret = compact_finished(zone, cc, migratetype)) == |
1192 | COMPACT_CONTINUE) { | ||
1045 | int err; | 1193 | int err; |
1046 | 1194 | ||
1047 | switch (isolate_migratepages(zone, cc)) { | 1195 | switch (isolate_migratepages(zone, cc)) { |
@@ -1056,9 +1204,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1056 | ; | 1204 | ; |
1057 | } | 1205 | } |
1058 | 1206 | ||
1059 | if (!cc->nr_migratepages) | ||
1060 | continue; | ||
1061 | |||
1062 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1207 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1063 | compaction_free, (unsigned long)cc, cc->mode, | 1208 | compaction_free, (unsigned long)cc, cc->mode, |
1064 | MR_COMPACTION); | 1209 | MR_COMPACTION); |
@@ -1092,14 +1237,14 @@ out: | |||
1092 | } | 1237 | } |
1093 | 1238 | ||
1094 | static unsigned long compact_zone_order(struct zone *zone, int order, | 1239 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1095 | gfp_t gfp_mask, enum migrate_mode mode, bool *contended) | 1240 | gfp_t gfp_mask, enum migrate_mode mode, int *contended) |
1096 | { | 1241 | { |
1097 | unsigned long ret; | 1242 | unsigned long ret; |
1098 | struct compact_control cc = { | 1243 | struct compact_control cc = { |
1099 | .nr_freepages = 0, | 1244 | .nr_freepages = 0, |
1100 | .nr_migratepages = 0, | 1245 | .nr_migratepages = 0, |
1101 | .order = order, | 1246 | .order = order, |
1102 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1247 | .gfp_mask = gfp_mask, |
1103 | .zone = zone, | 1248 | .zone = zone, |
1104 | .mode = mode, | 1249 | .mode = mode, |
1105 | }; | 1250 | }; |
@@ -1124,48 +1269,117 @@ int sysctl_extfrag_threshold = 500; | |||
1124 | * @gfp_mask: The GFP mask of the current allocation | 1269 | * @gfp_mask: The GFP mask of the current allocation |
1125 | * @nodemask: The allowed nodes to allocate from | 1270 | * @nodemask: The allowed nodes to allocate from |
1126 | * @mode: The migration mode for async, sync light, or sync migration | 1271 | * @mode: The migration mode for async, sync light, or sync migration |
1127 | * @contended: Return value that is true if compaction was aborted due to lock contention | 1272 | * @contended: Return value that determines if compaction was aborted due to |
1128 | * @page: Optionally capture a free page of the requested order during compaction | 1273 | * need_resched() or lock contention |
1274 | * @candidate_zone: Return the zone where we think allocation should succeed | ||
1129 | * | 1275 | * |
1130 | * This is the main entry point for direct page compaction. | 1276 | * This is the main entry point for direct page compaction. |
1131 | */ | 1277 | */ |
1132 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1278 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1133 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1279 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1134 | enum migrate_mode mode, bool *contended) | 1280 | enum migrate_mode mode, int *contended, |
1281 | struct zone **candidate_zone) | ||
1135 | { | 1282 | { |
1136 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1283 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1137 | int may_enter_fs = gfp_mask & __GFP_FS; | 1284 | int may_enter_fs = gfp_mask & __GFP_FS; |
1138 | int may_perform_io = gfp_mask & __GFP_IO; | 1285 | int may_perform_io = gfp_mask & __GFP_IO; |
1139 | struct zoneref *z; | 1286 | struct zoneref *z; |
1140 | struct zone *zone; | 1287 | struct zone *zone; |
1141 | int rc = COMPACT_SKIPPED; | 1288 | int rc = COMPACT_DEFERRED; |
1142 | int alloc_flags = 0; | 1289 | int alloc_flags = 0; |
1290 | int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ | ||
1291 | |||
1292 | *contended = COMPACT_CONTENDED_NONE; | ||
1143 | 1293 | ||
1144 | /* Check if the GFP flags allow compaction */ | 1294 | /* Check if the GFP flags allow compaction */ |
1145 | if (!order || !may_enter_fs || !may_perform_io) | 1295 | if (!order || !may_enter_fs || !may_perform_io) |
1146 | return rc; | 1296 | return COMPACT_SKIPPED; |
1147 | |||
1148 | count_compact_event(COMPACTSTALL); | ||
1149 | 1297 | ||
1150 | #ifdef CONFIG_CMA | 1298 | #ifdef CONFIG_CMA |
1151 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 1299 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
1152 | alloc_flags |= ALLOC_CMA; | 1300 | alloc_flags |= ALLOC_CMA; |
1153 | #endif | 1301 | #endif |
1154 | /* Compact each zone in the list */ | 1302 | /* Compact each zone in the list */ |
1155 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1303 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
1156 | nodemask) { | 1304 | nodemask) { |
1157 | int status; | 1305 | int status; |
1306 | int zone_contended; | ||
1307 | |||
1308 | if (compaction_deferred(zone, order)) | ||
1309 | continue; | ||
1158 | 1310 | ||
1159 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1311 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1160 | contended); | 1312 | &zone_contended); |
1161 | rc = max(status, rc); | 1313 | rc = max(status, rc); |
1314 | /* | ||
1315 | * It takes at least one zone that wasn't lock contended | ||
1316 | * to clear all_zones_contended. | ||
1317 | */ | ||
1318 | all_zones_contended &= zone_contended; | ||
1162 | 1319 | ||
1163 | /* If a normal allocation would succeed, stop compacting */ | 1320 | /* If a normal allocation would succeed, stop compacting */ |
1164 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, | 1321 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, |
1165 | alloc_flags)) | 1322 | alloc_flags)) { |
1166 | break; | 1323 | *candidate_zone = zone; |
1324 | /* | ||
1325 | * We think the allocation will succeed in this zone, | ||
1326 | * but it is not certain, hence the false. The caller | ||
1327 | * will repeat this with true if allocation indeed | ||
1328 | * succeeds in this zone. | ||
1329 | */ | ||
1330 | compaction_defer_reset(zone, order, false); | ||
1331 | /* | ||
1332 | * It is possible that async compaction aborted due to | ||
1333 | * need_resched() and the watermarks were ok thanks to | ||
1334 | * somebody else freeing memory. The allocation can | ||
1335 | * however still fail so we better signal the | ||
1336 | * need_resched() contention anyway (this will not | ||
1337 | * prevent the allocation attempt). | ||
1338 | */ | ||
1339 | if (zone_contended == COMPACT_CONTENDED_SCHED) | ||
1340 | *contended = COMPACT_CONTENDED_SCHED; | ||
1341 | |||
1342 | goto break_loop; | ||
1343 | } | ||
1344 | |||
1345 | if (mode != MIGRATE_ASYNC) { | ||
1346 | /* | ||
1347 | * We think that allocation won't succeed in this zone | ||
1348 | * so we defer compaction there. If it ends up | ||
1349 | * succeeding after all, it will be reset. | ||
1350 | */ | ||
1351 | defer_compaction(zone, order); | ||
1352 | } | ||
1353 | |||
1354 | /* | ||
1355 | * We might have stopped compacting due to need_resched() in | ||
1356 | * async compaction, or due to a fatal signal detected. In that | ||
1357 | * case do not try further zones and signal need_resched() | ||
1358 | * contention. | ||
1359 | */ | ||
1360 | if ((zone_contended == COMPACT_CONTENDED_SCHED) | ||
1361 | || fatal_signal_pending(current)) { | ||
1362 | *contended = COMPACT_CONTENDED_SCHED; | ||
1363 | goto break_loop; | ||
1364 | } | ||
1365 | |||
1366 | continue; | ||
1367 | break_loop: | ||
1368 | /* | ||
1369 | * We might not have tried all the zones, so be conservative | ||
1370 | * and assume they are not all lock contended. | ||
1371 | */ | ||
1372 | all_zones_contended = 0; | ||
1373 | break; | ||
1167 | } | 1374 | } |
1168 | 1375 | ||
1376 | /* | ||
1377 | * If at least one zone wasn't deferred or skipped, we report if all | ||
1378 | * zones that were tried were lock contended. | ||
1379 | */ | ||
1380 | if (rc > COMPACT_SKIPPED && all_zones_contended) | ||
1381 | *contended = COMPACT_CONTENDED_LOCK; | ||
1382 | |||
1169 | return rc; | 1383 | return rc; |
1170 | } | 1384 | } |
1171 | 1385 | ||
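Editor's note: the all_zones_contended bookkeeping relies on the &= accumulation working out for the three COMPACT_CONTENDED_* values. The tiny userspace check below demonstrates that property; it assumes the values mirror the enum added to mm/internal.h in this series (NONE = 0, SCHED = 1, LOCK = 2).

#include <stdio.h>

enum { CONTENDED_NONE = 0, CONTENDED_SCHED = 1, CONTENDED_LOCK = 2 };

static int all_contended(const int *zones, int n)
{
	/* Initialised to LOCK for the &= op, as in try_to_compact_pages():
	 * any zone reporting NONE or SCHED clears the accumulator. */
	int all = CONTENDED_LOCK;
	int i;

	for (i = 0; i < n; i++)
		all &= zones[i];
	return all == CONTENDED_LOCK;
}

int main(void)
{
	int a[] = { CONTENDED_LOCK, CONTENDED_LOCK };
	int b[] = { CONTENDED_LOCK, CONTENDED_SCHED };

	printf("%d %d\n", all_contended(a, 2), all_contended(b, 2)); /* 1 0 */
	return 0;
}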
diff --git a/mm/debug.c b/mm/debug.c new file mode 100644 index 000000000000..5ce45c9a29b5 --- /dev/null +++ b/mm/debug.c | |||
@@ -0,0 +1,237 @@ | |||
1 | /* | ||
2 | * mm/debug.c | ||
3 | * | ||
4 | * mm/ specific debug routines. | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/ftrace_event.h> | ||
11 | #include <linux/memcontrol.h> | ||
12 | |||
13 | static const struct trace_print_flags pageflag_names[] = { | ||
14 | {1UL << PG_locked, "locked" }, | ||
15 | {1UL << PG_error, "error" }, | ||
16 | {1UL << PG_referenced, "referenced" }, | ||
17 | {1UL << PG_uptodate, "uptodate" }, | ||
18 | {1UL << PG_dirty, "dirty" }, | ||
19 | {1UL << PG_lru, "lru" }, | ||
20 | {1UL << PG_active, "active" }, | ||
21 | {1UL << PG_slab, "slab" }, | ||
22 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
23 | {1UL << PG_arch_1, "arch_1" }, | ||
24 | {1UL << PG_reserved, "reserved" }, | ||
25 | {1UL << PG_private, "private" }, | ||
26 | {1UL << PG_private_2, "private_2" }, | ||
27 | {1UL << PG_writeback, "writeback" }, | ||
28 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
29 | {1UL << PG_head, "head" }, | ||
30 | {1UL << PG_tail, "tail" }, | ||
31 | #else | ||
32 | {1UL << PG_compound, "compound" }, | ||
33 | #endif | ||
34 | {1UL << PG_swapcache, "swapcache" }, | ||
35 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
36 | {1UL << PG_reclaim, "reclaim" }, | ||
37 | {1UL << PG_swapbacked, "swapbacked" }, | ||
38 | {1UL << PG_unevictable, "unevictable" }, | ||
39 | #ifdef CONFIG_MMU | ||
40 | {1UL << PG_mlocked, "mlocked" }, | ||
41 | #endif | ||
42 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
43 | {1UL << PG_uncached, "uncached" }, | ||
44 | #endif | ||
45 | #ifdef CONFIG_MEMORY_FAILURE | ||
46 | {1UL << PG_hwpoison, "hwpoison" }, | ||
47 | #endif | ||
48 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
49 | {1UL << PG_compound_lock, "compound_lock" }, | ||
50 | #endif | ||
51 | }; | ||
52 | |||
53 | static void dump_flags(unsigned long flags, | ||
54 | const struct trace_print_flags *names, int count) | ||
55 | { | ||
56 | const char *delim = ""; | ||
57 | unsigned long mask; | ||
58 | int i; | ||
59 | |||
60 | pr_emerg("flags: %#lx(", flags); | ||
61 | |||
62 | /* remove zone id */ | ||
63 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
64 | |||
65 | for (i = 0; i < count && flags; i++) { | ||
66 | |||
67 | mask = names[i].mask; | ||
68 | if ((flags & mask) != mask) | ||
69 | continue; | ||
70 | |||
71 | flags &= ~mask; | ||
72 | pr_cont("%s%s", delim, names[i].name); | ||
73 | delim = "|"; | ||
74 | } | ||
75 | |||
76 | /* check for left over flags */ | ||
77 | if (flags) | ||
78 | pr_cont("%s%#lx", delim, flags); | ||
79 | |||
80 | pr_cont(")\n"); | ||
81 | } | ||
82 | |||
83 | void dump_page_badflags(struct page *page, const char *reason, | ||
84 | unsigned long badflags) | ||
85 | { | ||
86 | pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
87 | page, atomic_read(&page->_count), page_mapcount(page), | ||
88 | page->mapping, page->index); | ||
89 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
90 | dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); | ||
91 | if (reason) | ||
92 | pr_alert("page dumped because: %s\n", reason); | ||
93 | if (page->flags & badflags) { | ||
94 | pr_alert("bad because of flags:\n"); | ||
95 | dump_flags(page->flags & badflags, | ||
96 | pageflag_names, ARRAY_SIZE(pageflag_names)); | ||
97 | } | ||
98 | mem_cgroup_print_bad_page(page); | ||
99 | } | ||
100 | |||
101 | void dump_page(struct page *page, const char *reason) | ||
102 | { | ||
103 | dump_page_badflags(page, reason, 0); | ||
104 | } | ||
105 | EXPORT_SYMBOL(dump_page); | ||
106 | |||
107 | #ifdef CONFIG_DEBUG_VM | ||
108 | |||
109 | static const struct trace_print_flags vmaflags_names[] = { | ||
110 | {VM_READ, "read" }, | ||
111 | {VM_WRITE, "write" }, | ||
112 | {VM_EXEC, "exec" }, | ||
113 | {VM_SHARED, "shared" }, | ||
114 | {VM_MAYREAD, "mayread" }, | ||
115 | {VM_MAYWRITE, "maywrite" }, | ||
116 | {VM_MAYEXEC, "mayexec" }, | ||
117 | {VM_MAYSHARE, "mayshare" }, | ||
118 | {VM_GROWSDOWN, "growsdown" }, | ||
119 | {VM_PFNMAP, "pfnmap" }, | ||
120 | {VM_DENYWRITE, "denywrite" }, | ||
121 | {VM_LOCKED, "locked" }, | ||
122 | {VM_IO, "io" }, | ||
123 | {VM_SEQ_READ, "seqread" }, | ||
124 | {VM_RAND_READ, "randread" }, | ||
125 | {VM_DONTCOPY, "dontcopy" }, | ||
126 | {VM_DONTEXPAND, "dontexpand" }, | ||
127 | {VM_ACCOUNT, "account" }, | ||
128 | {VM_NORESERVE, "noreserve" }, | ||
129 | {VM_HUGETLB, "hugetlb" }, | ||
130 | {VM_NONLINEAR, "nonlinear" }, | ||
131 | #if defined(CONFIG_X86) | ||
132 | {VM_PAT, "pat" }, | ||
133 | #elif defined(CONFIG_PPC) | ||
134 | {VM_SAO, "sao" }, | ||
135 | #elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) | ||
136 | {VM_GROWSUP, "growsup" }, | ||
137 | #elif !defined(CONFIG_MMU) | ||
138 | {VM_MAPPED_COPY, "mappedcopy" }, | ||
139 | #else | ||
140 | {VM_ARCH_1, "arch_1" }, | ||
141 | #endif | ||
142 | {VM_DONTDUMP, "dontdump" }, | ||
143 | #ifdef CONFIG_MEM_SOFT_DIRTY | ||
144 | {VM_SOFTDIRTY, "softdirty" }, | ||
145 | #endif | ||
146 | {VM_MIXEDMAP, "mixedmap" }, | ||
147 | {VM_HUGEPAGE, "hugepage" }, | ||
148 | {VM_NOHUGEPAGE, "nohugepage" }, | ||
149 | {VM_MERGEABLE, "mergeable" }, | ||
150 | }; | ||
151 | |||
152 | void dump_vma(const struct vm_area_struct *vma) | ||
153 | { | ||
154 | pr_emerg("vma %p start %p end %p\n" | ||
155 | "next %p prev %p mm %p\n" | ||
156 | "prot %lx anon_vma %p vm_ops %p\n" | ||
157 | "pgoff %lx file %p private_data %p\n", | ||
158 | vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, | ||
159 | vma->vm_prev, vma->vm_mm, | ||
160 | (unsigned long)pgprot_val(vma->vm_page_prot), | ||
161 | vma->anon_vma, vma->vm_ops, vma->vm_pgoff, | ||
162 | vma->vm_file, vma->vm_private_data); | ||
163 | dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); | ||
164 | } | ||
165 | EXPORT_SYMBOL(dump_vma); | ||
166 | |||
167 | void dump_mm(const struct mm_struct *mm) | ||
168 | { | ||
169 | pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n" | ||
170 | #ifdef CONFIG_MMU | ||
171 | "get_unmapped_area %p\n" | ||
172 | #endif | ||
173 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" | ||
174 | "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" | ||
175 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" | ||
176 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" | ||
177 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" | ||
178 | "start_brk %lx brk %lx start_stack %lx\n" | ||
179 | "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" | ||
180 | "binfmt %p flags %lx core_state %p\n" | ||
181 | #ifdef CONFIG_AIO | ||
182 | "ioctx_table %p\n" | ||
183 | #endif | ||
184 | #ifdef CONFIG_MEMCG | ||
185 | "owner %p " | ||
186 | #endif | ||
187 | "exe_file %p\n" | ||
188 | #ifdef CONFIG_MMU_NOTIFIER | ||
189 | "mmu_notifier_mm %p\n" | ||
190 | #endif | ||
191 | #ifdef CONFIG_NUMA_BALANCING | ||
192 | "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" | ||
193 | #endif | ||
194 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) | ||
195 | "tlb_flush_pending %d\n" | ||
196 | #endif | ||
197 | "%s", /* This is here to hold the comma */ | ||
198 | |||
199 | mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, | ||
200 | #ifdef CONFIG_MMU | ||
201 | mm->get_unmapped_area, | ||
202 | #endif | ||
203 | mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, | ||
204 | mm->pgd, atomic_read(&mm->mm_users), | ||
205 | atomic_read(&mm->mm_count), | ||
206 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), | ||
207 | mm->map_count, | ||
208 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, | ||
209 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, | ||
210 | mm->start_code, mm->end_code, mm->start_data, mm->end_data, | ||
211 | mm->start_brk, mm->brk, mm->start_stack, | ||
212 | mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, | ||
213 | mm->binfmt, mm->flags, mm->core_state, | ||
214 | #ifdef CONFIG_AIO | ||
215 | mm->ioctx_table, | ||
216 | #endif | ||
217 | #ifdef CONFIG_MEMCG | ||
218 | mm->owner, | ||
219 | #endif | ||
220 | mm->exe_file, | ||
221 | #ifdef CONFIG_MMU_NOTIFIER | ||
222 | mm->mmu_notifier_mm, | ||
223 | #endif | ||
224 | #ifdef CONFIG_NUMA_BALANCING | ||
225 | mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, | ||
226 | #endif | ||
227 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) | ||
228 | mm->tlb_flush_pending, | ||
229 | #endif | ||
230 | "" /* This is here to not have a comma! */ | ||
231 | ); | ||
232 | |||
233 | dump_flags(mm->def_flags, vmaflags_names, | ||
234 | ARRAY_SIZE(vmaflags_names)); | ||
235 | } | ||
236 | |||
237 | #endif /* CONFIG_DEBUG_VM */ | ||
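Editor's note: dump_flags() above decodes a bitmask by walking a name table and printing any bits the table does not cover as a trailing hex value. A stripped-down, runnable userspace version of the same loop follows; the three-entry table is purely illustrative, the real names come from pageflag_names and vmaflags_names.

#include <stdio.h>

struct flag_name {
	unsigned long mask;
	const char *name;
};

static const struct flag_name names[] = {
	{ 1UL << 0, "locked" },
	{ 1UL << 4, "dirty"  },
	{ 1UL << 5, "lru"    },
};

static void dump_flags(unsigned long flags)
{
	const char *delim = "";
	unsigned int i;

	printf("flags: %#lx(", flags);
	for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
		unsigned long mask = names[i].mask;

		if ((flags & mask) != mask)
			continue;
		flags &= ~mask;
		printf("%s%s", delim, names[i].name);
		delim = "|";
	}
	if (flags)	/* leftover bits the table did not name */
		printf("%s%#lx", delim, flags);
	printf(")\n");
}

int main(void)
{
	dump_flags((1UL << 4) | (1UL << 5) | (1UL << 9));
	/* prints: flags: 0x230(dirty|lru|0x200) */
	return 0;
}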
diff --git a/mm/dmapool.c b/mm/dmapool.c index 306baa594f95..fd5fe4342e93 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ | |||
62 | }; | 62 | }; |
63 | 63 | ||
64 | static DEFINE_MUTEX(pools_lock); | 64 | static DEFINE_MUTEX(pools_lock); |
65 | static DEFINE_MUTEX(pools_reg_lock); | ||
65 | 66 | ||
66 | static ssize_t | 67 | static ssize_t |
67 | show_pools(struct device *dev, struct device_attribute *attr, char *buf) | 68 | show_pools(struct device *dev, struct device_attribute *attr, char *buf) |
@@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
132 | { | 133 | { |
133 | struct dma_pool *retval; | 134 | struct dma_pool *retval; |
134 | size_t allocation; | 135 | size_t allocation; |
136 | bool empty = false; | ||
135 | 137 | ||
136 | if (align == 0) { | 138 | if (align == 0) |
137 | align = 1; | 139 | align = 1; |
138 | } else if (align & (align - 1)) { | 140 | else if (align & (align - 1)) |
139 | return NULL; | 141 | return NULL; |
140 | } | ||
141 | 142 | ||
142 | if (size == 0) { | 143 | if (size == 0) |
143 | return NULL; | 144 | return NULL; |
144 | } else if (size < 4) { | 145 | else if (size < 4) |
145 | size = 4; | 146 | size = 4; |
146 | } | ||
147 | 147 | ||
148 | if ((size % align) != 0) | 148 | if ((size % align) != 0) |
149 | size = ALIGN(size, align); | 149 | size = ALIGN(size, align); |
150 | 150 | ||
151 | allocation = max_t(size_t, size, PAGE_SIZE); | 151 | allocation = max_t(size_t, size, PAGE_SIZE); |
152 | 152 | ||
153 | if (!boundary) { | 153 | if (!boundary) |
154 | boundary = allocation; | 154 | boundary = allocation; |
155 | } else if ((boundary < size) || (boundary & (boundary - 1))) { | 155 | else if ((boundary < size) || (boundary & (boundary - 1))) |
156 | return NULL; | 156 | return NULL; |
157 | } | ||
158 | 157 | ||
159 | retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); | 158 | retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); |
160 | if (!retval) | 159 | if (!retval) |
@@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
172 | 171 | ||
173 | INIT_LIST_HEAD(&retval->pools); | 172 | INIT_LIST_HEAD(&retval->pools); |
174 | 173 | ||
174 | /* | ||
175 | * pools_lock ensures that the ->dma_pools list does not get corrupted. | ||
176 | * pools_reg_lock ensures that there is not a race between | ||
177 | * dma_pool_create() and dma_pool_destroy() or within dma_pool_create() | ||
178 | * when the first invocation of dma_pool_create() failed on | ||
179 | * device_create_file() and the second assumes that it has been done (I | ||
180 | * know it is a short window). | ||
181 | */ | ||
182 | mutex_lock(&pools_reg_lock); | ||
175 | mutex_lock(&pools_lock); | 183 | mutex_lock(&pools_lock); |
176 | if (list_empty(&dev->dma_pools) && | 184 | if (list_empty(&dev->dma_pools)) |
177 | device_create_file(dev, &dev_attr_pools)) { | 185 | empty = true; |
178 | kfree(retval); | 186 | list_add(&retval->pools, &dev->dma_pools); |
179 | return NULL; | ||
180 | } else | ||
181 | list_add(&retval->pools, &dev->dma_pools); | ||
182 | mutex_unlock(&pools_lock); | 187 | mutex_unlock(&pools_lock); |
183 | 188 | if (empty) { | |
189 | int err; | ||
190 | |||
191 | err = device_create_file(dev, &dev_attr_pools); | ||
192 | if (err) { | ||
193 | mutex_lock(&pools_lock); | ||
194 | list_del(&retval->pools); | ||
195 | mutex_unlock(&pools_lock); | ||
196 | mutex_unlock(&pools_reg_lock); | ||
197 | kfree(retval); | ||
198 | return NULL; | ||
199 | } | ||
200 | } | ||
201 | mutex_unlock(&pools_reg_lock); | ||
184 | return retval; | 202 | return retval; |
185 | } | 203 | } |
186 | EXPORT_SYMBOL(dma_pool_create); | 204 | EXPORT_SYMBOL(dma_pool_create); |
@@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) | |||
251 | */ | 269 | */ |
252 | void dma_pool_destroy(struct dma_pool *pool) | 270 | void dma_pool_destroy(struct dma_pool *pool) |
253 | { | 271 | { |
272 | bool empty = false; | ||
273 | |||
274 | mutex_lock(&pools_reg_lock); | ||
254 | mutex_lock(&pools_lock); | 275 | mutex_lock(&pools_lock); |
255 | list_del(&pool->pools); | 276 | list_del(&pool->pools); |
256 | if (pool->dev && list_empty(&pool->dev->dma_pools)) | 277 | if (pool->dev && list_empty(&pool->dev->dma_pools)) |
257 | device_remove_file(pool->dev, &dev_attr_pools); | 278 | empty = true; |
258 | mutex_unlock(&pools_lock); | 279 | mutex_unlock(&pools_lock); |
280 | if (empty) | ||
281 | device_remove_file(pool->dev, &dev_attr_pools); | ||
282 | mutex_unlock(&pools_reg_lock); | ||
259 | 283 | ||
260 | while (!list_empty(&pool->page_list)) { | 284 | while (!list_empty(&pool->page_list)) { |
261 | struct dma_page *page; | 285 | struct dma_page *page; |
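Editor's note: the dma_pool_create()/dma_pool_destroy() change splits the work into a decision taken under the list lock and a slow sysfs step performed outside it, with pools_reg_lock serialising the whole create/destroy sequence. A small userspace sketch of that shape, using pthread mutexes as stand-ins for the two kernel mutexes:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t reg_lock = PTHREAD_MUTEX_INITIALIZER;  /* ~pools_reg_lock */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* ~pools_lock */
static int nr_pools;

static void pool_create(void)
{
	bool empty = false;

	pthread_mutex_lock(&reg_lock);

	/* Only the list manipulation and the "was it empty?" decision are
	 * made under the inner lock ... */
	pthread_mutex_lock(&list_lock);
	if (nr_pools++ == 0)
		empty = true;
	pthread_mutex_unlock(&list_lock);

	/* ... while the slow step (device_create_file() in the driver code)
	 * runs outside it, protected only by the outer registration lock. */
	if (empty)
		printf("creating the sysfs attribute\n");

	pthread_mutex_unlock(&reg_lock);
}

int main(void)
{
	pool_create();
	pool_create();	/* second call finds the list non-empty and skips it */
	return 0;
}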
diff --git a/mm/filemap.c b/mm/filemap.c index 90effcdf948d..14b4642279f1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -670,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc); | |||
670 | * at a cost of "thundering herd" phenomena during rare hash | 670 | * at a cost of "thundering herd" phenomena during rare hash |
671 | * collisions. | 671 | * collisions. |
672 | */ | 672 | */ |
673 | static wait_queue_head_t *page_waitqueue(struct page *page) | 673 | wait_queue_head_t *page_waitqueue(struct page *page) |
674 | { | 674 | { |
675 | const struct zone *zone = page_zone(page); | 675 | const struct zone *zone = page_zone(page); |
676 | 676 | ||
677 | return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; | 677 | return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; |
678 | } | 678 | } |
679 | 679 | EXPORT_SYMBOL(page_waitqueue); | |
680 | static inline void wake_up_page(struct page *page, int bit) | ||
681 | { | ||
682 | __wake_up_bit(page_waitqueue(page), &page->flags, bit); | ||
683 | } | ||
684 | 680 | ||
685 | void wait_on_page_bit(struct page *page, int bit_nr) | 681 | void wait_on_page_bit(struct page *page, int bit_nr) |
686 | { | 682 | { |
@@ -703,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) | |||
703 | bit_wait_io, TASK_KILLABLE); | 699 | bit_wait_io, TASK_KILLABLE); |
704 | } | 700 | } |
705 | 701 | ||
702 | int wait_on_page_bit_killable_timeout(struct page *page, | ||
703 | int bit_nr, unsigned long timeout) | ||
704 | { | ||
705 | DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); | ||
706 | |||
707 | wait.key.timeout = jiffies + timeout; | ||
708 | if (!test_bit(bit_nr, &page->flags)) | ||
709 | return 0; | ||
710 | return __wait_on_bit(page_waitqueue(page), &wait, | ||
711 | bit_wait_io_timeout, TASK_KILLABLE); | ||
712 | } | ||
713 | EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); | ||
714 | |||
706 | /** | 715 | /** |
707 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue | 716 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue |
708 | * @page: Page defining the wait queue of interest | 717 | * @page: Page defining the wait queue of interest |
@@ -727,7 +736,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); | |||
727 | * | 736 | * |
728 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). | 737 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). |
729 | * Also wakes sleepers in wait_on_page_writeback() because the wakeup | 738 | * Also wakes sleepers in wait_on_page_writeback() because the wakeup |
730 | * mechananism between PageLocked pages and PageWriteback pages is shared. | 739 | * mechanism between PageLocked pages and PageWriteback pages is shared. |
731 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. | 740 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
732 | * | 741 | * |
733 | * The mb is necessary to enforce ordering between the clear_bit and the read | 742 | * The mb is necessary to enforce ordering between the clear_bit and the read |
@@ -1744,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter); | |||
1744 | static int page_cache_read(struct file *file, pgoff_t offset) | 1753 | static int page_cache_read(struct file *file, pgoff_t offset) |
1745 | { | 1754 | { |
1746 | struct address_space *mapping = file->f_mapping; | 1755 | struct address_space *mapping = file->f_mapping; |
1747 | struct page *page; | 1756 | struct page *page; |
1748 | int ret; | 1757 | int ret; |
1749 | 1758 | ||
1750 | do { | 1759 | do { |
@@ -1761,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset) | |||
1761 | page_cache_release(page); | 1770 | page_cache_release(page); |
1762 | 1771 | ||
1763 | } while (ret == AOP_TRUNCATED_PAGE); | 1772 | } while (ret == AOP_TRUNCATED_PAGE); |
1764 | 1773 | ||
1765 | return ret; | 1774 | return ret; |
1766 | } | 1775 | } |
1767 | 1776 | ||
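Editor's note: for the new timed wait added above, a hypothetical caller could look like the sketch below. It assumes the helper is declared via <linux/pagemap.h>, and the choice of PG_writeback and a two-second timeout is purely illustrative; the actual in-tree users live elsewhere.

#include <linux/pagemap.h>
#include <linux/jiffies.h>

/* Wait for writeback to clear, but give up after ~2s or on a fatal signal
 * instead of sleeping unconditionally like wait_on_page_writeback(). */
static int example_wait_for_writeback(struct page *page)
{
	return wait_on_page_bit_killable_timeout(page, PG_writeback, 2 * HZ);
}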
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -10,6 +10,10 @@ | |||
10 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
11 | #include <linux/swapops.h> | 11 | #include <linux/swapops.h> |
12 | 12 | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/rwsem.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | |||
13 | #include "internal.h" | 17 | #include "internal.h" |
14 | 18 | ||
15 | static struct page *no_page_table(struct vm_area_struct *vma, | 19 | static struct page *no_page_table(struct vm_area_struct *vma, |
@@ -281,6 +285,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | |||
281 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | 285 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; |
282 | if (*flags & FOLL_NOWAIT) | 286 | if (*flags & FOLL_NOWAIT) |
283 | fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; | 287 | fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; |
288 | if (*flags & FOLL_TRIED) { | ||
289 | VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); | ||
290 | fault_flags |= FAULT_FLAG_TRIED; | ||
291 | } | ||
284 | 292 | ||
285 | ret = handle_mm_fault(mm, vma, address, fault_flags); | 293 | ret = handle_mm_fault(mm, vma, address, fault_flags); |
286 | if (ret & VM_FAULT_ERROR) { | 294 | if (ret & VM_FAULT_ERROR) { |
@@ -672,3 +680,353 @@ struct page *get_dump_page(unsigned long addr) | |||
672 | return page; | 680 | return page; |
673 | } | 681 | } |
674 | #endif /* CONFIG_ELF_CORE */ | 682 | #endif /* CONFIG_ELF_CORE */ |
683 | |||
684 | /* | ||
685 | * Generic RCU Fast GUP | ||
686 | * | ||
687 | * get_user_pages_fast attempts to pin user pages by walking the page | ||
688 | * tables directly and avoids taking locks. Thus the walker needs to be | ||
689 | * protected from page table pages being freed from under it, and should | ||
690 | * block any THP splits. | ||
691 | * | ||
692 | * One way to achieve this is to have the walker disable interrupts, and | ||
693 | * rely on IPIs from the TLB flushing code blocking before the page table | ||
694 | * pages are freed. This is unsuitable for architectures that do not need | ||
695 | * to broadcast an IPI when invalidating TLBs. | ||
696 | * | ||
697 | * Another way to achieve this is to batch up page table containing pages | ||
698 | * belonging to more than one mm_user, then rcu_sched a callback to free those | ||
699 | * pages. Disabling interrupts will allow the fast_gup walker to both block | ||
700 | * the rcu_sched callback, and an IPI that we broadcast for splitting THPs | ||
701 | * (which is a relatively rare event). The code below adopts this strategy. | ||
702 | * | ||
703 | * Before activating this code, please be aware that the following assumptions | ||
704 | * are currently made: | ||
705 | * | ||
706 | * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free | ||
707 | * pages containing page tables. | ||
708 | * | ||
709 | * *) THP splits will broadcast an IPI, this can be achieved by overriding | ||
710 | * pmdp_splitting_flush. | ||
711 | * | ||
712 | * *) ptes can be read atomically by the architecture. | ||
713 | * | ||
714 | * *) access_ok is sufficient to validate userspace address ranges. | ||
715 | * | ||
716 | * The last two assumptions can be relaxed by the addition of helper functions. | ||
717 | * | ||
718 | * This code is based heavily on the PowerPC implementation by Nick Piggin. | ||
719 | */ | ||
720 | #ifdef CONFIG_HAVE_GENERIC_RCU_GUP | ||
721 | |||
722 | #ifdef __HAVE_ARCH_PTE_SPECIAL | ||
723 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | ||
724 | int write, struct page **pages, int *nr) | ||
725 | { | ||
726 | pte_t *ptep, *ptem; | ||
727 | int ret = 0; | ||
728 | |||
729 | ptem = ptep = pte_offset_map(&pmd, addr); | ||
730 | do { | ||
731 | /* | ||
732 | * In the line below we are assuming that the pte can be read | ||
733 | * atomically. If this is not the case for your architecture, | ||
734 | * please wrap this in a helper function! | ||
735 | * | ||
736 | * for an example see gup_get_pte in arch/x86/mm/gup.c | ||
737 | */ | ||
738 | pte_t pte = ACCESS_ONCE(*ptep); | ||
739 | struct page *page; | ||
740 | |||
741 | /* | ||
742 | * Similar to the PMD case below, NUMA hinting must take slow | ||
743 | * path | ||
744 | */ | ||
745 | if (!pte_present(pte) || pte_special(pte) || | ||
746 | pte_numa(pte) || (write && !pte_write(pte))) | ||
747 | goto pte_unmap; | ||
748 | |||
749 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
750 | page = pte_page(pte); | ||
751 | |||
752 | if (!page_cache_get_speculative(page)) | ||
753 | goto pte_unmap; | ||
754 | |||
755 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
756 | put_page(page); | ||
757 | goto pte_unmap; | ||
758 | } | ||
759 | |||
760 | pages[*nr] = page; | ||
761 | (*nr)++; | ||
762 | |||
763 | } while (ptep++, addr += PAGE_SIZE, addr != end); | ||
764 | |||
765 | ret = 1; | ||
766 | |||
767 | pte_unmap: | ||
768 | pte_unmap(ptem); | ||
769 | return ret; | ||
770 | } | ||
771 | #else | ||
772 | |||
773 | /* | ||
774 | * If we can't determine whether or not a pte is special, then fail immediately | ||
775 | * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not | ||
776 | * to be special. | ||
777 | * | ||
778 | * For a futex to be placed on a THP tail page, get_futex_key requires a | ||
779 | * __get_user_pages_fast implementation that can pin pages. Thus it's still | ||
780 | * useful to have gup_huge_pmd even if we can't operate on ptes. | ||
781 | */ | ||
782 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | ||
783 | int write, struct page **pages, int *nr) | ||
784 | { | ||
785 | return 0; | ||
786 | } | ||
787 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ | ||
788 | |||
789 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | ||
790 | unsigned long end, int write, struct page **pages, int *nr) | ||
791 | { | ||
792 | struct page *head, *page, *tail; | ||
793 | int refs; | ||
794 | |||
795 | if (write && !pmd_write(orig)) | ||
796 | return 0; | ||
797 | |||
798 | refs = 0; | ||
799 | head = pmd_page(orig); | ||
800 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
801 | tail = page; | ||
802 | do { | ||
803 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
804 | pages[*nr] = page; | ||
805 | (*nr)++; | ||
806 | page++; | ||
807 | refs++; | ||
808 | } while (addr += PAGE_SIZE, addr != end); | ||
809 | |||
810 | if (!page_cache_add_speculative(head, refs)) { | ||
811 | *nr -= refs; | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { | ||
816 | *nr -= refs; | ||
817 | while (refs--) | ||
818 | put_page(head); | ||
819 | return 0; | ||
820 | } | ||
821 | |||
822 | /* | ||
823 | * Any tail pages need their mapcount reference taken before we | ||
824 | * return. (This allows the THP code to bump their ref count when | ||
825 | * they are split into base pages). | ||
826 | */ | ||
827 | while (refs--) { | ||
828 | if (PageTail(tail)) | ||
829 | get_huge_page_tail(tail); | ||
830 | tail++; | ||
831 | } | ||
832 | |||
833 | return 1; | ||
834 | } | ||
835 | |||
836 | static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | ||
837 | unsigned long end, int write, struct page **pages, int *nr) | ||
838 | { | ||
839 | struct page *head, *page, *tail; | ||
840 | int refs; | ||
841 | |||
842 | if (write && !pud_write(orig)) | ||
843 | return 0; | ||
844 | |||
845 | refs = 0; | ||
846 | head = pud_page(orig); | ||
847 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
848 | tail = page; | ||
849 | do { | ||
850 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
851 | pages[*nr] = page; | ||
852 | (*nr)++; | ||
853 | page++; | ||
854 | refs++; | ||
855 | } while (addr += PAGE_SIZE, addr != end); | ||
856 | |||
857 | if (!page_cache_add_speculative(head, refs)) { | ||
858 | *nr -= refs; | ||
859 | return 0; | ||
860 | } | ||
861 | |||
862 | if (unlikely(pud_val(orig) != pud_val(*pudp))) { | ||
863 | *nr -= refs; | ||
864 | while (refs--) | ||
865 | put_page(head); | ||
866 | return 0; | ||
867 | } | ||
868 | |||
869 | while (refs--) { | ||
870 | if (PageTail(tail)) | ||
871 | get_huge_page_tail(tail); | ||
872 | tail++; | ||
873 | } | ||
874 | |||
875 | return 1; | ||
876 | } | ||
877 | |||
878 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | ||
879 | int write, struct page **pages, int *nr) | ||
880 | { | ||
881 | unsigned long next; | ||
882 | pmd_t *pmdp; | ||
883 | |||
884 | pmdp = pmd_offset(&pud, addr); | ||
885 | do { | ||
886 | pmd_t pmd = ACCESS_ONCE(*pmdp); | ||
887 | |||
888 | next = pmd_addr_end(addr, end); | ||
889 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
890 | return 0; | ||
891 | |||
892 | if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { | ||
893 | /* | ||
894 | * NUMA hinting faults need to be handled in the GUP | ||
895 | * slowpath for accounting purposes and so that they | ||
896 | * can be serialised against THP migration. | ||
897 | */ | ||
898 | if (pmd_numa(pmd)) | ||
899 | return 0; | ||
900 | |||
901 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, | ||
902 | pages, nr)) | ||
903 | return 0; | ||
904 | |||
905 | } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) | ||
906 | return 0; | ||
907 | } while (pmdp++, addr = next, addr != end); | ||
908 | |||
909 | return 1; | ||
910 | } | ||
911 | |||
912 | static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end, | ||
913 | int write, struct page **pages, int *nr) | ||
914 | { | ||
915 | unsigned long next; | ||
916 | pud_t *pudp; | ||
917 | |||
918 | pudp = pud_offset(pgdp, addr); | ||
919 | do { | ||
920 | pud_t pud = ACCESS_ONCE(*pudp); | ||
921 | |||
922 | next = pud_addr_end(addr, end); | ||
923 | if (pud_none(pud)) | ||
924 | return 0; | ||
925 | if (pud_huge(pud)) { | ||
926 | if (!gup_huge_pud(pud, pudp, addr, next, write, | ||
927 | pages, nr)) | ||
928 | return 0; | ||
929 | } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) | ||
930 | return 0; | ||
931 | } while (pudp++, addr = next, addr != end); | ||
932 | |||
933 | return 1; | ||
934 | } | ||
935 | |||
936 | /* | ||
937 | * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to | ||
938 | * the regular GUP. It will only return non-negative values. | ||
939 | */ | ||
940 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
941 | struct page **pages) | ||
942 | { | ||
943 | struct mm_struct *mm = current->mm; | ||
944 | unsigned long addr, len, end; | ||
945 | unsigned long next, flags; | ||
946 | pgd_t *pgdp; | ||
947 | int nr = 0; | ||
948 | |||
949 | start &= PAGE_MASK; | ||
950 | addr = start; | ||
951 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
952 | end = start + len; | ||
953 | |||
954 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | ||
955 | start, len))) | ||
956 | return 0; | ||
957 | |||
958 | /* | ||
959 | * Disable interrupts. We use the nested form as we can already have | ||
960 | * interrupts disabled by get_futex_key. | ||
961 | * | ||
962 | * With interrupts disabled, we block page table pages from being | ||
963 | * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h | ||
964 | * for more details. | ||
965 | * | ||
966 | * We do not adopt an rcu_read_lock(.) here as we also want to | ||
967 | * block IPIs that come from THPs splitting. | ||
968 | */ | ||
969 | |||
970 | local_irq_save(flags); | ||
971 | pgdp = pgd_offset(mm, addr); | ||
972 | do { | ||
973 | next = pgd_addr_end(addr, end); | ||
974 | if (pgd_none(*pgdp)) | ||
975 | break; | ||
976 | else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr)) | ||
977 | break; | ||
978 | } while (pgdp++, addr = next, addr != end); | ||
979 | local_irq_restore(flags); | ||
980 | |||
981 | return nr; | ||
982 | } | ||
983 | |||
984 | /** | ||
985 | * get_user_pages_fast() - pin user pages in memory | ||
986 | * @start: starting user address | ||
987 | * @nr_pages: number of pages from start to pin | ||
988 | * @write: whether pages will be written to | ||
989 | * @pages: array that receives pointers to the pages pinned. | ||
990 | * Should be at least nr_pages long. | ||
991 | * | ||
992 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
993 | * If not successful, it will fall back to taking the lock and | ||
994 | * calling get_user_pages(). | ||
995 | * | ||
996 | * Returns number of pages pinned. This may be fewer than the number | ||
997 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
998 | * were pinned, returns -errno. | ||
999 | */ | ||
1000 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
1001 | struct page **pages) | ||
1002 | { | ||
1003 | struct mm_struct *mm = current->mm; | ||
1004 | int nr, ret; | ||
1005 | |||
1006 | start &= PAGE_MASK; | ||
1007 | nr = __get_user_pages_fast(start, nr_pages, write, pages); | ||
1008 | ret = nr; | ||
1009 | |||
1010 | if (nr < nr_pages) { | ||
1011 | /* Try to get the remaining pages with get_user_pages */ | ||
1012 | start += nr << PAGE_SHIFT; | ||
1013 | pages += nr; | ||
1014 | |||
1015 | down_read(&mm->mmap_sem); | ||
1016 | ret = get_user_pages(current, mm, start, | ||
1017 | nr_pages - nr, write, 0, pages, NULL); | ||
1018 | up_read(&mm->mmap_sem); | ||
1019 | |||
1020 | /* Have to be a bit careful with return values */ | ||
1021 | if (nr > 0) { | ||
1022 | if (ret < 0) | ||
1023 | ret = nr; | ||
1024 | else | ||
1025 | ret += nr; | ||
1026 | } | ||
1027 | } | ||
1028 | |||
1029 | return ret; | ||
1030 | } | ||
1031 | |||
1032 | #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ | ||
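Editor's note: as a usage illustration of the interface documented above (hypothetical driver-side code, not part of this patch): pin a few pages of a user buffer through the fast path, use them, and drop the references again.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/errno.h>

static int example_pin_user_buffer(unsigned long uaddr)
{
	struct page *pages[4];
	int i, got;

	/* May pin fewer pages than requested; a negative value means
	 * nothing could be pinned at all. */
	got = get_user_pages_fast(uaddr, 4, 1 /* write */, pages);
	if (got <= 0)
		return got ? got : -EINVAL;

	/* ... access the pinned pages, e.g. via kmap() ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);
	return got;
}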
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d9a21d06b862..74c78aa8bc2f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1096 | unsigned long mmun_end; /* For mmu_notifiers */ | 1096 | unsigned long mmun_end; /* For mmu_notifiers */ |
1097 | 1097 | ||
1098 | ptl = pmd_lockptr(mm, pmd); | 1098 | ptl = pmd_lockptr(mm, pmd); |
1099 | VM_BUG_ON(!vma->anon_vma); | 1099 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
1100 | haddr = address & HPAGE_PMD_MASK; | 1100 | haddr = address & HPAGE_PMD_MASK; |
1101 | if (is_huge_zero_pmd(orig_pmd)) | 1101 | if (is_huge_zero_pmd(orig_pmd)) |
1102 | goto alloc; | 1102 | goto alloc; |
@@ -1795,14 +1795,17 @@ static int __split_huge_page_map(struct page *page, | |||
1795 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 1795 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1796 | pte_t *pte, entry; | 1796 | pte_t *pte, entry; |
1797 | BUG_ON(PageCompound(page+i)); | 1797 | BUG_ON(PageCompound(page+i)); |
1798 | /* | ||
1799 | * Note that pmd_numa is not transferred deliberately | ||
1800 | * to avoid any possibility that pte_numa leaks to | ||
1801 | * a PROT_NONE VMA by accident. | ||
1802 | */ | ||
1798 | entry = mk_pte(page + i, vma->vm_page_prot); | 1803 | entry = mk_pte(page + i, vma->vm_page_prot); |
1799 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1804 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1800 | if (!pmd_write(*pmd)) | 1805 | if (!pmd_write(*pmd)) |
1801 | entry = pte_wrprotect(entry); | 1806 | entry = pte_wrprotect(entry); |
1802 | if (!pmd_young(*pmd)) | 1807 | if (!pmd_young(*pmd)) |
1803 | entry = pte_mkold(entry); | 1808 | entry = pte_mkold(entry); |
1804 | if (pmd_numa(*pmd)) | ||
1805 | entry = pte_mknuma(entry); | ||
1806 | pte = pte_offset_map(&_pmd, haddr); | 1809 | pte = pte_offset_map(&_pmd, haddr); |
1807 | BUG_ON(!pte_none(*pte)); | 1810 | BUG_ON(!pte_none(*pte)); |
1808 | set_pte_at(mm, haddr, pte, entry); | 1811 | set_pte_at(mm, haddr, pte, entry); |
@@ -2045,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm) | |||
2045 | return -ENOMEM; | 2048 | return -ENOMEM; |
2046 | 2049 | ||
2047 | /* __khugepaged_exit() must not run from under us */ | 2050 | /* __khugepaged_exit() must not run from under us */ |
2048 | VM_BUG_ON(khugepaged_test_exit(mm)); | 2051 | VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); |
2049 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | 2052 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { |
2050 | free_mm_slot(mm_slot); | 2053 | free_mm_slot(mm_slot); |
2051 | return 0; | 2054 | return 0; |
@@ -2080,7 +2083,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | |||
2080 | if (vma->vm_ops) | 2083 | if (vma->vm_ops) |
2081 | /* khugepaged not yet working on file or special mappings */ | 2084 | /* khugepaged not yet working on file or special mappings */ |
2082 | return 0; | 2085 | return 0; |
2083 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2086 | VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); |
2084 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2087 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2085 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2088 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2086 | if (hstart < hend) | 2089 | if (hstart < hend) |
@@ -2319,23 +2322,17 @@ static struct page | |||
2319 | int node) | 2322 | int node) |
2320 | { | 2323 | { |
2321 | VM_BUG_ON_PAGE(*hpage, *hpage); | 2324 | VM_BUG_ON_PAGE(*hpage, *hpage); |
2325 | |||
2322 | /* | 2326 | /* |
2323 | * Allocate the page while the vma is still valid and under | 2327 | * Before allocating the hugepage, release the mmap_sem read lock. |
2324 | * the mmap_sem read mode so there is no memory allocation | 2328 | * The allocation can take potentially a long time if it involves |
2325 | * later when we take the mmap_sem in write mode. This is more | 2329 | * sync compaction, and we do not need to hold the mmap_sem during |
2326 | * friendly behavior (OTOH it may actually hide bugs) to | 2330 | * that. We will recheck the vma after taking it again in write mode. |
2327 | * filesystems in userland with daemons allocating memory in | ||
2328 | * the userland I/O paths. Allocating memory with the | ||
2329 | * mmap_sem in read mode is good idea also to allow greater | ||
2330 | * scalability. | ||
2331 | */ | 2331 | */ |
2332 | up_read(&mm->mmap_sem); | ||
2333 | |||
2332 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( | 2334 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( |
2333 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); | 2335 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); |
2334 | /* | ||
2335 | * After allocating the hugepage, release the mmap_sem read lock in | ||
2336 | * preparation for taking it in write mode. | ||
2337 | */ | ||
2338 | up_read(&mm->mmap_sem); | ||
2339 | if (unlikely(!*hpage)) { | 2336 | if (unlikely(!*hpage)) { |
2340 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2337 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
2341 | *hpage = ERR_PTR(-ENOMEM); | 2338 | *hpage = ERR_PTR(-ENOMEM); |
@@ -2409,7 +2406,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) | |||
2409 | return false; | 2406 | return false; |
2410 | if (is_vma_temporary_stack(vma)) | 2407 | if (is_vma_temporary_stack(vma)) |
2411 | return false; | 2408 | return false; |
2412 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2409 | VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); |
2413 | return true; | 2410 | return true; |
2414 | } | 2411 | } |
2415 | 2412 | ||
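Editor's note: the khugepaged hunk above drops the read lock before the potentially slow hugepage allocation rather than after it. Below is a hedged sketch of the general shape with a hypothetical helper; the caller is assumed to enter with mmap_sem held for read, and the real revalidation in collapse_huge_page() rechecks the VMA with hugepage_vma_check().

#include <linux/mm.h>
#include <linux/gfp.h>

/* Drop mmap_sem around a slow allocation, then retake it in write mode
 * and revalidate before trusting any state derived under the old lock. */
static struct page *alloc_then_revalidate(struct mm_struct *mm,
					  unsigned long address,
					  unsigned int order)
{
	struct vm_area_struct *vma;
	struct page *page;

	up_read(&mm->mmap_sem);
	page = alloc_pages(GFP_KERNEL, order);	/* may block for a long time */
	if (!page)
		return NULL;

	down_write(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {	/* the VMA went away meanwhile */
		up_write(&mm->mmap_sem);
		__free_pages(page, order);
		return NULL;
	}
	return page;	/* caller is left holding mmap_sem for write */
}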
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eeceeeb09019..9fd722769927 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode) | |||
434 | 434 | ||
435 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | 435 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) |
436 | { | 436 | { |
437 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 437 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
438 | if (vma->vm_flags & VM_MAYSHARE) { | 438 | if (vma->vm_flags & VM_MAYSHARE) { |
439 | struct address_space *mapping = vma->vm_file->f_mapping; | 439 | struct address_space *mapping = vma->vm_file->f_mapping; |
440 | struct inode *inode = mapping->host; | 440 | struct inode *inode = mapping->host; |
@@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | |||
449 | 449 | ||
450 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | 450 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) |
451 | { | 451 | { |
452 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 452 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
453 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); | 453 | VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); |
454 | 454 | ||
455 | set_vma_private_data(vma, (get_vma_private_data(vma) & | 455 | set_vma_private_data(vma, (get_vma_private_data(vma) & |
456 | HPAGE_RESV_MASK) | (unsigned long)map); | 456 | HPAGE_RESV_MASK) | (unsigned long)map); |
@@ -458,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | |||
458 | 458 | ||
459 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) | 459 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) |
460 | { | 460 | { |
461 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 461 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
462 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); | 462 | VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); |
463 | 463 | ||
464 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); | 464 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); |
465 | } | 465 | } |
466 | 466 | ||
467 | static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | 467 | static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) |
468 | { | 468 | { |
469 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 469 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
470 | 470 | ||
471 | return (get_vma_private_data(vma) & flag) != 0; | 471 | return (get_vma_private_data(vma) & flag) != 0; |
472 | } | 472 | } |
@@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | |||
474 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ | 474 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ |
475 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | 475 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
476 | { | 476 | { |
477 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 477 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
478 | if (!(vma->vm_flags & VM_MAYSHARE)) | 478 | if (!(vma->vm_flags & VM_MAYSHARE)) |
479 | vma->vm_private_data = (void *)0; | 479 | vma->vm_private_data = (void *)0; |
480 | } | 480 | } |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9eebfadeeee1..a67c26e0f360 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -217,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | |||
217 | 217 | ||
218 | if (hugetlb_cgroup_disabled()) | 218 | if (hugetlb_cgroup_disabled()) |
219 | return; | 219 | return; |
220 | VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); | 220 | lockdep_assert_held(&hugetlb_lock); |
221 | h_cg = hugetlb_cgroup_from_page(page); | 221 | h_cg = hugetlb_cgroup_from_page(page); |
222 | if (unlikely(!h_cg)) | 222 | if (unlikely(!h_cg)) |
223 | return; | 223 | return; |
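
The hugetlb_cgroup.c hunk swaps a VM_BUG_ON(!spin_is_locked(...)) check for lockdep_assert_held(), which verifies that the current task actually holds the lock (spin_is_locked() only says somebody does) and costs nothing unless lockdep is enabled. A minimal sketch of the idiom, with an illustrative lock and helper:

#include <linux/spinlock.h>
#include <linux/lockdep.h>

static DEFINE_SPINLOCK(example_lock);
static int example_state;

static void update_example_state(int v)
{
	lockdep_assert_held(&example_lock);	/* caller must hold example_lock */
	example_state = v;
}
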
diff --git a/mm/internal.h b/mm/internal.h index a1b651b11c5f..829304090b90 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -142,10 +142,10 @@ struct compact_control { | |||
142 | bool finished_update_migrate; | 142 | bool finished_update_migrate; |
143 | 143 | ||
144 | int order; /* order a direct compactor needs */ | 144 | int order; /* order a direct compactor needs */ |
145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 145 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ |
146 | struct zone *zone; | 146 | struct zone *zone; |
147 | bool contended; /* True if a lock was contended, or | 147 | int contended; /* Signal need_resched() or lock |
148 | * need_resched() true during async | 148 | * contention detected during |
149 | * compaction | 149 | * compaction |
150 | */ | 150 | */ |
151 | }; | 151 | }; |
@@ -154,8 +154,8 @@ unsigned long | |||
154 | isolate_freepages_range(struct compact_control *cc, | 154 | isolate_freepages_range(struct compact_control *cc, |
155 | unsigned long start_pfn, unsigned long end_pfn); | 155 | unsigned long start_pfn, unsigned long end_pfn); |
156 | unsigned long | 156 | unsigned long |
157 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 157 | isolate_migratepages_range(struct compact_control *cc, |
158 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable); | 158 | unsigned long low_pfn, unsigned long end_pfn); |
159 | 159 | ||
160 | #endif | 160 | #endif |
161 | 161 | ||
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
164 | * general, page_zone(page)->lock must be held by the caller to prevent the | 164 | * general, page_zone(page)->lock must be held by the caller to prevent the |
165 | * page from being allocated in parallel and returning garbage as the order. | 165 | * page from being allocated in parallel and returning garbage as the order. |
166 | * If a caller does not hold page_zone(page)->lock, it must guarantee that the | 166 | * If a caller does not hold page_zone(page)->lock, it must guarantee that the |
167 | * page cannot be allocated or merged in parallel. | 167 | * page cannot be allocated or merged in parallel. Alternatively, it must |
168 | * handle invalid values gracefully, and use page_order_unsafe() below. | ||
168 | */ | 169 | */ |
169 | static inline unsigned long page_order(struct page *page) | 170 | static inline unsigned long page_order(struct page *page) |
170 | { | 171 | { |
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page) | |||
172 | return page_private(page); | 173 | return page_private(page); |
173 | } | 174 | } |
174 | 175 | ||
176 | /* | ||
177 | * Like page_order(), but for callers who cannot afford to hold the zone lock. | ||
178 | * PageBuddy() should be checked first by the caller to minimize race window, | ||
179 | * and invalid values must be handled gracefully. | ||
180 | * | ||
181 | * ACCESS_ONCE is used so that if the caller assigns the result into a local | ||
182 | * variable and e.g. tests it for valid range before using, the compiler cannot | ||
183 | * decide to remove the variable and inline the page_private(page) multiple | ||
184 | * times, potentially observing different values in the tests and the actual | ||
185 | * use of the result. | ||
186 | */ | ||
187 | #define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) | ||
188 | |||
175 | static inline bool is_cow_mapping(vm_flags_t flags) | 189 | static inline bool is_cow_mapping(vm_flags_t flags) |
176 | { | 190 | { |
177 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 191 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
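
The page_order_unsafe() comment above describes a lockless-read pattern: read the racy value exactly once, keep it in a local variable, and range-check it before use so the compiler cannot re-read page_private() between the check and the use. A sketch of a caller under those rules (the helper name is illustrative; compaction's migration scanner uses essentially this shape):

static unsigned long skip_over_buddy(struct page *page, unsigned long pfn)
{
	if (PageBuddy(page)) {
		/* One racy read via ACCESS_ONCE(); never re-evaluated. */
		unsigned long freepage_order = page_order_unsafe(page);

		/* Only trust values that are plausible for a buddy page. */
		if (freepage_order > 0 && freepage_order < MAX_ORDER)
			pfn += (1UL << freepage_order) - 1;
	}
	return pfn;
}
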
diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4a5822a586e6..8da581fa9060 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c | |||
@@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, | |||
34 | struct vm_area_struct *parent; | 34 | struct vm_area_struct *parent; |
35 | unsigned long last = vma_last_pgoff(node); | 35 | unsigned long last = vma_last_pgoff(node); |
36 | 36 | ||
37 | VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); | 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); |
38 | 38 | ||
39 | if (!prev->shared.linear.rb.rb_right) { | 39 | if (!prev->shared.linear.rb.rb_right) { |
40 | parent = prev; | 40 | parent = prev; |
diff --git a/mm/iov_iter.c b/mm/iov_iter.c index ab88dc0ea1d3..9a09f2034fcc 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c | |||
@@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction, | |||
310 | EXPORT_SYMBOL(iov_iter_init); | 310 | EXPORT_SYMBOL(iov_iter_init); |
311 | 311 | ||
312 | static ssize_t get_pages_iovec(struct iov_iter *i, | 312 | static ssize_t get_pages_iovec(struct iov_iter *i, |
313 | struct page **pages, unsigned maxpages, | 313 | struct page **pages, size_t maxsize, unsigned maxpages, |
314 | size_t *start) | 314 | size_t *start) |
315 | { | 315 | { |
316 | size_t offset = i->iov_offset; | 316 | size_t offset = i->iov_offset; |
@@ -323,6 +323,8 @@ static ssize_t get_pages_iovec(struct iov_iter *i, | |||
323 | len = iov->iov_len - offset; | 323 | len = iov->iov_len - offset; |
324 | if (len > i->count) | 324 | if (len > i->count) |
325 | len = i->count; | 325 | len = i->count; |
326 | if (len > maxsize) | ||
327 | len = maxsize; | ||
326 | addr = (unsigned long)iov->iov_base + offset; | 328 | addr = (unsigned long)iov->iov_base + offset; |
327 | len += *start = addr & (PAGE_SIZE - 1); | 329 | len += *start = addr & (PAGE_SIZE - 1); |
328 | if (len > maxpages * PAGE_SIZE) | 330 | if (len > maxpages * PAGE_SIZE) |
@@ -588,13 +590,15 @@ static unsigned long alignment_bvec(const struct iov_iter *i) | |||
588 | } | 590 | } |
589 | 591 | ||
590 | static ssize_t get_pages_bvec(struct iov_iter *i, | 592 | static ssize_t get_pages_bvec(struct iov_iter *i, |
591 | struct page **pages, unsigned maxpages, | 593 | struct page **pages, size_t maxsize, unsigned maxpages, |
592 | size_t *start) | 594 | size_t *start) |
593 | { | 595 | { |
594 | const struct bio_vec *bvec = i->bvec; | 596 | const struct bio_vec *bvec = i->bvec; |
595 | size_t len = bvec->bv_len - i->iov_offset; | 597 | size_t len = bvec->bv_len - i->iov_offset; |
596 | if (len > i->count) | 598 | if (len > i->count) |
597 | len = i->count; | 599 | len = i->count; |
600 | if (len > maxsize) | ||
601 | len = maxsize; | ||
598 | /* can't be more than PAGE_SIZE */ | 602 | /* can't be more than PAGE_SIZE */ |
599 | *start = bvec->bv_offset + i->iov_offset; | 603 | *start = bvec->bv_offset + i->iov_offset; |
600 | 604 | ||
@@ -711,13 +715,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) | |||
711 | EXPORT_SYMBOL(iov_iter_alignment); | 715 | EXPORT_SYMBOL(iov_iter_alignment); |
712 | 716 | ||
713 | ssize_t iov_iter_get_pages(struct iov_iter *i, | 717 | ssize_t iov_iter_get_pages(struct iov_iter *i, |
714 | struct page **pages, unsigned maxpages, | 718 | struct page **pages, size_t maxsize, unsigned maxpages, |
715 | size_t *start) | 719 | size_t *start) |
716 | { | 720 | { |
717 | if (i->type & ITER_BVEC) | 721 | if (i->type & ITER_BVEC) |
718 | return get_pages_bvec(i, pages, maxpages, start); | 722 | return get_pages_bvec(i, pages, maxsize, maxpages, start); |
719 | else | 723 | else |
720 | return get_pages_iovec(i, pages, maxpages, start); | 724 | return get_pages_iovec(i, pages, maxsize, maxpages, start); |
721 | } | 725 | } |
722 | EXPORT_SYMBOL(iov_iter_get_pages); | 726 | EXPORT_SYMBOL(iov_iter_get_pages); |
723 | 727 | ||
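
iov_iter_get_pages() gains a maxsize argument, so callers can bound the pinned range in bytes as well as in pages; both the iovec and bvec backends clamp len to maxsize before rounding up to pages. A hedged caller sketch (the function name and 16-page cap are illustrative):

#include <linux/uio.h>

static ssize_t pin_up_to(struct iov_iter *iter, size_t want_bytes,
			 struct page **pages, unsigned int nr_pages)
{
	size_t offset;	/* offset of the data within pages[0] */

	/* Returns roughly how many bytes of the iterator the pinned pages
	 * cover, capped by both want_bytes and nr_pages * PAGE_SIZE. */
	return iov_iter_get_pages(iter, pages, want_bytes, nr_pages, &offset);
}
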
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index fd814fd61319..cab58bb592d8 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/mm_types.h> | 2 | #include <linux/mm_types.h> |
3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
5 | #include "slab.h" | ||
5 | #include <linux/kmemcheck.h> | 6 | #include <linux/kmemcheck.h> |
6 | 7 | ||
7 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) | 8 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) |
@@ -2310,7 +2310,7 @@ static int __init ksm_init(void) | |||
2310 | 2310 | ||
2311 | ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); | 2311 | ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); |
2312 | if (IS_ERR(ksm_thread)) { | 2312 | if (IS_ERR(ksm_thread)) { |
2313 | printk(KERN_ERR "ksm: creating kthread failed\n"); | 2313 | pr_err("ksm: creating kthread failed\n"); |
2314 | err = PTR_ERR(ksm_thread); | 2314 | err = PTR_ERR(ksm_thread); |
2315 | goto out_free; | 2315 | goto out_free; |
2316 | } | 2316 | } |
@@ -2318,7 +2318,7 @@ static int __init ksm_init(void) | |||
2318 | #ifdef CONFIG_SYSFS | 2318 | #ifdef CONFIG_SYSFS |
2319 | err = sysfs_create_group(mm_kobj, &ksm_attr_group); | 2319 | err = sysfs_create_group(mm_kobj, &ksm_attr_group); |
2320 | if (err) { | 2320 | if (err) { |
2321 | printk(KERN_ERR "ksm: register sysfs failed\n"); | 2321 | pr_err("ksm: register sysfs failed\n"); |
2322 | kthread_stop(ksm_thread); | 2322 | kthread_stop(ksm_thread); |
2323 | goto out_free; | 2323 | goto out_free; |
2324 | } | 2324 | } |
diff --git a/mm/memblock.c b/mm/memblock.c index 6d2f219a48b0..6ecb0d937fb5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -192,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, | |||
192 | phys_addr_t align, phys_addr_t start, | 192 | phys_addr_t align, phys_addr_t start, |
193 | phys_addr_t end, int nid) | 193 | phys_addr_t end, int nid) |
194 | { | 194 | { |
195 | int ret; | 195 | phys_addr_t kernel_end, ret; |
196 | phys_addr_t kernel_end; | ||
197 | 196 | ||
198 | /* pump up @end */ | 197 | /* pump up @end */ |
199 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | 198 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) |
@@ -817,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, | |||
817 | if (nid != NUMA_NO_NODE && nid != m_nid) | 816 | if (nid != NUMA_NO_NODE && nid != m_nid) |
818 | continue; | 817 | continue; |
819 | 818 | ||
819 | /* skip hotpluggable memory regions if needed */ | ||
820 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | ||
821 | continue; | ||
822 | |||
820 | if (!type_b) { | 823 | if (!type_b) { |
821 | if (out_start) | 824 | if (out_start) |
822 | *out_start = m_start; | 825 | *out_start = m_start; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ec4dcf1b9562..23976fd885fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -292,6 +292,9 @@ struct mem_cgroup { | |||
292 | /* vmpressure notifications */ | 292 | /* vmpressure notifications */ |
293 | struct vmpressure vmpressure; | 293 | struct vmpressure vmpressure; |
294 | 294 | ||
295 | /* css_online() has been completed */ | ||
296 | int initialized; | ||
297 | |||
295 | /* | 298 | /* |
296 | * the counter to account for mem+swap usage. | 299 | * the counter to account for mem+swap usage. |
297 | */ | 300 | */ |
@@ -315,9 +318,6 @@ struct mem_cgroup { | |||
315 | /* OOM-Killer disable */ | 318 | /* OOM-Killer disable */ |
316 | int oom_kill_disable; | 319 | int oom_kill_disable; |
317 | 320 | ||
318 | /* set when res.limit == memsw.limit */ | ||
319 | bool memsw_is_minimum; | ||
320 | |||
321 | /* protect arrays of thresholds */ | 321 | /* protect arrays of thresholds */ |
322 | struct mutex thresholds_lock; | 322 | struct mutex thresholds_lock; |
323 | 323 | ||
@@ -481,14 +481,6 @@ enum res_type { | |||
481 | #define OOM_CONTROL (0) | 481 | #define OOM_CONTROL (0) |
482 | 482 | ||
483 | /* | 483 | /* |
484 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | ||
485 | */ | ||
486 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | ||
487 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | ||
488 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | ||
489 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | ||
490 | |||
491 | /* | ||
492 | * The memcg_create_mutex will be held whenever a new cgroup is created. | 484 | * The memcg_create_mutex will be held whenever a new cgroup is created. |
493 | * As a consequence, any change that needs to protect against new child cgroups | 485 | * As a consequence, any change that needs to protect against new child cgroups |
494 | * appearing has to hold it as well. | 486 | * appearing has to hold it as well. |
@@ -646,11 +638,13 @@ int memcg_limited_groups_array_size; | |||
646 | struct static_key memcg_kmem_enabled_key; | 638 | struct static_key memcg_kmem_enabled_key; |
647 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | 639 | EXPORT_SYMBOL(memcg_kmem_enabled_key); |
648 | 640 | ||
641 | static void memcg_free_cache_id(int id); | ||
642 | |||
649 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | 643 | static void disarm_kmem_keys(struct mem_cgroup *memcg) |
650 | { | 644 | { |
651 | if (memcg_kmem_is_active(memcg)) { | 645 | if (memcg_kmem_is_active(memcg)) { |
652 | static_key_slow_dec(&memcg_kmem_enabled_key); | 646 | static_key_slow_dec(&memcg_kmem_enabled_key); |
653 | ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); | 647 | memcg_free_cache_id(memcg->kmemcg_id); |
654 | } | 648 | } |
655 | /* | 649 | /* |
656 | * This check can't live in kmem destruction function, | 650 | * This check can't live in kmem destruction function, |
@@ -1099,10 +1093,21 @@ skip_node: | |||
1099 | * skipping css reference should be safe. | 1093 | * skipping css reference should be safe. |
1100 | */ | 1094 | */ |
1101 | if (next_css) { | 1095 | if (next_css) { |
1102 | if ((next_css == &root->css) || | 1096 | struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); |
1103 | ((next_css->flags & CSS_ONLINE) && | 1097 | |
1104 | css_tryget_online(next_css))) | 1098 | if (next_css == &root->css) |
1105 | return mem_cgroup_from_css(next_css); | 1099 | return memcg; |
1100 | |||
1101 | if (css_tryget_online(next_css)) { | ||
1102 | /* | ||
1103 | * Make sure the memcg is initialized: | ||
1104 | * mem_cgroup_css_online() orders the the | ||
1105 | * initialization against setting the flag. | ||
1106 | */ | ||
1107 | if (smp_load_acquire(&memcg->initialized)) | ||
1108 | return memcg; | ||
1109 | css_put(next_css); | ||
1110 | } | ||
1106 | 1111 | ||
1107 | prev_css = next_css; | 1112 | prev_css = next_css; |
1108 | goto skip_node; | 1113 | goto skip_node; |
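
The mem_cgroup_iter() change pairs with the css_online() hunk later in this file: the memcg is fully set up first, then published with smp_store_release(&memcg->initialized, 1), and the iterator only returns a memcg after smp_load_acquire() observes that flag. A stripped-down sketch of the publish/consume pairing (the struct and helpers are illustrative, not the memcg code itself):

struct published_obj {
	int payload;
	int initialized;	/* written last, with release semantics */
};

static void obj_online(struct published_obj *obj)
{
	obj->payload = 42;			/* all initialization first */
	smp_store_release(&obj->initialized, 1); /* then publish */
}

static int obj_consume(struct published_obj *obj)
{
	if (!smp_load_acquire(&obj->initialized))
		return -EAGAIN;		/* not published yet, skip it */
	return obj->payload;		/* acquire/release guarantees init is visible */
}
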
@@ -1792,42 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1792 | NULL, "Memory cgroup out of memory"); | 1797 | NULL, "Memory cgroup out of memory"); |
1793 | } | 1798 | } |
1794 | 1799 | ||
1795 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | ||
1796 | gfp_t gfp_mask, | ||
1797 | unsigned long flags) | ||
1798 | { | ||
1799 | unsigned long total = 0; | ||
1800 | bool noswap = false; | ||
1801 | int loop; | ||
1802 | |||
1803 | if (flags & MEM_CGROUP_RECLAIM_NOSWAP) | ||
1804 | noswap = true; | ||
1805 | if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) | ||
1806 | noswap = true; | ||
1807 | |||
1808 | for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { | ||
1809 | if (loop) | ||
1810 | drain_all_stock_async(memcg); | ||
1811 | total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); | ||
1812 | /* | ||
1813 | * Allow limit shrinkers, which are triggered directly | ||
1814 | * by userspace, to catch signals and stop reclaim | ||
1815 | * after minimal progress, regardless of the margin. | ||
1816 | */ | ||
1817 | if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) | ||
1818 | break; | ||
1819 | if (mem_cgroup_margin(memcg)) | ||
1820 | break; | ||
1821 | /* | ||
1822 | * If nothing was reclaimed after two attempts, there | ||
1823 | * may be no reclaimable pages in this hierarchy. | ||
1824 | */ | ||
1825 | if (loop && !total) | ||
1826 | break; | ||
1827 | } | ||
1828 | return total; | ||
1829 | } | ||
1830 | |||
1831 | /** | 1800 | /** |
1832 | * test_mem_cgroup_node_reclaimable | 1801 | * test_mem_cgroup_node_reclaimable |
1833 | * @memcg: the target memcg | 1802 | * @memcg: the target memcg |
@@ -2530,25 +2499,29 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2530 | struct mem_cgroup *mem_over_limit; | 2499 | struct mem_cgroup *mem_over_limit; |
2531 | struct res_counter *fail_res; | 2500 | struct res_counter *fail_res; |
2532 | unsigned long nr_reclaimed; | 2501 | unsigned long nr_reclaimed; |
2533 | unsigned long flags = 0; | ||
2534 | unsigned long long size; | 2502 | unsigned long long size; |
2503 | bool may_swap = true; | ||
2504 | bool drained = false; | ||
2535 | int ret = 0; | 2505 | int ret = 0; |
2536 | 2506 | ||
2507 | if (mem_cgroup_is_root(memcg)) | ||
2508 | goto done; | ||
2537 | retry: | 2509 | retry: |
2538 | if (consume_stock(memcg, nr_pages)) | 2510 | if (consume_stock(memcg, nr_pages)) |
2539 | goto done; | 2511 | goto done; |
2540 | 2512 | ||
2541 | size = batch * PAGE_SIZE; | 2513 | size = batch * PAGE_SIZE; |
2542 | if (!res_counter_charge(&memcg->res, size, &fail_res)) { | 2514 | if (!do_swap_account || |
2543 | if (!do_swap_account) | 2515 | !res_counter_charge(&memcg->memsw, size, &fail_res)) { |
2544 | goto done_restock; | 2516 | if (!res_counter_charge(&memcg->res, size, &fail_res)) |
2545 | if (!res_counter_charge(&memcg->memsw, size, &fail_res)) | ||
2546 | goto done_restock; | 2517 | goto done_restock; |
2547 | res_counter_uncharge(&memcg->res, size); | 2518 | if (do_swap_account) |
2548 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 2519 | res_counter_uncharge(&memcg->memsw, size); |
2549 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | ||
2550 | } else | ||
2551 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2520 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
2521 | } else { | ||
2522 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | ||
2523 | may_swap = false; | ||
2524 | } | ||
2552 | 2525 | ||
2553 | if (batch > nr_pages) { | 2526 | if (batch > nr_pages) { |
2554 | batch = nr_pages; | 2527 | batch = nr_pages; |
@@ -2572,11 +2545,18 @@ retry: | |||
2572 | if (!(gfp_mask & __GFP_WAIT)) | 2545 | if (!(gfp_mask & __GFP_WAIT)) |
2573 | goto nomem; | 2546 | goto nomem; |
2574 | 2547 | ||
2575 | nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); | 2548 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
2549 | gfp_mask, may_swap); | ||
2576 | 2550 | ||
2577 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2551 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2578 | goto retry; | 2552 | goto retry; |
2579 | 2553 | ||
2554 | if (!drained) { | ||
2555 | drain_all_stock_async(mem_over_limit); | ||
2556 | drained = true; | ||
2557 | goto retry; | ||
2558 | } | ||
2559 | |||
2580 | if (gfp_mask & __GFP_NORETRY) | 2560 | if (gfp_mask & __GFP_NORETRY) |
2581 | goto nomem; | 2561 | goto nomem; |
2582 | /* | 2562 | /* |
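
Taken together, the try_charge() hunks above replace the MEM_CGROUP_RECLAIM_* flag plumbing with local state: charge memsw first and res second, drop may_swap when the memsw limit is what failed (reclaiming into swap cannot help there), and retry once after draining the per-cpu charge stock. A condensed reading of the new flow, with the error and OOM paths omitted:

	bool may_swap = true, drained = false;
retry:
	if (!do_swap_account ||
	    !res_counter_charge(&memcg->memsw, size, &fail_res)) {
		if (!res_counter_charge(&memcg->res, size, &fail_res))
			goto done_restock;			/* success */
		if (do_swap_account)
			res_counter_uncharge(&memcg->memsw, size);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
	} else {
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		may_swap = false;
	}

	try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap);
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;
	if (!drained) {
		drain_all_stock_async(mem_over_limit);
		drained = true;
		goto retry;
	}
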
@@ -2611,9 +2591,7 @@ nomem: | |||
2611 | if (!(gfp_mask & __GFP_NOFAIL)) | 2591 | if (!(gfp_mask & __GFP_NOFAIL)) |
2612 | return -ENOMEM; | 2592 | return -ENOMEM; |
2613 | bypass: | 2593 | bypass: |
2614 | memcg = root_mem_cgroup; | 2594 | return -EINTR; |
2615 | ret = -EINTR; | ||
2616 | goto retry; | ||
2617 | 2595 | ||
2618 | done_restock: | 2596 | done_restock: |
2619 | if (batch > nr_pages) | 2597 | if (batch > nr_pages) |
@@ -2626,6 +2604,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
2626 | { | 2604 | { |
2627 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2605 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2628 | 2606 | ||
2607 | if (mem_cgroup_is_root(memcg)) | ||
2608 | return; | ||
2609 | |||
2629 | res_counter_uncharge(&memcg->res, bytes); | 2610 | res_counter_uncharge(&memcg->res, bytes); |
2630 | if (do_swap_account) | 2611 | if (do_swap_account) |
2631 | res_counter_uncharge(&memcg->memsw, bytes); | 2612 | res_counter_uncharge(&memcg->memsw, bytes); |
@@ -2640,6 +2621,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | |||
2640 | { | 2621 | { |
2641 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2622 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2642 | 2623 | ||
2624 | if (mem_cgroup_is_root(memcg)) | ||
2625 | return; | ||
2626 | |||
2643 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); | 2627 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); |
2644 | if (do_swap_account) | 2628 | if (do_swap_account) |
2645 | res_counter_uncharge_until(&memcg->memsw, | 2629 | res_counter_uncharge_until(&memcg->memsw, |
@@ -2778,12 +2762,6 @@ static DEFINE_MUTEX(memcg_slab_mutex); | |||
2778 | 2762 | ||
2779 | static DEFINE_MUTEX(activate_kmem_mutex); | 2763 | static DEFINE_MUTEX(activate_kmem_mutex); |
2780 | 2764 | ||
2781 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | ||
2782 | { | ||
2783 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | ||
2784 | memcg_kmem_is_active(memcg); | ||
2785 | } | ||
2786 | |||
2787 | /* | 2765 | /* |
2788 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | 2766 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer |
2789 | * in the memcg_cache_params struct. | 2767 | * in the memcg_cache_params struct. |
@@ -2803,7 +2781,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | |||
2803 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 2781 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
2804 | struct memcg_cache_params *params; | 2782 | struct memcg_cache_params *params; |
2805 | 2783 | ||
2806 | if (!memcg_can_account_kmem(memcg)) | 2784 | if (!memcg_kmem_is_active(memcg)) |
2807 | return -EIO; | 2785 | return -EIO; |
2808 | 2786 | ||
2809 | print_slabinfo_header(m); | 2787 | print_slabinfo_header(m); |
@@ -2886,19 +2864,44 @@ int memcg_cache_id(struct mem_cgroup *memcg) | |||
2886 | return memcg ? memcg->kmemcg_id : -1; | 2864 | return memcg ? memcg->kmemcg_id : -1; |
2887 | } | 2865 | } |
2888 | 2866 | ||
2889 | static size_t memcg_caches_array_size(int num_groups) | 2867 | static int memcg_alloc_cache_id(void) |
2890 | { | 2868 | { |
2891 | ssize_t size; | 2869 | int id, size; |
2892 | if (num_groups <= 0) | 2870 | int err; |
2893 | return 0; | 2871 | |
2872 | id = ida_simple_get(&kmem_limited_groups, | ||
2873 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
2874 | if (id < 0) | ||
2875 | return id; | ||
2894 | 2876 | ||
2895 | size = 2 * num_groups; | 2877 | if (id < memcg_limited_groups_array_size) |
2878 | return id; | ||
2879 | |||
2880 | /* | ||
2881 | * There's no space for the new id in memcg_caches arrays, | ||
2882 | * so we have to grow them. | ||
2883 | */ | ||
2884 | |||
2885 | size = 2 * (id + 1); | ||
2896 | if (size < MEMCG_CACHES_MIN_SIZE) | 2886 | if (size < MEMCG_CACHES_MIN_SIZE) |
2897 | size = MEMCG_CACHES_MIN_SIZE; | 2887 | size = MEMCG_CACHES_MIN_SIZE; |
2898 | else if (size > MEMCG_CACHES_MAX_SIZE) | 2888 | else if (size > MEMCG_CACHES_MAX_SIZE) |
2899 | size = MEMCG_CACHES_MAX_SIZE; | 2889 | size = MEMCG_CACHES_MAX_SIZE; |
2900 | 2890 | ||
2901 | return size; | 2891 | mutex_lock(&memcg_slab_mutex); |
2892 | err = memcg_update_all_caches(size); | ||
2893 | mutex_unlock(&memcg_slab_mutex); | ||
2894 | |||
2895 | if (err) { | ||
2896 | ida_simple_remove(&kmem_limited_groups, id); | ||
2897 | return err; | ||
2898 | } | ||
2899 | return id; | ||
2900 | } | ||
2901 | |||
2902 | static void memcg_free_cache_id(int id) | ||
2903 | { | ||
2904 | ida_simple_remove(&kmem_limited_groups, id); | ||
2902 | } | 2905 | } |
2903 | 2906 | ||
2904 | /* | 2907 | /* |
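
memcg_alloc_cache_id() now hides the whole ID lifecycle: take an ID from the IDA, and if it does not fit the current memcg_caches arrays, grow every root cache under memcg_slab_mutex and roll the ID back on failure; memcg_free_cache_id() is the matching release. A caller sketch, roughly what __memcg_activate_kmem() becomes later in this patch:

	int id = memcg_alloc_cache_id();

	if (id < 0)
		return id;		/* IDA failure or -ENOMEM from the resize */

	memcg->kmemcg_id = id;
	/* ... and on the teardown side ... */
	memcg_free_cache_id(memcg->kmemcg_id);
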
@@ -2908,97 +2911,7 @@ static size_t memcg_caches_array_size(int num_groups) | |||
2908 | */ | 2911 | */ |
2909 | void memcg_update_array_size(int num) | 2912 | void memcg_update_array_size(int num) |
2910 | { | 2913 | { |
2911 | if (num > memcg_limited_groups_array_size) | 2914 | memcg_limited_groups_array_size = num; |
2912 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | ||
2913 | } | ||
2914 | |||
2915 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | ||
2916 | { | ||
2917 | struct memcg_cache_params *cur_params = s->memcg_params; | ||
2918 | |||
2919 | VM_BUG_ON(!is_root_cache(s)); | ||
2920 | |||
2921 | if (num_groups > memcg_limited_groups_array_size) { | ||
2922 | int i; | ||
2923 | struct memcg_cache_params *new_params; | ||
2924 | ssize_t size = memcg_caches_array_size(num_groups); | ||
2925 | |||
2926 | size *= sizeof(void *); | ||
2927 | size += offsetof(struct memcg_cache_params, memcg_caches); | ||
2928 | |||
2929 | new_params = kzalloc(size, GFP_KERNEL); | ||
2930 | if (!new_params) | ||
2931 | return -ENOMEM; | ||
2932 | |||
2933 | new_params->is_root_cache = true; | ||
2934 | |||
2935 | /* | ||
2936 | * There is the chance it will be bigger than | ||
2937 | * memcg_limited_groups_array_size, if we failed an allocation | ||
2938 | * in a cache, in which case all caches updated before it, will | ||
2939 | * have a bigger array. | ||
2940 | * | ||
2941 | * But if that is the case, the data after | ||
2942 | * memcg_limited_groups_array_size is certainly unused | ||
2943 | */ | ||
2944 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
2945 | if (!cur_params->memcg_caches[i]) | ||
2946 | continue; | ||
2947 | new_params->memcg_caches[i] = | ||
2948 | cur_params->memcg_caches[i]; | ||
2949 | } | ||
2950 | |||
2951 | /* | ||
2952 | * Ideally, we would wait until all caches succeed, and only | ||
2953 | * then free the old one. But this is not worth the extra | ||
2954 | * pointer per-cache we'd have to have for this. | ||
2955 | * | ||
2956 | * It is not a big deal if some caches are left with a size | ||
2957 | * bigger than the others. And all updates will reset this | ||
2958 | * anyway. | ||
2959 | */ | ||
2960 | rcu_assign_pointer(s->memcg_params, new_params); | ||
2961 | if (cur_params) | ||
2962 | kfree_rcu(cur_params, rcu_head); | ||
2963 | } | ||
2964 | return 0; | ||
2965 | } | ||
2966 | |||
2967 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | ||
2968 | struct kmem_cache *root_cache) | ||
2969 | { | ||
2970 | size_t size; | ||
2971 | |||
2972 | if (!memcg_kmem_enabled()) | ||
2973 | return 0; | ||
2974 | |||
2975 | if (!memcg) { | ||
2976 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
2977 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
2978 | } else | ||
2979 | size = sizeof(struct memcg_cache_params); | ||
2980 | |||
2981 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
2982 | if (!s->memcg_params) | ||
2983 | return -ENOMEM; | ||
2984 | |||
2985 | if (memcg) { | ||
2986 | s->memcg_params->memcg = memcg; | ||
2987 | s->memcg_params->root_cache = root_cache; | ||
2988 | css_get(&memcg->css); | ||
2989 | } else | ||
2990 | s->memcg_params->is_root_cache = true; | ||
2991 | |||
2992 | return 0; | ||
2993 | } | ||
2994 | |||
2995 | void memcg_free_cache_params(struct kmem_cache *s) | ||
2996 | { | ||
2997 | if (!s->memcg_params) | ||
2998 | return; | ||
2999 | if (!s->memcg_params->is_root_cache) | ||
3000 | css_put(&s->memcg_params->memcg->css); | ||
3001 | kfree(s->memcg_params); | ||
3002 | } | 2915 | } |
3003 | 2916 | ||
3004 | static void memcg_register_cache(struct mem_cgroup *memcg, | 2917 | static void memcg_register_cache(struct mem_cgroup *memcg, |
@@ -3031,6 +2944,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, | |||
3031 | if (!cachep) | 2944 | if (!cachep) |
3032 | return; | 2945 | return; |
3033 | 2946 | ||
2947 | css_get(&memcg->css); | ||
3034 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | 2948 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
3035 | 2949 | ||
3036 | /* | 2950 | /* |
@@ -3064,6 +2978,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) | |||
3064 | list_del(&cachep->memcg_params->list); | 2978 | list_del(&cachep->memcg_params->list); |
3065 | 2979 | ||
3066 | kmem_cache_destroy(cachep); | 2980 | kmem_cache_destroy(cachep); |
2981 | |||
2982 | /* drop the reference taken in memcg_register_cache */ | ||
2983 | css_put(&memcg->css); | ||
3067 | } | 2984 | } |
3068 | 2985 | ||
3069 | /* | 2986 | /* |
@@ -3241,7 +3158,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3241 | rcu_read_lock(); | 3158 | rcu_read_lock(); |
3242 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | 3159 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); |
3243 | 3160 | ||
3244 | if (!memcg_can_account_kmem(memcg)) | 3161 | if (!memcg_kmem_is_active(memcg)) |
3245 | goto out; | 3162 | goto out; |
3246 | 3163 | ||
3247 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 3164 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); |
@@ -3326,7 +3243,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3326 | 3243 | ||
3327 | memcg = get_mem_cgroup_from_mm(current->mm); | 3244 | memcg = get_mem_cgroup_from_mm(current->mm); |
3328 | 3245 | ||
3329 | if (!memcg_can_account_kmem(memcg)) { | 3246 | if (!memcg_kmem_is_active(memcg)) { |
3330 | css_put(&memcg->css); | 3247 | css_put(&memcg->css); |
3331 | return true; | 3248 | return true; |
3332 | } | 3249 | } |
@@ -3668,7 +3585,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3668 | unsigned long long val) | 3585 | unsigned long long val) |
3669 | { | 3586 | { |
3670 | int retry_count; | 3587 | int retry_count; |
3671 | u64 memswlimit, memlimit; | ||
3672 | int ret = 0; | 3588 | int ret = 0; |
3673 | int children = mem_cgroup_count_children(memcg); | 3589 | int children = mem_cgroup_count_children(memcg); |
3674 | u64 curusage, oldusage; | 3590 | u64 curusage, oldusage; |
@@ -3695,31 +3611,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3695 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3611 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3696 | */ | 3612 | */ |
3697 | mutex_lock(&set_limit_mutex); | 3613 | mutex_lock(&set_limit_mutex); |
3698 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3614 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { |
3699 | if (memswlimit < val) { | ||
3700 | ret = -EINVAL; | 3615 | ret = -EINVAL; |
3701 | mutex_unlock(&set_limit_mutex); | 3616 | mutex_unlock(&set_limit_mutex); |
3702 | break; | 3617 | break; |
3703 | } | 3618 | } |
3704 | 3619 | ||
3705 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3620 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) |
3706 | if (memlimit < val) | ||
3707 | enlarge = 1; | 3621 | enlarge = 1; |
3708 | 3622 | ||
3709 | ret = res_counter_set_limit(&memcg->res, val); | 3623 | ret = res_counter_set_limit(&memcg->res, val); |
3710 | if (!ret) { | ||
3711 | if (memswlimit == val) | ||
3712 | memcg->memsw_is_minimum = true; | ||
3713 | else | ||
3714 | memcg->memsw_is_minimum = false; | ||
3715 | } | ||
3716 | mutex_unlock(&set_limit_mutex); | 3624 | mutex_unlock(&set_limit_mutex); |
3717 | 3625 | ||
3718 | if (!ret) | 3626 | if (!ret) |
3719 | break; | 3627 | break; |
3720 | 3628 | ||
3721 | mem_cgroup_reclaim(memcg, GFP_KERNEL, | 3629 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); |
3722 | MEM_CGROUP_RECLAIM_SHRINK); | 3630 | |
3723 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3631 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
3724 | /* Usage is reduced ? */ | 3632 | /* Usage is reduced ? */ |
3725 | if (curusage >= oldusage) | 3633 | if (curusage >= oldusage) |
@@ -3737,7 +3645,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3737 | unsigned long long val) | 3645 | unsigned long long val) |
3738 | { | 3646 | { |
3739 | int retry_count; | 3647 | int retry_count; |
3740 | u64 memlimit, memswlimit, oldusage, curusage; | 3648 | u64 oldusage, curusage; |
3741 | int children = mem_cgroup_count_children(memcg); | 3649 | int children = mem_cgroup_count_children(memcg); |
3742 | int ret = -EBUSY; | 3650 | int ret = -EBUSY; |
3743 | int enlarge = 0; | 3651 | int enlarge = 0; |
@@ -3756,30 +3664,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3756 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3664 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3757 | */ | 3665 | */ |
3758 | mutex_lock(&set_limit_mutex); | 3666 | mutex_lock(&set_limit_mutex); |
3759 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3667 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { |
3760 | if (memlimit > val) { | ||
3761 | ret = -EINVAL; | 3668 | ret = -EINVAL; |
3762 | mutex_unlock(&set_limit_mutex); | 3669 | mutex_unlock(&set_limit_mutex); |
3763 | break; | 3670 | break; |
3764 | } | 3671 | } |
3765 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3672 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) |
3766 | if (memswlimit < val) | ||
3767 | enlarge = 1; | 3673 | enlarge = 1; |
3768 | ret = res_counter_set_limit(&memcg->memsw, val); | 3674 | ret = res_counter_set_limit(&memcg->memsw, val); |
3769 | if (!ret) { | ||
3770 | if (memlimit == val) | ||
3771 | memcg->memsw_is_minimum = true; | ||
3772 | else | ||
3773 | memcg->memsw_is_minimum = false; | ||
3774 | } | ||
3775 | mutex_unlock(&set_limit_mutex); | 3675 | mutex_unlock(&set_limit_mutex); |
3776 | 3676 | ||
3777 | if (!ret) | 3677 | if (!ret) |
3778 | break; | 3678 | break; |
3779 | 3679 | ||
3780 | mem_cgroup_reclaim(memcg, GFP_KERNEL, | 3680 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); |
3781 | MEM_CGROUP_RECLAIM_NOSWAP | | 3681 | |
3782 | MEM_CGROUP_RECLAIM_SHRINK); | ||
3783 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3682 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3784 | /* Usage is reduced ? */ | 3683 | /* Usage is reduced ? */ |
3785 | if (curusage >= oldusage) | 3684 | if (curusage >= oldusage) |
@@ -4028,8 +3927,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | |||
4028 | if (signal_pending(current)) | 3927 | if (signal_pending(current)) |
4029 | return -EINTR; | 3928 | return -EINTR; |
4030 | 3929 | ||
4031 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, | 3930 | progress = try_to_free_mem_cgroup_pages(memcg, 1, |
4032 | false); | 3931 | GFP_KERNEL, true); |
4033 | if (!progress) { | 3932 | if (!progress) { |
4034 | nr_retries--; | 3933 | nr_retries--; |
4035 | /* maybe some writeback is necessary */ | 3934 | /* maybe some writeback is necessary */ |
@@ -4093,6 +3992,46 @@ out: | |||
4093 | return retval; | 3992 | return retval; |
4094 | } | 3993 | } |
4095 | 3994 | ||
3995 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, | ||
3996 | enum mem_cgroup_stat_index idx) | ||
3997 | { | ||
3998 | struct mem_cgroup *iter; | ||
3999 | long val = 0; | ||
4000 | |||
4001 | /* Per-cpu values can be negative, use a signed accumulator */ | ||
4002 | for_each_mem_cgroup_tree(iter, memcg) | ||
4003 | val += mem_cgroup_read_stat(iter, idx); | ||
4004 | |||
4005 | if (val < 0) /* race ? */ | ||
4006 | val = 0; | ||
4007 | return val; | ||
4008 | } | ||
4009 | |||
4010 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | ||
4011 | { | ||
4012 | u64 val; | ||
4013 | |||
4014 | if (!mem_cgroup_is_root(memcg)) { | ||
4015 | if (!swap) | ||
4016 | return res_counter_read_u64(&memcg->res, RES_USAGE); | ||
4017 | else | ||
4018 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
4019 | } | ||
4020 | |||
4021 | /* | ||
4022 | * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS | ||
4023 | * as well as in MEM_CGROUP_STAT_RSS_HUGE. | ||
4024 | */ | ||
4025 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
4026 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
4027 | |||
4028 | if (swap) | ||
4029 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); | ||
4030 | |||
4031 | return val << PAGE_SHIFT; | ||
4032 | } | ||
4033 | |||
4034 | |||
4096 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | 4035 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
4097 | struct cftype *cft) | 4036 | struct cftype *cft) |
4098 | { | 4037 | { |
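
With the root memcg no longer charged through res_counters (see the mem_cgroup_is_root() bail-outs elsewhere in this patch), its usage has to be derived from the recursive statistics instead of RES_USAGE. The helper above boils down to the following, where recursive() abbreviates mem_cgroup_recursive_stat():

	/* root memcg only; non-root memcgs still read the res_counter */
	usage  = recursive(MEM_CGROUP_STAT_CACHE);	/* page cache */
	usage += recursive(MEM_CGROUP_STAT_RSS);	/* anon, including THP */
	if (swap)
		usage += recursive(MEM_CGROUP_STAT_SWAP);
	usage <<= PAGE_SHIFT;				/* pages -> bytes */
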
@@ -4102,8 +4041,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | |||
4102 | 4041 | ||
4103 | switch (type) { | 4042 | switch (type) { |
4104 | case _MEM: | 4043 | case _MEM: |
4044 | if (name == RES_USAGE) | ||
4045 | return mem_cgroup_usage(memcg, false); | ||
4105 | return res_counter_read_u64(&memcg->res, name); | 4046 | return res_counter_read_u64(&memcg->res, name); |
4106 | case _MEMSWAP: | 4047 | case _MEMSWAP: |
4048 | if (name == RES_USAGE) | ||
4049 | return mem_cgroup_usage(memcg, true); | ||
4107 | return res_counter_read_u64(&memcg->memsw, name); | 4050 | return res_counter_read_u64(&memcg->memsw, name); |
4108 | case _KMEM: | 4051 | case _KMEM: |
4109 | return res_counter_read_u64(&memcg->kmem, name); | 4052 | return res_counter_read_u64(&memcg->kmem, name); |
@@ -4150,23 +4093,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
4150 | if (err) | 4093 | if (err) |
4151 | goto out; | 4094 | goto out; |
4152 | 4095 | ||
4153 | memcg_id = ida_simple_get(&kmem_limited_groups, | 4096 | memcg_id = memcg_alloc_cache_id(); |
4154 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
4155 | if (memcg_id < 0) { | 4097 | if (memcg_id < 0) { |
4156 | err = memcg_id; | 4098 | err = memcg_id; |
4157 | goto out; | 4099 | goto out; |
4158 | } | 4100 | } |
4159 | 4101 | ||
4160 | /* | ||
4161 | * Make sure we have enough space for this cgroup in each root cache's | ||
4162 | * memcg_params. | ||
4163 | */ | ||
4164 | mutex_lock(&memcg_slab_mutex); | ||
4165 | err = memcg_update_all_caches(memcg_id + 1); | ||
4166 | mutex_unlock(&memcg_slab_mutex); | ||
4167 | if (err) | ||
4168 | goto out_rmid; | ||
4169 | |||
4170 | memcg->kmemcg_id = memcg_id; | 4102 | memcg->kmemcg_id = memcg_id; |
4171 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | 4103 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); |
4172 | 4104 | ||
@@ -4187,10 +4119,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
4187 | out: | 4119 | out: |
4188 | memcg_resume_kmem_account(); | 4120 | memcg_resume_kmem_account(); |
4189 | return err; | 4121 | return err; |
4190 | |||
4191 | out_rmid: | ||
4192 | ida_simple_remove(&kmem_limited_groups, memcg_id); | ||
4193 | goto out; | ||
4194 | } | 4122 | } |
4195 | 4123 | ||
4196 | static int memcg_activate_kmem(struct mem_cgroup *memcg, | 4124 | static int memcg_activate_kmem(struct mem_cgroup *memcg, |
@@ -4572,10 +4500,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
4572 | if (!t) | 4500 | if (!t) |
4573 | goto unlock; | 4501 | goto unlock; |
4574 | 4502 | ||
4575 | if (!swap) | 4503 | usage = mem_cgroup_usage(memcg, swap); |
4576 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); | ||
4577 | else | ||
4578 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
4579 | 4504 | ||
4580 | /* | 4505 | /* |
4581 | * current_threshold points to threshold just below or equal to usage. | 4506 | * current_threshold points to threshold just below or equal to usage. |
@@ -4673,10 +4598,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
4673 | 4598 | ||
4674 | if (type == _MEM) { | 4599 | if (type == _MEM) { |
4675 | thresholds = &memcg->thresholds; | 4600 | thresholds = &memcg->thresholds; |
4676 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); | 4601 | usage = mem_cgroup_usage(memcg, false); |
4677 | } else if (type == _MEMSWAP) { | 4602 | } else if (type == _MEMSWAP) { |
4678 | thresholds = &memcg->memsw_thresholds; | 4603 | thresholds = &memcg->memsw_thresholds; |
4679 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 4604 | usage = mem_cgroup_usage(memcg, true); |
4680 | } else | 4605 | } else |
4681 | BUG(); | 4606 | BUG(); |
4682 | 4607 | ||
@@ -4762,10 +4687,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |||
4762 | 4687 | ||
4763 | if (type == _MEM) { | 4688 | if (type == _MEM) { |
4764 | thresholds = &memcg->thresholds; | 4689 | thresholds = &memcg->thresholds; |
4765 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); | 4690 | usage = mem_cgroup_usage(memcg, false); |
4766 | } else if (type == _MEMSWAP) { | 4691 | } else if (type == _MEMSWAP) { |
4767 | thresholds = &memcg->memsw_thresholds; | 4692 | thresholds = &memcg->memsw_thresholds; |
4768 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 4693 | usage = mem_cgroup_usage(memcg, true); |
4769 | } else | 4694 | } else |
4770 | BUG(); | 4695 | BUG(); |
4771 | 4696 | ||
@@ -5502,6 +5427,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5502 | { | 5427 | { |
5503 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5428 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5504 | struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); | 5429 | struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); |
5430 | int ret; | ||
5505 | 5431 | ||
5506 | if (css->id > MEM_CGROUP_ID_MAX) | 5432 | if (css->id > MEM_CGROUP_ID_MAX) |
5507 | return -ENOSPC; | 5433 | return -ENOSPC; |
@@ -5525,9 +5451,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5525 | * core guarantees its existence. | 5451 | * core guarantees its existence. |
5526 | */ | 5452 | */ |
5527 | } else { | 5453 | } else { |
5528 | res_counter_init(&memcg->res, &root_mem_cgroup->res); | 5454 | res_counter_init(&memcg->res, NULL); |
5529 | res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); | 5455 | res_counter_init(&memcg->memsw, NULL); |
5530 | res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); | 5456 | res_counter_init(&memcg->kmem, NULL); |
5531 | /* | 5457 | /* |
5532 | * Deeper hierachy with use_hierarchy == false doesn't make | 5458 | * Deeper hierachy with use_hierarchy == false doesn't make |
5533 | * much sense so let cgroup subsystem know about this | 5459 | * much sense so let cgroup subsystem know about this |
@@ -5538,7 +5464,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5538 | } | 5464 | } |
5539 | mutex_unlock(&memcg_create_mutex); | 5465 | mutex_unlock(&memcg_create_mutex); |
5540 | 5466 | ||
5541 | return memcg_init_kmem(memcg, &memory_cgrp_subsys); | 5467 | ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); |
5468 | if (ret) | ||
5469 | return ret; | ||
5470 | |||
5471 | /* | ||
5472 | * Make sure the memcg is initialized: mem_cgroup_iter() | ||
5473 | * orders reading memcg->initialized against its callers | ||
5474 | * reading the memcg members. | ||
5475 | */ | ||
5476 | smp_store_release(&memcg->initialized, 1); | ||
5477 | |||
5478 | return 0; | ||
5542 | } | 5479 | } |
5543 | 5480 | ||
5544 | /* | 5481 | /* |
@@ -5969,8 +5906,9 @@ static void __mem_cgroup_clear_mc(void) | |||
5969 | /* we must fixup refcnts and charges */ | 5906 | /* we must fixup refcnts and charges */ |
5970 | if (mc.moved_swap) { | 5907 | if (mc.moved_swap) { |
5971 | /* uncharge swap account from the old cgroup */ | 5908 | /* uncharge swap account from the old cgroup */ |
5972 | res_counter_uncharge(&mc.from->memsw, | 5909 | if (!mem_cgroup_is_root(mc.from)) |
5973 | PAGE_SIZE * mc.moved_swap); | 5910 | res_counter_uncharge(&mc.from->memsw, |
5911 | PAGE_SIZE * mc.moved_swap); | ||
5974 | 5912 | ||
5975 | for (i = 0; i < mc.moved_swap; i++) | 5913 | for (i = 0; i < mc.moved_swap; i++) |
5976 | css_put(&mc.from->css); | 5914 | css_put(&mc.from->css); |
@@ -5979,8 +5917,9 @@ static void __mem_cgroup_clear_mc(void) | |||
5979 | * we charged both to->res and to->memsw, so we should | 5917 | * we charged both to->res and to->memsw, so we should |
5980 | * uncharge to->res. | 5918 | * uncharge to->res. |
5981 | */ | 5919 | */ |
5982 | res_counter_uncharge(&mc.to->res, | 5920 | if (!mem_cgroup_is_root(mc.to)) |
5983 | PAGE_SIZE * mc.moved_swap); | 5921 | res_counter_uncharge(&mc.to->res, |
5922 | PAGE_SIZE * mc.moved_swap); | ||
5984 | /* we've already done css_get(mc.to) */ | 5923 | /* we've already done css_get(mc.to) */ |
5985 | mc.moved_swap = 0; | 5924 | mc.moved_swap = 0; |
5986 | } | 5925 | } |
@@ -6345,7 +6284,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) | |||
6345 | rcu_read_lock(); | 6284 | rcu_read_lock(); |
6346 | memcg = mem_cgroup_lookup(id); | 6285 | memcg = mem_cgroup_lookup(id); |
6347 | if (memcg) { | 6286 | if (memcg) { |
6348 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 6287 | if (!mem_cgroup_is_root(memcg)) |
6288 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
6349 | mem_cgroup_swap_statistics(memcg, false); | 6289 | mem_cgroup_swap_statistics(memcg, false); |
6350 | css_put(&memcg->css); | 6290 | css_put(&memcg->css); |
6351 | } | 6291 | } |
@@ -6509,12 +6449,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | |||
6509 | { | 6449 | { |
6510 | unsigned long flags; | 6450 | unsigned long flags; |
6511 | 6451 | ||
6512 | if (nr_mem) | 6452 | if (!mem_cgroup_is_root(memcg)) { |
6513 | res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); | 6453 | if (nr_mem) |
6514 | if (nr_memsw) | 6454 | res_counter_uncharge(&memcg->res, |
6515 | res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); | 6455 | nr_mem * PAGE_SIZE); |
6516 | 6456 | if (nr_memsw) | |
6517 | memcg_oom_recover(memcg); | 6457 | res_counter_uncharge(&memcg->memsw, |
6458 | nr_memsw * PAGE_SIZE); | ||
6459 | memcg_oom_recover(memcg); | ||
6460 | } | ||
6518 | 6461 | ||
6519 | local_irq_save(flags); | 6462 | local_irq_save(flags); |
6520 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); | 6463 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44c6bd201d3a..8639f6b28746 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -148,7 +148,7 @@ static int hwpoison_filter_task(struct page *p) | |||
148 | ino = cgroup_ino(css->cgroup); | 148 | ino = cgroup_ino(css->cgroup); |
149 | css_put(css); | 149 | css_put(css); |
150 | 150 | ||
151 | if (!ino || ino != hwpoison_filter_memcg) | 151 | if (ino != hwpoison_filter_memcg) |
152 | return -EINVAL; | 152 | return -EINVAL; |
153 | 153 | ||
154 | return 0; | 154 | return 0; |
diff --git a/mm/memory.c b/mm/memory.c index ab3537bcfed2..e229970e4223 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -118,6 +118,8 @@ __setup("norandmaps", disable_randmaps); | |||
118 | unsigned long zero_pfn __read_mostly; | 118 | unsigned long zero_pfn __read_mostly; |
119 | unsigned long highest_memmap_pfn __read_mostly; | 119 | unsigned long highest_memmap_pfn __read_mostly; |
120 | 120 | ||
121 | EXPORT_SYMBOL(zero_pfn); | ||
122 | |||
121 | /* | 123 | /* |
122 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() | 124 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() |
123 | */ | 125 | */ |
@@ -751,7 +753,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
751 | unsigned long pfn = pte_pfn(pte); | 753 | unsigned long pfn = pte_pfn(pte); |
752 | 754 | ||
753 | if (HAVE_PTE_SPECIAL) { | 755 | if (HAVE_PTE_SPECIAL) { |
754 | if (likely(!pte_special(pte) || pte_numa(pte))) | 756 | if (likely(!pte_special(pte))) |
755 | goto check_pfn; | 757 | goto check_pfn; |
756 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 758 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
757 | return NULL; | 759 | return NULL; |
@@ -777,15 +779,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
777 | } | 779 | } |
778 | } | 780 | } |
779 | 781 | ||
782 | if (is_zero_pfn(pfn)) | ||
783 | return NULL; | ||
780 | check_pfn: | 784 | check_pfn: |
781 | if (unlikely(pfn > highest_memmap_pfn)) { | 785 | if (unlikely(pfn > highest_memmap_pfn)) { |
782 | print_bad_pte(vma, addr, pte, NULL); | 786 | print_bad_pte(vma, addr, pte, NULL); |
783 | return NULL; | 787 | return NULL; |
784 | } | 788 | } |
785 | 789 | ||
786 | if (is_zero_pfn(pfn)) | ||
787 | return NULL; | ||
788 | |||
789 | /* | 790 | /* |
790 | * NOTE! We still have PageReserved() pages in the page tables. | 791 | * NOTE! We still have PageReserved() pages in the page tables. |
791 | * eg. VDSO mappings can cause them to exist. | 792 | * eg. VDSO mappings can cause them to exist. |
@@ -1126,7 +1127,7 @@ again: | |||
1126 | addr) != page->index) { | 1127 | addr) != page->index) { |
1127 | pte_t ptfile = pgoff_to_pte(page->index); | 1128 | pte_t ptfile = pgoff_to_pte(page->index); |
1128 | if (pte_soft_dirty(ptent)) | 1129 | if (pte_soft_dirty(ptent)) |
1129 | pte_file_mksoft_dirty(ptfile); | 1130 | ptfile = pte_file_mksoft_dirty(ptfile); |
1130 | set_pte_at(mm, addr, pte, ptfile); | 1131 | set_pte_at(mm, addr, pte, ptfile); |
1131 | } | 1132 | } |
1132 | if (PageAnon(page)) | 1133 | if (PageAnon(page)) |
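
The zap_pte_range() hunk above fixes a subtle bug in how the pte helpers are used: pte_file_mksoft_dirty(), like the other pte_mk*() helpers, returns a new pte value rather than modifying its argument, so discarding the return value silently dropped the soft-dirty bit. The corrected pattern is:

	pte_t ptfile = pgoff_to_pte(page->index);

	if (pte_soft_dirty(ptent))
		ptfile = pte_file_mksoft_dirty(ptfile);	/* must assign the result */
	set_pte_at(mm, addr, pte, ptfile);
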
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2ff8c2325e96..29d8693d0c61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) | |||
1307 | /* | 1307 | /* |
1308 | * Confirm all pages in a range [start, end) belong to the same zone. | 1308 | * Confirm all pages in a range [start, end) belong to the same zone. |
1309 | */ | 1309 | */ |
1310 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | 1310 | int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) |
1311 | { | 1311 | { |
1312 | unsigned long pfn; | 1312 | unsigned long pfn; |
1313 | struct zone *zone = NULL; | 1313 | struct zone *zone = NULL; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8f5330d74f47..e58725aff7e9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -123,25 +123,23 @@ static struct mempolicy default_policy = { | |||
123 | 123 | ||
124 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | 124 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; |
125 | 125 | ||
126 | static struct mempolicy *get_task_policy(struct task_struct *p) | 126 | struct mempolicy *get_task_policy(struct task_struct *p) |
127 | { | 127 | { |
128 | struct mempolicy *pol = p->mempolicy; | 128 | struct mempolicy *pol = p->mempolicy; |
129 | int node; | ||
129 | 130 | ||
130 | if (!pol) { | 131 | if (pol) |
131 | int node = numa_node_id(); | 132 | return pol; |
132 | 133 | ||
133 | if (node != NUMA_NO_NODE) { | 134 | node = numa_node_id(); |
134 | pol = &preferred_node_policy[node]; | 135 | if (node != NUMA_NO_NODE) { |
135 | /* | 136 | pol = &preferred_node_policy[node]; |
136 | * preferred_node_policy is not initialised early in | 137 | /* preferred_node_policy is not initialised early in boot */ |
137 | * boot | 138 | if (pol->mode) |
138 | */ | 139 | return pol; |
139 | if (!pol->mode) | ||
140 | pol = NULL; | ||
141 | } | ||
142 | } | 140 | } |
143 | 141 | ||
144 | return pol; | 142 | return &default_policy; |
145 | } | 143 | } |
146 | 144 | ||
147 | static const struct mempolicy_operations { | 145 | static const struct mempolicy_operations { |
@@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
683 | } | 681 | } |
684 | 682 | ||
685 | if (flags & MPOL_MF_LAZY) { | 683 | if (flags & MPOL_MF_LAZY) { |
686 | change_prot_numa(vma, start, endvma); | 684 | /* Similar to task_numa_work, skip inaccessible VMAs */ |
685 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
686 | change_prot_numa(vma, start, endvma); | ||
687 | goto next; | 687 | goto next; |
688 | } | 688 | } |
689 | 689 | ||
@@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
804 | nodemask_t *nodes) | 804 | nodemask_t *nodes) |
805 | { | 805 | { |
806 | struct mempolicy *new, *old; | 806 | struct mempolicy *new, *old; |
807 | struct mm_struct *mm = current->mm; | ||
808 | NODEMASK_SCRATCH(scratch); | 807 | NODEMASK_SCRATCH(scratch); |
809 | int ret; | 808 | int ret; |
810 | 809 | ||
@@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
816 | ret = PTR_ERR(new); | 815 | ret = PTR_ERR(new); |
817 | goto out; | 816 | goto out; |
818 | } | 817 | } |
819 | /* | 818 | |
820 | * prevent changing our mempolicy while show_numa_maps() | ||
821 | * is using it. | ||
822 | * Note: do_set_mempolicy() can be called at init time | ||
823 | * with no 'mm'. | ||
824 | */ | ||
825 | if (mm) | ||
826 | down_write(&mm->mmap_sem); | ||
827 | task_lock(current); | 819 | task_lock(current); |
828 | ret = mpol_set_nodemask(new, nodes, scratch); | 820 | ret = mpol_set_nodemask(new, nodes, scratch); |
829 | if (ret) { | 821 | if (ret) { |
830 | task_unlock(current); | 822 | task_unlock(current); |
831 | if (mm) | ||
832 | up_write(&mm->mmap_sem); | ||
833 | mpol_put(new); | 823 | mpol_put(new); |
834 | goto out; | 824 | goto out; |
835 | } | 825 | } |
@@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
839 | nodes_weight(new->v.nodes)) | 829 | nodes_weight(new->v.nodes)) |
840 | current->il_next = first_node(new->v.nodes); | 830 | current->il_next = first_node(new->v.nodes); |
841 | task_unlock(current); | 831 | task_unlock(current); |
842 | if (mm) | ||
843 | up_write(&mm->mmap_sem); | ||
844 | |||
845 | mpol_put(old); | 832 | mpol_put(old); |
846 | ret = 0; | 833 | ret = 0; |
847 | out: | 834 | out: |
@@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, | |||
1605 | 1592 | ||
1606 | #endif | 1593 | #endif |
1607 | 1594 | ||
1608 | /* | 1595 | struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, |
1609 | * get_vma_policy(@task, @vma, @addr) | 1596 | unsigned long addr) |
1610 | * @task: task for fallback if vma policy == default | ||
1611 | * @vma: virtual memory area whose policy is sought | ||
1612 | * @addr: address in @vma for shared policy lookup | ||
1613 | * | ||
1614 | * Returns effective policy for a VMA at specified address. | ||
1615 | * Falls back to @task or system default policy, as necessary. | ||
1616 | * Current or other task's task mempolicy and non-shared vma policies must be | ||
1617 | * protected by task_lock(task) by the caller. | ||
1618 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference | ||
1619 | * count--added by the get_policy() vm_op, as appropriate--to protect against | ||
1620 | * freeing by another task. It is the caller's responsibility to free the | ||
1621 | * extra reference for shared policies. | ||
1622 | */ | ||
1623 | struct mempolicy *get_vma_policy(struct task_struct *task, | ||
1624 | struct vm_area_struct *vma, unsigned long addr) | ||
1625 | { | 1597 | { |
1626 | struct mempolicy *pol = get_task_policy(task); | 1598 | struct mempolicy *pol = NULL; |
1627 | 1599 | ||
1628 | if (vma) { | 1600 | if (vma) { |
1629 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1601 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
1630 | struct mempolicy *vpol = vma->vm_ops->get_policy(vma, | 1602 | pol = vma->vm_ops->get_policy(vma, addr); |
1631 | addr); | ||
1632 | if (vpol) | ||
1633 | pol = vpol; | ||
1634 | } else if (vma->vm_policy) { | 1603 | } else if (vma->vm_policy) { |
1635 | pol = vma->vm_policy; | 1604 | pol = vma->vm_policy; |
1636 | 1605 | ||
@@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1644 | mpol_get(pol); | 1613 | mpol_get(pol); |
1645 | } | 1614 | } |
1646 | } | 1615 | } |
1616 | |||
1617 | return pol; | ||
1618 | } | ||
1619 | |||
1620 | /* | ||
1621 | * get_vma_policy(@vma, @addr) | ||
1622 | * @vma: virtual memory area whose policy is sought | ||
1623 | * @addr: address in @vma for shared policy lookup | ||
1624 | * | ||
1625 | * Returns effective policy for a VMA at specified address. | ||
1626 | * Falls back to current->mempolicy or system default policy, as necessary. | ||
1627 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference | ||
1628 | * count--added by the get_policy() vm_op, as appropriate--to protect against | ||
1629 | * freeing by another task. It is the caller's responsibility to free the | ||
1630 | * extra reference for shared policies. | ||
1631 | */ | ||
1632 | static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, | ||
1633 | unsigned long addr) | ||
1634 | { | ||
1635 | struct mempolicy *pol = __get_vma_policy(vma, addr); | ||
1636 | |||
1647 | if (!pol) | 1637 | if (!pol) |
1648 | pol = &default_policy; | 1638 | pol = get_task_policy(current); |
1639 | |||
1649 | return pol; | 1640 | return pol; |
1650 | } | 1641 | } |
1651 | 1642 | ||
1652 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) | 1643 | bool vma_policy_mof(struct vm_area_struct *vma) |
1653 | { | 1644 | { |
1654 | struct mempolicy *pol = get_task_policy(task); | 1645 | struct mempolicy *pol; |
1655 | if (vma) { | ||
1656 | if (vma->vm_ops && vma->vm_ops->get_policy) { | ||
1657 | bool ret = false; | ||
1658 | 1646 | ||
1659 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); | 1647 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
1660 | if (pol && (pol->flags & MPOL_F_MOF)) | 1648 | bool ret = false; |
1661 | ret = true; | ||
1662 | mpol_cond_put(pol); | ||
1663 | 1649 | ||
1664 | return ret; | 1650 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); |
1665 | } else if (vma->vm_policy) { | 1651 | if (pol && (pol->flags & MPOL_F_MOF)) |
1666 | pol = vma->vm_policy; | 1652 | ret = true; |
1667 | } | 1653 | mpol_cond_put(pol); |
1654 | |||
1655 | return ret; | ||
1668 | } | 1656 | } |
1669 | 1657 | ||
1658 | pol = vma->vm_policy; | ||
1670 | if (!pol) | 1659 | if (!pol) |
1671 | return default_policy.flags & MPOL_F_MOF; | 1660 | pol = get_task_policy(current); |
1672 | 1661 | ||
1673 | return pol->flags & MPOL_F_MOF; | 1662 | return pol->flags & MPOL_F_MOF; |
1674 | } | 1663 | } |
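The two hunks above split the policy lookup: __get_vma_policy() returns only a VMA-level policy (possibly NULL), and the new static get_vma_policy() wrapper falls back to the calling task's policy. A minimal userspace sketch of that fallback order follows; every type and function name in it is illustrative, not the kernel's, and reference counting is omitted.

/*
 * Simplified sketch of the lookup order: VMA-specific policy first,
 * then the task's policy. Illustrative names only; the kernel also
 * handles reference counting and falls back to a system default
 * inside get_task_policy().
 */
#include <stdio.h>
#include <stddef.h>

struct policy { const char *name; };

struct area {
	struct policy *area_policy;                   /* like vma->vm_policy */
	struct policy *(*get_policy)(struct area *);  /* like vm_ops->get_policy */
};

static struct policy task_policy = { "task" };
static struct policy vma_policy  = { "vma" };

/* Mirrors __get_vma_policy(): area-level sources only, may return NULL. */
static struct policy *area_policy_lookup(struct area *a)
{
	if (!a)
		return NULL;
	if (a->get_policy)
		return a->get_policy(a);
	return a->area_policy;
}

/* Mirrors the new static get_vma_policy(): fall back to the task policy. */
static struct policy *effective_policy(struct area *a)
{
	struct policy *p = area_policy_lookup(a);

	return p ? p : &task_policy;
}

int main(void)
{
	struct area plain = { NULL, NULL };
	struct area bound = { &vma_policy, NULL };

	printf("plain area -> %s policy\n", effective_policy(&plain)->name);
	printf("bound area -> %s policy\n", effective_policy(&bound)->name);
	return 0;
}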
@@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
1874 | { | 1863 | { |
1875 | struct zonelist *zl; | 1864 | struct zonelist *zl; |
1876 | 1865 | ||
1877 | *mpol = get_vma_policy(current, vma, addr); | 1866 | *mpol = get_vma_policy(vma, addr); |
1878 | *nodemask = NULL; /* assume !MPOL_BIND */ | 1867 | *nodemask = NULL; /* assume !MPOL_BIND */ |
1879 | 1868 | ||
1880 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { | 1869 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
@@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
2029 | unsigned int cpuset_mems_cookie; | 2018 | unsigned int cpuset_mems_cookie; |
2030 | 2019 | ||
2031 | retry_cpuset: | 2020 | retry_cpuset: |
2032 | pol = get_vma_policy(current, vma, addr); | 2021 | pol = get_vma_policy(vma, addr); |
2033 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2022 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2034 | 2023 | ||
2035 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 2024 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
@@ -2046,8 +2035,7 @@ retry_cpuset: | |||
2046 | page = __alloc_pages_nodemask(gfp, order, | 2035 | page = __alloc_pages_nodemask(gfp, order, |
2047 | policy_zonelist(gfp, pol, node), | 2036 | policy_zonelist(gfp, pol, node), |
2048 | policy_nodemask(gfp, pol)); | 2037 | policy_nodemask(gfp, pol)); |
2049 | if (unlikely(mpol_needs_cond_ref(pol))) | 2038 | mpol_cond_put(pol); |
2050 | __mpol_put(pol); | ||
2051 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2039 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2052 | goto retry_cpuset; | 2040 | goto retry_cpuset; |
2053 | return page; | 2041 | return page; |
@@ -2074,12 +2062,12 @@ retry_cpuset: | |||
2074 | */ | 2062 | */ |
2075 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2063 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
2076 | { | 2064 | { |
2077 | struct mempolicy *pol = get_task_policy(current); | 2065 | struct mempolicy *pol = &default_policy; |
2078 | struct page *page; | 2066 | struct page *page; |
2079 | unsigned int cpuset_mems_cookie; | 2067 | unsigned int cpuset_mems_cookie; |
2080 | 2068 | ||
2081 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 2069 | if (!in_interrupt() && !(gfp & __GFP_THISNODE)) |
2082 | pol = &default_policy; | 2070 | pol = get_task_policy(current); |
2083 | 2071 | ||
2084 | retry_cpuset: | 2072 | retry_cpuset: |
2085 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2073 | cpuset_mems_cookie = read_mems_allowed_begin(); |
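The alloc_pages_current() hunk above inverts the guard: the policy starts out as the default and is only upgraded to the task policy when that is safe (process context, no __GFP_THISNODE), which also lets the now-impossible NULL check disappear. A compact, purely illustrative sketch of that inverted-guard pattern:

/* Illustrative only: shows the guard inversion, not the kernel code. */
#include <stdbool.h>
#include <stdio.h>

static const char *pick_policy(bool in_interrupt, bool this_node_only)
{
	const char *pol = "default";

	if (!in_interrupt && !this_node_only)
		pol = "task";	/* safe to consult the caller's policy */
	return pol;
}

int main(void)
{
	printf("irq ctx     -> %s\n", pick_policy(true, false));
	printf("process ctx -> %s\n", pick_policy(false, false));
	return 0;
}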
@@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2296 | 2284 | ||
2297 | BUG_ON(!vma); | 2285 | BUG_ON(!vma); |
2298 | 2286 | ||
2299 | pol = get_vma_policy(current, vma, addr); | 2287 | pol = get_vma_policy(vma, addr); |
2300 | if (!(pol->flags & MPOL_F_MOF)) | 2288 | if (!(pol->flags & MPOL_F_MOF)) |
2301 | goto out; | 2289 | goto out; |
2302 | 2290 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index f78ec9bd454d..01439953abf5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -146,8 +146,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
146 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 146 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
147 | if (pte_swp_soft_dirty(*ptep)) | 147 | if (pte_swp_soft_dirty(*ptep)) |
148 | pte = pte_mksoft_dirty(pte); | 148 | pte = pte_mksoft_dirty(pte); |
149 | |||
150 | /* Recheck VMA as permissions can change since migration started */ | ||
149 | if (is_write_migration_entry(entry)) | 151 | if (is_write_migration_entry(entry)) |
150 | pte = pte_mkwrite(pte); | 152 | pte = maybe_mkwrite(pte, vma); |
153 | |||
151 | #ifdef CONFIG_HUGETLB_PAGE | 154 | #ifdef CONFIG_HUGETLB_PAGE |
152 | if (PageHuge(new)) { | 155 | if (PageHuge(new)) { |
153 | pte = pte_mkhuge(pte); | 156 | pte = pte_mkhuge(pte); |
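The remove_migration_pte() change above switches from an unconditional pte_mkwrite() to maybe_mkwrite(), so the write bit is only restored when the VMA still permits writes, since permissions can change while migration is in flight. A standalone sketch of that idea, with made-up flag values:

/*
 * Userspace sketch of the maybe_mkwrite() idea: mark the entry writable
 * only if the VMA's current flags allow it. Flag values are invented.
 */
#include <stdio.h>

#define VM_WRITE   0x2u
#define PTE_WRITE  0x1u

static unsigned int maybe_mkwrite(unsigned int pte, unsigned int vm_flags)
{
	if (vm_flags & VM_WRITE)	/* recheck the VMA, not the stale entry */
		pte |= PTE_WRITE;
	return pte;
}

int main(void)
{
	printf("writable vma  -> pte %#x\n", maybe_mkwrite(0, VM_WRITE));
	printf("read-only vma -> pte %#x\n", maybe_mkwrite(0, 0));
	return 0;
}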
@@ -873,7 +876,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
873 | } | 876 | } |
874 | } | 877 | } |
875 | 878 | ||
876 | if (unlikely(balloon_page_movable(page))) { | 879 | if (unlikely(isolated_balloon_page(page))) { |
877 | /* | 880 | /* |
878 | * A ballooned page does not need any special attention from | 881 | * A ballooned page does not need any special attention from |
879 | * physical to virtual reverse mapping procedures. | 882 | * physical to virtual reverse mapping procedures. |
@@ -952,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, | |||
952 | 955 | ||
953 | rc = __unmap_and_move(page, newpage, force, mode); | 956 | rc = __unmap_and_move(page, newpage, force, mode); |
954 | 957 | ||
955 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | ||
956 | /* | ||
957 | * A ballooned page has been migrated already. | ||
958 | * Now, it's the time to wrap-up counters, | ||
959 | * handle the page back to Buddy and return. | ||
960 | */ | ||
961 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
962 | page_is_file_cache(page)); | ||
963 | balloon_page_free(page); | ||
964 | return MIGRATEPAGE_SUCCESS; | ||
965 | } | ||
966 | out: | 958 | out: |
967 | if (rc != -EAGAIN) { | 959 | if (rc != -EAGAIN) { |
968 | /* | 960 | /* |
@@ -985,6 +977,9 @@ out: | |||
985 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { | 977 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { |
986 | ClearPageSwapBacked(newpage); | 978 | ClearPageSwapBacked(newpage); |
987 | put_new_page(newpage, private); | 979 | put_new_page(newpage, private); |
980 | } else if (unlikely(__is_movable_balloon_page(newpage))) { | ||
981 | /* drop our reference, page already in the balloon */ | ||
982 | put_page(newpage); | ||
988 | } else | 983 | } else |
989 | putback_lru_page(newpage); | 984 | putback_lru_page(newpage); |
990 | 985 | ||
diff --git a/mm/mlock.c b/mm/mlock.c index ce84cb0b83ef..03aa8512723b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -233,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
233 | 233 | ||
234 | VM_BUG_ON(start & ~PAGE_MASK); | 234 | VM_BUG_ON(start & ~PAGE_MASK); |
235 | VM_BUG_ON(end & ~PAGE_MASK); | 235 | VM_BUG_ON(end & ~PAGE_MASK); |
236 | VM_BUG_ON(start < vma->vm_start); | 236 | VM_BUG_ON_VMA(start < vma->vm_start, vma); |
237 | VM_BUG_ON(end > vma->vm_end); | 237 | VM_BUG_ON_VMA(end > vma->vm_end, vma); |
238 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 238 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); |
239 | 239 | ||
240 | gup_flags = FOLL_TOUCH | FOLL_MLOCK; | 240 | gup_flags = FOLL_TOUCH | FOLL_MLOCK; |
241 | /* | 241 | /* |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm, | |||
70 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 70 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
71 | * w: (no) no w: (no) no w: (yes) yes w: (no) no | 71 | * w: (no) no w: (no) no w: (yes) yes w: (no) no |
72 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 72 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
73 | * | 73 | * |
74 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 74 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
75 | * w: (no) no w: (no) no w: (copy) copy w: (no) no | 75 | * w: (no) no w: (no) no w: (copy) copy w: (no) no |
76 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 76 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
@@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len); | |||
268 | 268 | ||
269 | SYSCALL_DEFINE1(brk, unsigned long, brk) | 269 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
270 | { | 270 | { |
271 | unsigned long rlim, retval; | 271 | unsigned long retval; |
272 | unsigned long newbrk, oldbrk; | 272 | unsigned long newbrk, oldbrk; |
273 | struct mm_struct *mm = current->mm; | 273 | struct mm_struct *mm = current->mm; |
274 | unsigned long min_brk; | 274 | unsigned long min_brk; |
@@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
298 | * segment grow beyond its set limit the in case where the limit is | 298 | * segment grow beyond its set limit the in case where the limit is |
299 | * not page aligned -Ram Gupta | 299 | * not page aligned -Ram Gupta |
300 | */ | 300 | */ |
301 | rlim = rlimit(RLIMIT_DATA); | 301 | if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, |
302 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 302 | mm->end_data, mm->start_data)) |
303 | (mm->end_data - mm->start_data) > rlim) | ||
304 | goto out; | 303 | goto out; |
305 | 304 | ||
306 | newbrk = PAGE_ALIGN(brk); | 305 | newbrk = PAGE_ALIGN(brk); |
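check_data_rlimit() consolidates the open-coded RLIMIT_DATA comparison removed above: brk() refuses to grow the heap when the new data size would exceed a finite limit. The sketch below only approximates that logic from the removed lines; it is not the kernel helper, and the exact semantics may differ.

/*
 * Approximation of the RLIMIT_DATA check in the brk() path, based on
 * the open-coded test that the helper replaces. Non-zero means the
 * request should be rejected, matching the call site above.
 */
#include <stdio.h>
#include <sys/resource.h>

static int data_limit_exceeded(unsigned long rlim, unsigned long new_brk,
			       unsigned long start_brk,
			       unsigned long end_data, unsigned long start_data)
{
	if (rlim == RLIM_INFINITY)
		return 0;
	return (new_brk - start_brk) + (end_data - start_data) > rlim;
}

int main(void)
{
	/* 1 MiB data limit, heap grown 2 MiB past start_brk: rejected. */
	printf("%d\n", data_limit_exceeded(1UL << 20, 0x60200000UL,
					   0x60000000UL, 0x400000UL, 0x3f0000UL));
	return 0;
}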
@@ -369,20 +368,22 @@ static int browse_rb(struct rb_root *root) | |||
369 | struct vm_area_struct *vma; | 368 | struct vm_area_struct *vma; |
370 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 369 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
371 | if (vma->vm_start < prev) { | 370 | if (vma->vm_start < prev) { |
372 | pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); | 371 | pr_emerg("vm_start %lx < prev %lx\n", |
372 | vma->vm_start, prev); | ||
373 | bug = 1; | 373 | bug = 1; |
374 | } | 374 | } |
375 | if (vma->vm_start < pend) { | 375 | if (vma->vm_start < pend) { |
376 | pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); | 376 | pr_emerg("vm_start %lx < pend %lx\n", |
377 | vma->vm_start, pend); | ||
377 | bug = 1; | 378 | bug = 1; |
378 | } | 379 | } |
379 | if (vma->vm_start > vma->vm_end) { | 380 | if (vma->vm_start > vma->vm_end) { |
380 | pr_info("vm_end %lx < vm_start %lx\n", | 381 | pr_emerg("vm_start %lx > vm_end %lx\n", |
381 | vma->vm_end, vma->vm_start); | 382 | vma->vm_start, vma->vm_end); |
382 | bug = 1; | 383 | bug = 1; |
383 | } | 384 | } |
384 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { | 385 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { |
385 | pr_info("free gap %lx, correct %lx\n", | 386 | pr_emerg("free gap %lx, correct %lx\n", |
386 | vma->rb_subtree_gap, | 387 | vma->rb_subtree_gap, |
387 | vma_compute_subtree_gap(vma)); | 388 | vma_compute_subtree_gap(vma)); |
388 | bug = 1; | 389 | bug = 1; |
@@ -396,7 +397,7 @@ static int browse_rb(struct rb_root *root) | |||
396 | for (nd = pn; nd; nd = rb_prev(nd)) | 397 | for (nd = pn; nd; nd = rb_prev(nd)) |
397 | j++; | 398 | j++; |
398 | if (i != j) { | 399 | if (i != j) { |
399 | pr_info("backwards %d, forwards %d\n", j, i); | 400 | pr_emerg("backwards %d, forwards %d\n", j, i); |
400 | bug = 1; | 401 | bug = 1; |
401 | } | 402 | } |
402 | return bug ? -1 : i; | 403 | return bug ? -1 : i; |
@@ -409,8 +410,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) | |||
409 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 410 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
410 | struct vm_area_struct *vma; | 411 | struct vm_area_struct *vma; |
411 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 412 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
412 | BUG_ON(vma != ignore && | 413 | VM_BUG_ON_VMA(vma != ignore && |
413 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); | 414 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma), |
415 | vma); | ||
414 | } | 416 | } |
415 | } | 417 | } |
416 | 418 | ||
@@ -420,8 +422,10 @@ static void validate_mm(struct mm_struct *mm) | |||
420 | int i = 0; | 422 | int i = 0; |
421 | unsigned long highest_address = 0; | 423 | unsigned long highest_address = 0; |
422 | struct vm_area_struct *vma = mm->mmap; | 424 | struct vm_area_struct *vma = mm->mmap; |
425 | |||
423 | while (vma) { | 426 | while (vma) { |
424 | struct anon_vma_chain *avc; | 427 | struct anon_vma_chain *avc; |
428 | |||
425 | vma_lock_anon_vma(vma); | 429 | vma_lock_anon_vma(vma); |
426 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 430 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
427 | anon_vma_interval_tree_verify(avc); | 431 | anon_vma_interval_tree_verify(avc); |
@@ -431,20 +435,21 @@ static void validate_mm(struct mm_struct *mm) | |||
431 | i++; | 435 | i++; |
432 | } | 436 | } |
433 | if (i != mm->map_count) { | 437 | if (i != mm->map_count) { |
434 | pr_info("map_count %d vm_next %d\n", mm->map_count, i); | 438 | pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); |
435 | bug = 1; | 439 | bug = 1; |
436 | } | 440 | } |
437 | if (highest_address != mm->highest_vm_end) { | 441 | if (highest_address != mm->highest_vm_end) { |
438 | pr_info("mm->highest_vm_end %lx, found %lx\n", | 442 | pr_emerg("mm->highest_vm_end %lx, found %lx\n", |
439 | mm->highest_vm_end, highest_address); | 443 | mm->highest_vm_end, highest_address); |
440 | bug = 1; | 444 | bug = 1; |
441 | } | 445 | } |
442 | i = browse_rb(&mm->mm_rb); | 446 | i = browse_rb(&mm->mm_rb); |
443 | if (i != mm->map_count) { | 447 | if (i != mm->map_count) { |
444 | pr_info("map_count %d rb %d\n", mm->map_count, i); | 448 | if (i != -1) |
449 | pr_emerg("map_count %d rb %d\n", mm->map_count, i); | ||
445 | bug = 1; | 450 | bug = 1; |
446 | } | 451 | } |
447 | BUG_ON(bug); | 452 | VM_BUG_ON_MM(bug, mm); |
448 | } | 453 | } |
449 | #else | 454 | #else |
450 | #define validate_mm_rb(root, ignore) do { } while (0) | 455 | #define validate_mm_rb(root, ignore) do { } while (0) |
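Several hunks in this file convert plain BUG_ON()/VM_BUG_ON() checks into VM_BUG_ON_VMA()/VM_BUG_ON_MM(), which dump the offending object before dying so the report carries context. A toy illustration of that pattern; these are not the kernel macros.

/* Sketch: an assertion that dumps the checked object before aborting. */
#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long start, end; };

static void dump_range(const struct range *r)
{
	fprintf(stderr, "range %p: start=%#lx end=%#lx\n",
		(const void *)r, r->start, r->end);
}

#define BUG_ON_RANGE(cond, r)						\
	do {								\
		if (cond) {						\
			dump_range(r);					\
			fprintf(stderr, "BUG at %s:%d\n",		\
				__FILE__, __LINE__);			\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	struct range r = { 0x2000, 0x1000 };	/* deliberately inverted */

	BUG_ON_RANGE(r.start > r.end, &r);	/* fires and dumps the range */
	return 0;
}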
@@ -741,7 +746,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
741 | * split_vma inserting another: so it must be | 746 | * split_vma inserting another: so it must be |
742 | * mprotect case 4 shifting the boundary down. | 747 | * mprotect case 4 shifting the boundary down. |
743 | */ | 748 | */ |
744 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); | 749 | adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); |
745 | exporter = vma; | 750 | exporter = vma; |
746 | importer = next; | 751 | importer = next; |
747 | } | 752 | } |
@@ -787,8 +792,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
787 | if (!anon_vma && adjust_next) | 792 | if (!anon_vma && adjust_next) |
788 | anon_vma = next->anon_vma; | 793 | anon_vma = next->anon_vma; |
789 | if (anon_vma) { | 794 | if (anon_vma) { |
790 | VM_BUG_ON(adjust_next && next->anon_vma && | 795 | VM_BUG_ON_VMA(adjust_next && next->anon_vma && |
791 | anon_vma != next->anon_vma); | 796 | anon_vma != next->anon_vma, next); |
792 | anon_vma_lock_write(anon_vma); | 797 | anon_vma_lock_write(anon_vma); |
793 | anon_vma_interval_tree_pre_update_vma(vma); | 798 | anon_vma_interval_tree_pre_update_vma(vma); |
794 | if (adjust_next) | 799 | if (adjust_next) |
@@ -1010,7 +1015,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | |||
1010 | struct vm_area_struct *vma_merge(struct mm_struct *mm, | 1015 | struct vm_area_struct *vma_merge(struct mm_struct *mm, |
1011 | struct vm_area_struct *prev, unsigned long addr, | 1016 | struct vm_area_struct *prev, unsigned long addr, |
1012 | unsigned long end, unsigned long vm_flags, | 1017 | unsigned long end, unsigned long vm_flags, |
1013 | struct anon_vma *anon_vma, struct file *file, | 1018 | struct anon_vma *anon_vma, struct file *file, |
1014 | pgoff_t pgoff, struct mempolicy *policy) | 1019 | pgoff_t pgoff, struct mempolicy *policy) |
1015 | { | 1020 | { |
1016 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 1021 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
@@ -1036,7 +1041,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1036 | * Can it merge with the predecessor? | 1041 | * Can it merge with the predecessor? |
1037 | */ | 1042 | */ |
1038 | if (prev && prev->vm_end == addr && | 1043 | if (prev && prev->vm_end == addr && |
1039 | mpol_equal(vma_policy(prev), policy) && | 1044 | mpol_equal(vma_policy(prev), policy) && |
1040 | can_vma_merge_after(prev, vm_flags, | 1045 | can_vma_merge_after(prev, vm_flags, |
1041 | anon_vma, file, pgoff)) { | 1046 | anon_vma, file, pgoff)) { |
1042 | /* | 1047 | /* |
@@ -1064,7 +1069,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1064 | * Can this new request be merged in front of next? | 1069 | * Can this new request be merged in front of next? |
1065 | */ | 1070 | */ |
1066 | if (next && end == next->vm_start && | 1071 | if (next && end == next->vm_start && |
1067 | mpol_equal(policy, vma_policy(next)) && | 1072 | mpol_equal(policy, vma_policy(next)) && |
1068 | can_vma_merge_before(next, vm_flags, | 1073 | can_vma_merge_before(next, vm_flags, |
1069 | anon_vma, file, pgoff+pglen)) { | 1074 | anon_vma, file, pgoff+pglen)) { |
1070 | if (prev && addr < prev->vm_end) /* case 4 */ | 1075 | if (prev && addr < prev->vm_end) /* case 4 */ |
@@ -1235,7 +1240,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1235 | unsigned long flags, unsigned long pgoff, | 1240 | unsigned long flags, unsigned long pgoff, |
1236 | unsigned long *populate) | 1241 | unsigned long *populate) |
1237 | { | 1242 | { |
1238 | struct mm_struct * mm = current->mm; | 1243 | struct mm_struct *mm = current->mm; |
1239 | vm_flags_t vm_flags; | 1244 | vm_flags_t vm_flags; |
1240 | 1245 | ||
1241 | *populate = 0; | 1246 | *populate = 0; |
@@ -1263,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1263 | 1268 | ||
1264 | /* offset overflow? */ | 1269 | /* offset overflow? */ |
1265 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 1270 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) |
1266 | return -EOVERFLOW; | 1271 | return -EOVERFLOW; |
1267 | 1272 | ||
1268 | /* Too many mappings? */ | 1273 | /* Too many mappings? */ |
1269 | if (mm->map_count > sysctl_max_map_count) | 1274 | if (mm->map_count > sysctl_max_map_count) |
@@ -1921,7 +1926,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1921 | info.align_mask = 0; | 1926 | info.align_mask = 0; |
1922 | return vm_unmapped_area(&info); | 1927 | return vm_unmapped_area(&info); |
1923 | } | 1928 | } |
1924 | #endif | 1929 | #endif |
1925 | 1930 | ||
1926 | /* | 1931 | /* |
1927 | * This mmap-allocator allocates new areas top-down from below the | 1932 | * This mmap-allocator allocates new areas top-down from below the |
@@ -2321,13 +2326,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) | |||
2321 | } | 2326 | } |
2322 | 2327 | ||
2323 | struct vm_area_struct * | 2328 | struct vm_area_struct * |
2324 | find_extend_vma(struct mm_struct * mm, unsigned long addr) | 2329 | find_extend_vma(struct mm_struct *mm, unsigned long addr) |
2325 | { | 2330 | { |
2326 | struct vm_area_struct * vma; | 2331 | struct vm_area_struct *vma; |
2327 | unsigned long start; | 2332 | unsigned long start; |
2328 | 2333 | ||
2329 | addr &= PAGE_MASK; | 2334 | addr &= PAGE_MASK; |
2330 | vma = find_vma(mm,addr); | 2335 | vma = find_vma(mm, addr); |
2331 | if (!vma) | 2336 | if (!vma) |
2332 | return NULL; | 2337 | return NULL; |
2333 | if (vma->vm_start <= addr) | 2338 | if (vma->vm_start <= addr) |
@@ -2376,7 +2381,7 @@ static void unmap_region(struct mm_struct *mm, | |||
2376 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 2381 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
2377 | unsigned long start, unsigned long end) | 2382 | unsigned long start, unsigned long end) |
2378 | { | 2383 | { |
2379 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | 2384 | struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; |
2380 | struct mmu_gather tlb; | 2385 | struct mmu_gather tlb; |
2381 | 2386 | ||
2382 | lru_add_drain(); | 2387 | lru_add_drain(); |
@@ -2423,7 +2428,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2423 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the | 2428 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the |
2424 | * munmap path where it doesn't make sense to fail. | 2429 | * munmap path where it doesn't make sense to fail. |
2425 | */ | 2430 | */ |
2426 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 2431 | static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, |
2427 | unsigned long addr, int new_below) | 2432 | unsigned long addr, int new_below) |
2428 | { | 2433 | { |
2429 | struct vm_area_struct *new; | 2434 | struct vm_area_struct *new; |
@@ -2512,7 +2517,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2512 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) | 2517 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) |
2513 | return -EINVAL; | 2518 | return -EINVAL; |
2514 | 2519 | ||
2515 | if ((len = PAGE_ALIGN(len)) == 0) | 2520 | len = PAGE_ALIGN(len); |
2521 | if (len == 0) | ||
2516 | return -EINVAL; | 2522 | return -EINVAL; |
2517 | 2523 | ||
2518 | /* Find the first overlapping VMA */ | 2524 | /* Find the first overlapping VMA */ |
@@ -2558,7 +2564,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2558 | if (error) | 2564 | if (error) |
2559 | return error; | 2565 | return error; |
2560 | } | 2566 | } |
2561 | vma = prev? prev->vm_next: mm->mmap; | 2567 | vma = prev ? prev->vm_next : mm->mmap; |
2562 | 2568 | ||
2563 | /* | 2569 | /* |
2564 | * unlock any mlock()ed ranges before detaching vmas | 2570 | * unlock any mlock()ed ranges before detaching vmas |
@@ -2621,10 +2627,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) | |||
2621 | */ | 2627 | */ |
2622 | static unsigned long do_brk(unsigned long addr, unsigned long len) | 2628 | static unsigned long do_brk(unsigned long addr, unsigned long len) |
2623 | { | 2629 | { |
2624 | struct mm_struct * mm = current->mm; | 2630 | struct mm_struct *mm = current->mm; |
2625 | struct vm_area_struct * vma, * prev; | 2631 | struct vm_area_struct *vma, *prev; |
2626 | unsigned long flags; | 2632 | unsigned long flags; |
2627 | struct rb_node ** rb_link, * rb_parent; | 2633 | struct rb_node **rb_link, *rb_parent; |
2628 | pgoff_t pgoff = addr >> PAGE_SHIFT; | 2634 | pgoff_t pgoff = addr >> PAGE_SHIFT; |
2629 | int error; | 2635 | int error; |
2630 | 2636 | ||
@@ -2848,7 +2854,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2848 | * safe. It is only safe to keep the vm_pgoff | 2854 | * safe. It is only safe to keep the vm_pgoff |
2849 | * linear if there are no pages mapped yet. | 2855 | * linear if there are no pages mapped yet. |
2850 | */ | 2856 | */ |
2851 | VM_BUG_ON(faulted_in_anon_vma); | 2857 | VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); |
2852 | *vmap = vma = new_vma; | 2858 | *vmap = vma = new_vma; |
2853 | } | 2859 | } |
2854 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); | 2860 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
@@ -3196,7 +3202,7 @@ void __init mmap_init(void) | |||
3196 | { | 3202 | { |
3197 | int ret; | 3203 | int ret; |
3198 | 3204 | ||
3199 | ret = percpu_counter_init(&vm_committed_as, 0); | 3205 | ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); |
3200 | VM_BUG_ON(ret); | 3206 | VM_BUG_ON(ret); |
3201 | } | 3207 | } |
3202 | 3208 | ||
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 950813b1eb36..2c8da9825fe3 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -107,7 +107,8 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
107 | * existed or not. | 107 | * existed or not. |
108 | */ | 108 | */ |
109 | int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | 109 | int __mmu_notifier_clear_flush_young(struct mm_struct *mm, |
110 | unsigned long address) | 110 | unsigned long start, |
111 | unsigned long end) | ||
111 | { | 112 | { |
112 | struct mmu_notifier *mn; | 113 | struct mmu_notifier *mn; |
113 | int young = 0, id; | 114 | int young = 0, id; |
@@ -115,7 +116,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
115 | id = srcu_read_lock(&srcu); | 116 | id = srcu_read_lock(&srcu); |
116 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { | 117 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { |
117 | if (mn->ops->clear_flush_young) | 118 | if (mn->ops->clear_flush_young) |
118 | young |= mn->ops->clear_flush_young(mn, mm, address); | 119 | young |= mn->ops->clear_flush_young(mn, mm, start, end); |
119 | } | 120 | } |
120 | srcu_read_unlock(&srcu, id); | 121 | srcu_read_unlock(&srcu, id); |
121 | 122 | ||
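The mmu_notifier change above widens clear_flush_young from a single address to a [start, end) range while still OR-ing the per-notifier results. An illustrative userspace sketch of that dispatch loop; it is not the kernel's mmu_notifier API and omits the SRCU protection shown in the hunk.

/* Sketch: walk a notifier list, pass the range, OR the "young" results. */
#include <stdio.h>

struct notifier {
	int (*clear_young)(unsigned long start, unsigned long end);
	struct notifier *next;
};

static int clear_young_range(struct notifier *head,
			     unsigned long start, unsigned long end)
{
	int young = 0;

	for (struct notifier *n = head; n; n = n->next)
		if (n->clear_young)
			young |= n->clear_young(start, end);
	return young;
}

static int fake_clear(unsigned long start, unsigned long end)
{
	return end - start > 4096;	/* pretend a large range was "young" */
}

int main(void)
{
	struct notifier n = { fake_clear, NULL };

	printf("%d\n", clear_young_range(&n, 0x1000, 0x9000));
	return 0;
}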
diff --git a/mm/mremap.c b/mm/mremap.c index 05f1180e9f21..b147f66f4c40 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -21,8 +21,8 @@ | |||
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | #include <linux/mmu_notifier.h> | 22 | #include <linux/mmu_notifier.h> |
23 | #include <linux/sched/sysctl.h> | 23 | #include <linux/sched/sysctl.h> |
24 | #include <linux/uaccess.h> | ||
24 | 25 | ||
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | 28 | ||
@@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
195 | if (pmd_trans_huge(*old_pmd)) { | 195 | if (pmd_trans_huge(*old_pmd)) { |
196 | int err = 0; | 196 | int err = 0; |
197 | if (extent == HPAGE_PMD_SIZE) { | 197 | if (extent == HPAGE_PMD_SIZE) { |
198 | VM_BUG_ON(vma->vm_file || !vma->anon_vma); | 198 | VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, |
199 | vma); | ||
199 | /* See comment in move_ptes() */ | 200 | /* See comment in move_ptes() */ |
200 | if (need_rmap_locks) | 201 | if (need_rmap_locks) |
201 | anon_vma_lock_write(vma->anon_vma); | 202 | anon_vma_lock_write(vma->anon_vma); |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7ed58602e71b..7c7ab32ee503 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void) | |||
119 | phys_addr_t start, end; | 119 | phys_addr_t start, end; |
120 | u64 i; | 120 | u64 i; |
121 | 121 | ||
122 | memblock_clear_hotplug(0, -1); | ||
123 | |||
122 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) | 124 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) |
123 | count += __free_memory_core(start, end); | 125 | count += __free_memory_core(start, end); |
124 | 126 | ||
diff --git a/mm/nommu.c b/mm/nommu.c index a881d9673c6b..bd1808e194a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -539,7 +539,7 @@ void __init mmap_init(void) | |||
539 | { | 539 | { |
540 | int ret; | 540 | int ret; |
541 | 541 | ||
542 | ret = percpu_counter_init(&vm_committed_as, 0); | 542 | ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); |
543 | VM_BUG_ON(ret); | 543 | VM_BUG_ON(ret); |
544 | vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); | 544 | vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); |
545 | } | 545 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e11df8fa7ec..bbf405a3a18f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
565 | 565 | ||
566 | spin_lock(&zone_scan_lock); | 566 | spin_lock(&zone_scan_lock); |
567 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | 567 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
568 | if (zone_is_oom_locked(zone)) { | 568 | if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { |
569 | ret = false; | 569 | ret = false; |
570 | goto out; | 570 | goto out; |
571 | } | 571 | } |
@@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
575 | * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. | 575 | * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. |
576 | */ | 576 | */ |
577 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | 577 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
578 | zone_set_flag(zone, ZONE_OOM_LOCKED); | 578 | set_bit(ZONE_OOM_LOCKED, &zone->flags); |
579 | 579 | ||
580 | out: | 580 | out: |
581 | spin_unlock(&zone_scan_lock); | 581 | spin_unlock(&zone_scan_lock); |
@@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
594 | 594 | ||
595 | spin_lock(&zone_scan_lock); | 595 | spin_lock(&zone_scan_lock); |
596 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | 596 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
597 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | 597 | clear_bit(ZONE_OOM_LOCKED, &zone->flags); |
598 | spin_unlock(&zone_scan_lock); | 598 | spin_unlock(&zone_scan_lock); |
599 | } | 599 | } |
600 | 600 | ||
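The OOM hunks above replace per-flag wrappers (zone_set_flag(), zone_is_oom_locked(), and friends) with generic set_bit()/test_bit()/clear_bit() on zone->flags. The kernel's bit helpers are atomic; the non-atomic sketch below only illustrates the shape of the change, with invented flag names.

/* Sketch: generic bit operations on a flags word instead of wrappers. */
#include <stdio.h>

enum { ZONE_OOM_LOCKED, ZONE_FAIR_DEPLETED };

static void set_flag(unsigned long *flags, int bit)    { *flags |= 1UL << bit; }
static void clear_flag(unsigned long *flags, int bit)  { *flags &= ~(1UL << bit); }
static int  test_flag(unsigned long flags, int bit)    { return !!(flags & (1UL << bit)); }

int main(void)
{
	unsigned long zone_flags = 0;

	set_flag(&zone_flags, ZONE_OOM_LOCKED);
	printf("oom locked: %d\n", test_flag(zone_flags, ZONE_OOM_LOCKED));
	clear_flag(&zone_flags, ZONE_OOM_LOCKED);
	printf("oom locked: %d\n", test_flag(zone_flags, ZONE_OOM_LOCKED));
	return 0;
}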
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef1744d..ff24c9d83112 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
1075 | } | 1075 | } |
1076 | 1076 | ||
1077 | if (dirty < setpoint) { | 1077 | if (dirty < setpoint) { |
1078 | x = min(bdi->balanced_dirty_ratelimit, | 1078 | x = min3(bdi->balanced_dirty_ratelimit, |
1079 | min(balanced_dirty_ratelimit, task_ratelimit)); | 1079 | balanced_dirty_ratelimit, task_ratelimit); |
1080 | if (dirty_ratelimit < x) | 1080 | if (dirty_ratelimit < x) |
1081 | step = x - dirty_ratelimit; | 1081 | step = x - dirty_ratelimit; |
1082 | } else { | 1082 | } else { |
1083 | x = max(bdi->balanced_dirty_ratelimit, | 1083 | x = max3(bdi->balanced_dirty_ratelimit, |
1084 | max(balanced_dirty_ratelimit, task_ratelimit)); | 1084 | balanced_dirty_ratelimit, task_ratelimit); |
1085 | if (dirty_ratelimit > x) | 1085 | if (dirty_ratelimit > x) |
1086 | step = dirty_ratelimit - x; | 1086 | step = dirty_ratelimit - x; |
1087 | } | 1087 | } |
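min3()/max3() simply collapse the nested two-argument min()/max() calls that the old ratelimit code open-coded. The kernel versions are type-checking macros; the plain-function sketch below just shows the equivalence.

/* Sketch: min3(a, b, c) is min(a, min(b, c)). */
#include <stdio.h>

static unsigned long min2(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long min3(unsigned long a, unsigned long b, unsigned long c)
{
	return min2(a, min2(b, c));
}

int main(void)
{
	printf("%lu\n", min3(7, 3, 5));	/* 3, same as min2(7, min2(3, 5)) */
	return 0;
}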
@@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) | |||
1777 | writeback_set_ratelimit(); | 1777 | writeback_set_ratelimit(); |
1778 | register_cpu_notifier(&ratelimit_nb); | 1778 | register_cpu_notifier(&ratelimit_nb); |
1779 | 1779 | ||
1780 | fprop_global_init(&writeout_completions); | 1780 | fprop_global_init(&writeout_completions, GFP_KERNEL); |
1781 | } | 1781 | } |
1782 | 1782 | ||
1783 | /** | 1783 | /** |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18cee0d4c8a2..c9710c9bbee2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -53,8 +53,6 @@ | |||
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
55 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
56 | #include <linux/ftrace_event.h> | ||
57 | #include <linux/memcontrol.h> | ||
58 | #include <linux/prefetch.h> | 56 | #include <linux/prefetch.h> |
59 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
60 | #include <linux/migrate.h> | 58 | #include <linux/migrate.h> |
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node); | |||
85 | */ | 83 | */ |
86 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ | 84 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
87 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); | 85 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
86 | int _node_numa_mem_[MAX_NUMNODES]; | ||
88 | #endif | 87 | #endif |
89 | 88 | ||
90 | /* | 89 | /* |
@@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone, | |||
1014 | * Remove at a later date when no bug reports exist related to | 1013 | * Remove at a later date when no bug reports exist related to |
1015 | * grouping pages by mobility | 1014 | * grouping pages by mobility |
1016 | */ | 1015 | */ |
1017 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | 1016 | VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); |
1018 | #endif | 1017 | #endif |
1019 | 1018 | ||
1020 | for (page = start_page; page <= end_page;) { | 1019 | for (page = start_page; page <= end_page;) { |
@@ -1612,9 +1611,9 @@ again: | |||
1612 | } | 1611 | } |
1613 | 1612 | ||
1614 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1613 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
1615 | if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && | 1614 | if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && |
1616 | !zone_is_fair_depleted(zone)) | 1615 | !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) |
1617 | zone_set_flag(zone, ZONE_FAIR_DEPLETED); | 1616 | set_bit(ZONE_FAIR_DEPLETED, &zone->flags); |
1618 | 1617 | ||
1619 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1618 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1620 | zone_statistics(preferred_zone, zone, gfp_flags); | 1619 | zone_statistics(preferred_zone, zone, gfp_flags); |
@@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
1934 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | 1933 | mod_zone_page_state(zone, NR_ALLOC_BATCH, |
1935 | high_wmark_pages(zone) - low_wmark_pages(zone) - | 1934 | high_wmark_pages(zone) - low_wmark_pages(zone) - |
1936 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | 1935 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); |
1937 | zone_clear_flag(zone, ZONE_FAIR_DEPLETED); | 1936 | clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); |
1938 | } while (zone++ != preferred_zone); | 1937 | } while (zone++ != preferred_zone); |
1939 | } | 1938 | } |
1940 | 1939 | ||
@@ -1985,7 +1984,7 @@ zonelist_scan: | |||
1985 | if (alloc_flags & ALLOC_FAIR) { | 1984 | if (alloc_flags & ALLOC_FAIR) { |
1986 | if (!zone_local(preferred_zone, zone)) | 1985 | if (!zone_local(preferred_zone, zone)) |
1987 | break; | 1986 | break; |
1988 | if (zone_is_fair_depleted(zone)) { | 1987 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
1989 | nr_fair_skipped++; | 1988 | nr_fair_skipped++; |
1990 | continue; | 1989 | continue; |
1991 | } | 1990 | } |
@@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2296 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2295 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2297 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2296 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2298 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2297 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2299 | bool *contended_compaction, bool *deferred_compaction, | 2298 | int *contended_compaction, bool *deferred_compaction) |
2300 | unsigned long *did_some_progress) | ||
2301 | { | 2299 | { |
2302 | if (!order) | 2300 | struct zone *last_compact_zone = NULL; |
2303 | return NULL; | 2301 | unsigned long compact_result; |
2302 | struct page *page; | ||
2304 | 2303 | ||
2305 | if (compaction_deferred(preferred_zone, order)) { | 2304 | if (!order) |
2306 | *deferred_compaction = true; | ||
2307 | return NULL; | 2305 | return NULL; |
2308 | } | ||
2309 | 2306 | ||
2310 | current->flags |= PF_MEMALLOC; | 2307 | current->flags |= PF_MEMALLOC; |
2311 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2308 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, |
2312 | nodemask, mode, | 2309 | nodemask, mode, |
2313 | contended_compaction); | 2310 | contended_compaction, |
2311 | &last_compact_zone); | ||
2314 | current->flags &= ~PF_MEMALLOC; | 2312 | current->flags &= ~PF_MEMALLOC; |
2315 | 2313 | ||
2316 | if (*did_some_progress != COMPACT_SKIPPED) { | 2314 | switch (compact_result) { |
2317 | struct page *page; | 2315 | case COMPACT_DEFERRED: |
2316 | *deferred_compaction = true; | ||
2317 | /* fall-through */ | ||
2318 | case COMPACT_SKIPPED: | ||
2319 | return NULL; | ||
2320 | default: | ||
2321 | break; | ||
2322 | } | ||
2318 | 2323 | ||
2319 | /* Page migration frees to the PCP lists but we want merging */ | 2324 | /* |
2320 | drain_pages(get_cpu()); | 2325 | * At least in one zone compaction wasn't deferred or skipped, so let's |
2321 | put_cpu(); | 2326 | * count a compaction stall |
2327 | */ | ||
2328 | count_vm_event(COMPACTSTALL); | ||
2322 | 2329 | ||
2323 | page = get_page_from_freelist(gfp_mask, nodemask, | 2330 | /* Page migration frees to the PCP lists but we want merging */ |
2324 | order, zonelist, high_zoneidx, | 2331 | drain_pages(get_cpu()); |
2325 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2332 | put_cpu(); |
2326 | preferred_zone, classzone_idx, migratetype); | ||
2327 | if (page) { | ||
2328 | preferred_zone->compact_blockskip_flush = false; | ||
2329 | compaction_defer_reset(preferred_zone, order, true); | ||
2330 | count_vm_event(COMPACTSUCCESS); | ||
2331 | return page; | ||
2332 | } | ||
2333 | 2333 | ||
2334 | /* | 2334 | page = get_page_from_freelist(gfp_mask, nodemask, |
2335 | * It's bad if compaction run occurs and fails. | 2335 | order, zonelist, high_zoneidx, |
2336 | * The most likely reason is that pages exist, | 2336 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2337 | * but not enough to satisfy watermarks. | 2337 | preferred_zone, classzone_idx, migratetype); |
2338 | */ | ||
2339 | count_vm_event(COMPACTFAIL); | ||
2340 | 2338 | ||
2341 | /* | 2339 | if (page) { |
2342 | * As async compaction considers a subset of pageblocks, only | 2340 | struct zone *zone = page_zone(page); |
2343 | * defer if the failure was a sync compaction failure. | ||
2344 | */ | ||
2345 | if (mode != MIGRATE_ASYNC) | ||
2346 | defer_compaction(preferred_zone, order); | ||
2347 | 2341 | ||
2348 | cond_resched(); | 2342 | zone->compact_blockskip_flush = false; |
2343 | compaction_defer_reset(zone, order, true); | ||
2344 | count_vm_event(COMPACTSUCCESS); | ||
2345 | return page; | ||
2349 | } | 2346 | } |
2350 | 2347 | ||
2348 | /* | ||
2349 | * last_compact_zone is where try_to_compact_pages thought allocation | ||
2350 | * should succeed, so it did not defer compaction. But here we know | ||
2351 | * that it didn't succeed, so we do the defer. | ||
2352 | */ | ||
2353 | if (last_compact_zone && mode != MIGRATE_ASYNC) | ||
2354 | defer_compaction(last_compact_zone, order); | ||
2355 | |||
2356 | /* | ||
2357 | * It's bad if compaction run occurs and fails. The most likely reason | ||
2358 | * is that pages exist, but not enough to satisfy watermarks. | ||
2359 | */ | ||
2360 | count_vm_event(COMPACTFAIL); | ||
2361 | |||
2362 | cond_resched(); | ||
2363 | |||
2351 | return NULL; | 2364 | return NULL; |
2352 | } | 2365 | } |
2353 | #else | 2366 | #else |
@@ -2355,9 +2368,8 @@ static inline struct page * | |||
2355 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2368 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2356 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2369 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2357 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2370 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2358 | int classzone_idx, int migratetype, | 2371 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2359 | enum migrate_mode mode, bool *contended_compaction, | 2372 | int *contended_compaction, bool *deferred_compaction) |
2360 | bool *deferred_compaction, unsigned long *did_some_progress) | ||
2361 | { | 2373 | { |
2362 | return NULL; | 2374 | return NULL; |
2363 | } | 2375 | } |
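The reworked __alloc_pages_direct_compact() above receives an explicit result code from try_to_compact_pages() and handles "deferred" by recording that fact and then taking the same early exit as "skipped"; only after that does it count a stall and retry the freelists. A schematic of that control flow with illustrative enum values, not the kernel's:

/* Sketch of the result-code dispatch, including the deliberate fallthrough. */
#include <stdbool.h>
#include <stdio.h>

enum compact_result { COMPACT_DEFERRED, COMPACT_SKIPPED, COMPACT_PARTIAL };

static bool should_retry_allocation(enum compact_result res, bool *deferred)
{
	switch (res) {
	case COMPACT_DEFERRED:
		*deferred = true;
		/* fall through */
	case COMPACT_SKIPPED:
		return false;		/* nothing to gain, bail out early */
	default:
		return true;		/* compaction ran; try the freelists */
	}
}

int main(void)
{
	bool deferred = false;

	printf("deferred: retry=%d\n",
	       should_retry_allocation(COMPACT_DEFERRED, &deferred));
	printf("deferred flag now %d\n", deferred);
	printf("partial:  retry=%d\n",
	       should_retry_allocation(COMPACT_PARTIAL, &deferred));
	return 0;
}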
@@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
2457 | static void wake_all_kswapds(unsigned int order, | 2469 | static void wake_all_kswapds(unsigned int order, |
2458 | struct zonelist *zonelist, | 2470 | struct zonelist *zonelist, |
2459 | enum zone_type high_zoneidx, | 2471 | enum zone_type high_zoneidx, |
2460 | struct zone *preferred_zone) | 2472 | struct zone *preferred_zone, |
2473 | nodemask_t *nodemask) | ||
2461 | { | 2474 | { |
2462 | struct zoneref *z; | 2475 | struct zoneref *z; |
2463 | struct zone *zone; | 2476 | struct zone *zone; |
2464 | 2477 | ||
2465 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 2478 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2479 | high_zoneidx, nodemask) | ||
2466 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2480 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); |
2467 | } | 2481 | } |
2468 | 2482 | ||
@@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2509 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2523 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2510 | } | 2524 | } |
2511 | #ifdef CONFIG_CMA | 2525 | #ifdef CONFIG_CMA |
2512 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2526 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
2513 | alloc_flags |= ALLOC_CMA; | 2527 | alloc_flags |= ALLOC_CMA; |
2514 | #endif | 2528 | #endif |
2515 | return alloc_flags; | 2529 | return alloc_flags; |
@@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2533 | unsigned long did_some_progress; | 2547 | unsigned long did_some_progress; |
2534 | enum migrate_mode migration_mode = MIGRATE_ASYNC; | 2548 | enum migrate_mode migration_mode = MIGRATE_ASYNC; |
2535 | bool deferred_compaction = false; | 2549 | bool deferred_compaction = false; |
2536 | bool contended_compaction = false; | 2550 | int contended_compaction = COMPACT_CONTENDED_NONE; |
2537 | 2551 | ||
2538 | /* | 2552 | /* |
2539 | * In the slowpath, we sanity check order to avoid ever trying to | 2553 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2560 | 2574 | ||
2561 | restart: | 2575 | restart: |
2562 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2576 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2563 | wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); | 2577 | wake_all_kswapds(order, zonelist, high_zoneidx, |
2578 | preferred_zone, nodemask); | ||
2564 | 2579 | ||
2565 | /* | 2580 | /* |
2566 | * OK, we're below the kswapd watermark and have kicked background | 2581 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2633,20 +2648,40 @@ rebalance: | |||
2633 | preferred_zone, | 2648 | preferred_zone, |
2634 | classzone_idx, migratetype, | 2649 | classzone_idx, migratetype, |
2635 | migration_mode, &contended_compaction, | 2650 | migration_mode, &contended_compaction, |
2636 | &deferred_compaction, | 2651 | &deferred_compaction); |
2637 | &did_some_progress); | ||
2638 | if (page) | 2652 | if (page) |
2639 | goto got_pg; | 2653 | goto got_pg; |
2640 | 2654 | ||
2641 | /* | 2655 | /* Checks for THP-specific high-order allocations */ |
2642 | * If compaction is deferred for high-order allocations, it is because | 2656 | if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { |
2643 | * sync compaction recently failed. In this is the case and the caller | 2657 | /* |
2644 | * requested a movable allocation that does not heavily disrupt the | 2658 | * If compaction is deferred for high-order allocations, it is |
2645 | * system then fail the allocation instead of entering direct reclaim. | 2659 | * because sync compaction recently failed. If this is the case |
2646 | */ | 2660 | * and the caller requested a THP allocation, we do not want |
2647 | if ((deferred_compaction || contended_compaction) && | 2661 | * to heavily disrupt the system, so we fail the allocation |
2648 | (gfp_mask & __GFP_NO_KSWAPD)) | 2662 | * instead of entering direct reclaim. |
2649 | goto nopage; | 2663 | */ |
2664 | if (deferred_compaction) | ||
2665 | goto nopage; | ||
2666 | |||
2667 | /* | ||
2668 | * In all zones where compaction was attempted (and not | ||
2669 | * deferred or skipped), lock contention has been detected. | ||
2670 | * For THP allocation we do not want to disrupt the others | ||
2671 | * so we fallback to base pages instead. | ||
2672 | */ | ||
2673 | if (contended_compaction == COMPACT_CONTENDED_LOCK) | ||
2674 | goto nopage; | ||
2675 | |||
2676 | /* | ||
2677 | * If compaction was aborted due to need_resched(), we do not | ||
2678 | * want to further increase allocation latency, unless it is | ||
2679 | * khugepaged trying to collapse. | ||
2680 | */ | ||
2681 | if (contended_compaction == COMPACT_CONTENDED_SCHED | ||
2682 | && !(current->flags & PF_KTHREAD)) | ||
2683 | goto nopage; | ||
2684 | } | ||
2650 | 2685 | ||
2651 | /* | 2686 | /* |
2652 | * It can become very expensive to allocate transparent hugepages at | 2687 | * It can become very expensive to allocate transparent hugepages at |
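The new block above confines the bail-out heuristics to THP allocations: skip direct reclaim when compaction was deferred, when every attempted zone hit lock contention, or when compaction aborted for need_resched() and the caller is not a kernel thread (so khugepaged still proceeds). A condensed sketch of that decision with illustrative names:

/* Sketch of the THP-only early-exit decision described in the comments above. */
#include <stdbool.h>
#include <stdio.h>

enum contended { CONTENDED_NONE, CONTENDED_SCHED, CONTENDED_LOCK };

static bool thp_should_bail(bool is_thp, bool deferred,
			    enum contended contended, bool is_kthread)
{
	if (!is_thp)
		return false;			/* only THP gets the early exit */
	if (deferred)
		return true;			/* sync compaction failed recently */
	if (contended == CONTENDED_LOCK)
		return true;			/* don't disrupt other tasks */
	if (contended == CONTENDED_SCHED && !is_kthread)
		return true;			/* don't add latency, unless a kthread */
	return false;
}

int main(void)
{
	printf("%d\n", thp_should_bail(true, false, CONTENDED_SCHED, false));
	printf("%d\n", thp_should_bail(true, false, CONTENDED_SCHED, true));
	return 0;
}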
@@ -2726,8 +2761,7 @@ rebalance: | |||
2726 | preferred_zone, | 2761 | preferred_zone, |
2727 | classzone_idx, migratetype, | 2762 | classzone_idx, migratetype, |
2728 | migration_mode, &contended_compaction, | 2763 | migration_mode, &contended_compaction, |
2729 | &deferred_compaction, | 2764 | &deferred_compaction); |
2730 | &did_some_progress); | ||
2731 | if (page) | 2765 | if (page) |
2732 | goto got_pg; | 2766 | goto got_pg; |
2733 | } | 2767 | } |
@@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2753 | struct zone *preferred_zone; | 2787 | struct zone *preferred_zone; |
2754 | struct zoneref *preferred_zoneref; | 2788 | struct zoneref *preferred_zoneref; |
2755 | struct page *page = NULL; | 2789 | struct page *page = NULL; |
2756 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2790 | int migratetype = gfpflags_to_migratetype(gfp_mask); |
2757 | unsigned int cpuset_mems_cookie; | 2791 | unsigned int cpuset_mems_cookie; |
2758 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2792 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2759 | int classzone_idx; | 2793 | int classzone_idx; |
@@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2775 | if (unlikely(!zonelist->_zonerefs->zone)) | 2809 | if (unlikely(!zonelist->_zonerefs->zone)) |
2776 | return NULL; | 2810 | return NULL; |
2777 | 2811 | ||
2812 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | ||
2813 | alloc_flags |= ALLOC_CMA; | ||
2814 | |||
2778 | retry_cpuset: | 2815 | retry_cpuset: |
2779 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2816 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2780 | 2817 | ||
@@ -2786,10 +2823,6 @@ retry_cpuset: | |||
2786 | goto out; | 2823 | goto out; |
2787 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2824 | classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2788 | 2825 | ||
2789 | #ifdef CONFIG_CMA | ||
2790 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2791 | alloc_flags |= ALLOC_CMA; | ||
2792 | #endif | ||
2793 | /* First allocation attempt */ | 2826 | /* First allocation attempt */ |
2794 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2827 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2795 | zonelist, high_zoneidx, alloc_flags, | 2828 | zonelist, high_zoneidx, alloc_flags, |
@@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | |||
3579 | zonelist->_zonerefs[pos].zone_idx = 0; | 3612 | zonelist->_zonerefs[pos].zone_idx = 0; |
3580 | } | 3613 | } |
3581 | 3614 | ||
3615 | #if defined(CONFIG_64BIT) | ||
3616 | /* | ||
3617 | * Devices that require DMA32/DMA are relatively rare and do not justify a | ||
3618 | * penalty to every machine in case the specialised case applies. Default | ||
3619 | * to Node-ordering on 64-bit NUMA machines | ||
3620 | */ | ||
3621 | static int default_zonelist_order(void) | ||
3622 | { | ||
3623 | return ZONELIST_ORDER_NODE; | ||
3624 | } | ||
3625 | #else | ||
3626 | /* | ||
3627 | * On 32-bit, the Normal zone needs to be preserved for allocations accessible | ||
3628 | * by the kernel. If processes running on node 0 deplete the low memory zone | ||
3629 | * then reclaim will occur more frequency increasing stalls and potentially | ||
3630 | * be easier to OOM if a large percentage of the zone is under writeback or | ||
3631 | * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. | ||
3632 | * Hence, default to zone ordering on 32-bit. | ||
3633 | */ | ||
3582 | static int default_zonelist_order(void) | 3634 | static int default_zonelist_order(void) |
3583 | { | 3635 | { |
3584 | int nid, zone_type; | ||
3585 | unsigned long low_kmem_size, total_size; | ||
3586 | struct zone *z; | ||
3587 | int average_size; | ||
3588 | /* | ||
3589 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. | ||
3590 | * If they are really small and used heavily, the system can fall | ||
3591 | * into OOM very easily. | ||
3592 | * This function detect ZONE_DMA/DMA32 size and configures zone order. | ||
3593 | */ | ||
3594 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | ||
3595 | low_kmem_size = 0; | ||
3596 | total_size = 0; | ||
3597 | for_each_online_node(nid) { | ||
3598 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
3599 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
3600 | if (populated_zone(z)) { | ||
3601 | if (zone_type < ZONE_NORMAL) | ||
3602 | low_kmem_size += z->managed_pages; | ||
3603 | total_size += z->managed_pages; | ||
3604 | } else if (zone_type == ZONE_NORMAL) { | ||
3605 | /* | ||
3606 | * If any node has only lowmem, then node order | ||
3607 | * is preferred to allow kernel allocations | ||
3608 | * locally; otherwise, they can easily infringe | ||
3609 | * on other nodes when there is an abundance of | ||
3610 | * lowmem available to allocate from. | ||
3611 | */ | ||
3612 | return ZONELIST_ORDER_NODE; | ||
3613 | } | ||
3614 | } | ||
3615 | } | ||
3616 | if (!low_kmem_size || /* there are no DMA area. */ | ||
3617 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ | ||
3618 | return ZONELIST_ORDER_NODE; | ||
3619 | /* | ||
3620 | * look into each node's config. | ||
3621 | * If there is a node whose DMA/DMA32 memory is very big area on | ||
3622 | * local memory, NODE_ORDER may be suitable. | ||
3623 | */ | ||
3624 | average_size = total_size / | ||
3625 | (nodes_weight(node_states[N_MEMORY]) + 1); | ||
3626 | for_each_online_node(nid) { | ||
3627 | low_kmem_size = 0; | ||
3628 | total_size = 0; | ||
3629 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
3630 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
3631 | if (populated_zone(z)) { | ||
3632 | if (zone_type < ZONE_NORMAL) | ||
3633 | low_kmem_size += z->present_pages; | ||
3634 | total_size += z->present_pages; | ||
3635 | } | ||
3636 | } | ||
3637 | if (low_kmem_size && | ||
3638 | total_size > average_size && /* ignore small node */ | ||
3639 | low_kmem_size > total_size * 70/100) | ||
3640 | return ZONELIST_ORDER_NODE; | ||
3641 | } | ||
3642 | return ZONELIST_ORDER_ZONE; | 3636 | return ZONELIST_ORDER_ZONE; |
3643 | } | 3637 | } |
3638 | #endif /* CONFIG_64BIT */ | ||
3644 | 3639 | ||
3645 | static void set_zonelist_order(void) | 3640 | static void set_zonelist_order(void) |
3646 | { | 3641 | { |
@@ -5701,9 +5696,8 @@ static void __setup_per_zone_wmarks(void) | |||
5701 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5696 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5702 | 5697 | ||
5703 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, | 5698 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, |
5704 | high_wmark_pages(zone) - | 5699 | high_wmark_pages(zone) - low_wmark_pages(zone) - |
5705 | low_wmark_pages(zone) - | 5700 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); |
5706 | zone_page_state(zone, NR_ALLOC_BATCH)); | ||
5707 | 5701 | ||
5708 | setup_zone_migrate_reserve(zone); | 5702 | setup_zone_migrate_reserve(zone); |
5709 | spin_unlock_irqrestore(&zone->lock, flags); | 5703 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -6278,8 +6272,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
6278 | 6272 | ||
6279 | if (list_empty(&cc->migratepages)) { | 6273 | if (list_empty(&cc->migratepages)) { |
6280 | cc->nr_migratepages = 0; | 6274 | cc->nr_migratepages = 0; |
6281 | pfn = isolate_migratepages_range(cc->zone, cc, | 6275 | pfn = isolate_migratepages_range(cc, pfn, end); |
6282 | pfn, end, true); | ||
6283 | if (!pfn) { | 6276 | if (!pfn) { |
6284 | ret = -EINTR; | 6277 | ret = -EINTR; |
6285 | break; | 6278 | break; |
@@ -6555,97 +6548,3 @@ bool is_free_buddy_page(struct page *page) | |||
6555 | return order < MAX_ORDER; | 6548 | return order < MAX_ORDER; |
6556 | } | 6549 | } |
6557 | #endif | 6550 | #endif |
6558 | |||
6559 | static const struct trace_print_flags pageflag_names[] = { | ||
6560 | {1UL << PG_locked, "locked" }, | ||
6561 | {1UL << PG_error, "error" }, | ||
6562 | {1UL << PG_referenced, "referenced" }, | ||
6563 | {1UL << PG_uptodate, "uptodate" }, | ||
6564 | {1UL << PG_dirty, "dirty" }, | ||
6565 | {1UL << PG_lru, "lru" }, | ||
6566 | {1UL << PG_active, "active" }, | ||
6567 | {1UL << PG_slab, "slab" }, | ||
6568 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
6569 | {1UL << PG_arch_1, "arch_1" }, | ||
6570 | {1UL << PG_reserved, "reserved" }, | ||
6571 | {1UL << PG_private, "private" }, | ||
6572 | {1UL << PG_private_2, "private_2" }, | ||
6573 | {1UL << PG_writeback, "writeback" }, | ||
6574 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
6575 | {1UL << PG_head, "head" }, | ||
6576 | {1UL << PG_tail, "tail" }, | ||
6577 | #else | ||
6578 | {1UL << PG_compound, "compound" }, | ||
6579 | #endif | ||
6580 | {1UL << PG_swapcache, "swapcache" }, | ||
6581 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
6582 | {1UL << PG_reclaim, "reclaim" }, | ||
6583 | {1UL << PG_swapbacked, "swapbacked" }, | ||
6584 | {1UL << PG_unevictable, "unevictable" }, | ||
6585 | #ifdef CONFIG_MMU | ||
6586 | {1UL << PG_mlocked, "mlocked" }, | ||
6587 | #endif | ||
6588 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
6589 | {1UL << PG_uncached, "uncached" }, | ||
6590 | #endif | ||
6591 | #ifdef CONFIG_MEMORY_FAILURE | ||
6592 | {1UL << PG_hwpoison, "hwpoison" }, | ||
6593 | #endif | ||
6594 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
6595 | {1UL << PG_compound_lock, "compound_lock" }, | ||
6596 | #endif | ||
6597 | }; | ||
6598 | |||
6599 | static void dump_page_flags(unsigned long flags) | ||
6600 | { | ||
6601 | const char *delim = ""; | ||
6602 | unsigned long mask; | ||
6603 | int i; | ||
6604 | |||
6605 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
6606 | |||
6607 | printk(KERN_ALERT "page flags: %#lx(", flags); | ||
6608 | |||
6609 | /* remove zone id */ | ||
6610 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
6611 | |||
6612 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { | ||
6613 | |||
6614 | mask = pageflag_names[i].mask; | ||
6615 | if ((flags & mask) != mask) | ||
6616 | continue; | ||
6617 | |||
6618 | flags &= ~mask; | ||
6619 | printk("%s%s", delim, pageflag_names[i].name); | ||
6620 | delim = "|"; | ||
6621 | } | ||
6622 | |||
6623 | /* check for left over flags */ | ||
6624 | if (flags) | ||
6625 | printk("%s%#lx", delim, flags); | ||
6626 | |||
6627 | printk(")\n"); | ||
6628 | } | ||
6629 | |||
6630 | void dump_page_badflags(struct page *page, const char *reason, | ||
6631 | unsigned long badflags) | ||
6632 | { | ||
6633 | printk(KERN_ALERT | ||
6634 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
6635 | page, atomic_read(&page->_count), page_mapcount(page), | ||
6636 | page->mapping, page->index); | ||
6637 | dump_page_flags(page->flags); | ||
6638 | if (reason) | ||
6639 | pr_alert("page dumped because: %s\n", reason); | ||
6640 | if (page->flags & badflags) { | ||
6641 | pr_alert("bad because of flags:\n"); | ||
6642 | dump_page_flags(page->flags & badflags); | ||
6643 | } | ||
6644 | mem_cgroup_print_bad_page(page); | ||
6645 | } | ||
6646 | |||
6647 | void dump_page(struct page *page, const char *reason) | ||
6648 | { | ||
6649 | dump_page_badflags(page, reason, 0); | ||
6650 | } | ||
6651 | EXPORT_SYMBOL(dump_page); | ||
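The dump_page() family is deleted from page_alloc.c in this hunk; judging by the rest of the series it is consolidated into a dedicated mm debugging file rather than dropped, so existing callers keep working. A minimal, purely illustrative use of the interface as defined above (the helper function and the reason string are invented for the example):

    #include <linux/mm.h>
    #include <linux/page-flags.h>

    /* Sketch: complain about a page that unexpectedly carries LRU state.
     * dump_page() prints refcount, mapcount, mapping, index and the decoded
     * page flags; dump_page_badflags() additionally re-prints the flags
     * that intersect the supplied mask. */
    static void report_unexpected_lru(struct page *page)
    {
            if (PageLRU(page))
                    dump_page_badflags(page, "unexpectedly on LRU",
                                       1UL << PG_lru);
    }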
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2beeabf502c5..ad83195521f2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
177 | if (!walk->mm) | 177 | if (!walk->mm) |
178 | return -EINVAL; | 178 | return -EINVAL; |
179 | 179 | ||
180 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | 180 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); |
181 | 181 | ||
182 | pgd = pgd_offset(walk->mm, addr); | 182 | pgd = pgd_offset(walk->mm, addr); |
183 | do { | 183 | do { |
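walk_page_range() now asserts with VM_BUG_ON_MM() so that the offending mm_struct is dumped alongside the BUG rather than only a stack trace. Roughly, and only when CONFIG_DEBUG_VM is enabled (a simplified sketch, not the exact definition from <linux/mmdebug.h>):

    #define VM_BUG_ON_MM(cond, mm)                                  \
            do {                                                    \
                    if (unlikely(cond)) {                           \
                            dump_mm(mm);    /* print mm_struct state */ \
                            BUG();                                  \
                    }                                               \
            } while (0)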
diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fefc6a2..10e3d0b8a86d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c | |||
@@ -33,17 +33,14 @@ | |||
33 | 33 | ||
34 | #include <linux/log2.h> | 34 | #include <linux/log2.h> |
35 | 35 | ||
36 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | 36 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, |
37 | int page_start, int page_end) | ||
37 | { | 38 | { |
38 | unsigned int cpu; | ||
39 | |||
40 | for_each_possible_cpu(cpu) | ||
41 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
42 | |||
43 | return 0; | 39 | return 0; |
44 | } | 40 | } |
45 | 41 | ||
46 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | 42 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, |
43 | int page_start, int page_end) | ||
47 | { | 44 | { |
48 | /* nada */ | 45 | /* nada */ |
49 | } | 46 | } |
@@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void) | |||
70 | 67 | ||
71 | chunk->data = pages; | 68 | chunk->data = pages; |
72 | chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; | 69 | chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; |
70 | |||
71 | spin_lock_irq(&pcpu_lock); | ||
72 | pcpu_chunk_populated(chunk, 0, nr_pages); | ||
73 | spin_unlock_irq(&pcpu_lock); | ||
74 | |||
73 | return chunk; | 75 | return chunk; |
74 | } | 76 | } |
75 | 77 | ||
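With the kernel-memory backend the whole chunk is physically backed at creation time, so the per-page populate/depopulate hooks become no-ops and the new chunk is immediately accounted as fully populated under pcpu_lock. The memset that used to clear each cpu's area on population also goes away because, as the percpu.c hunks further down show, pcpu_alloc() now clears only the region it hands out. Callers convert a byte range into the page-index range these hooks now take; roughly:

    /* byte offset/size -> page index range passed to the chunk operations */
    int page_start = PFN_DOWN(off);         /* first page touched             */
    int page_end   = PFN_UP(off + size);    /* one past the last page touched */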
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 3707c71ae4cd..538998a137d2 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, | |||
20 | } | 20 | } |
21 | 21 | ||
22 | /** | 22 | /** |
23 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap | 23 | * pcpu_get_pages - get temp pages array |
24 | * @chunk: chunk of interest | 24 | * @chunk: chunk of interest |
25 | * @bitmapp: output parameter for bitmap | ||
26 | * @may_alloc: may allocate the array | ||
27 | * | 25 | * |
28 | * Returns pointer to array of pointers to struct page and bitmap, | 26 | * Returns pointer to array of pointers to struct page which can be indexed |
29 | * both of which can be indexed with pcpu_page_idx(). The returned | 27 | * with pcpu_page_idx(). Note that there is only one array and accesses |
30 | * array is cleared to zero and *@bitmapp is copied from | 28 | * should be serialized by pcpu_alloc_mutex. |
31 | * @chunk->populated. Note that there is only one array and bitmap | ||
32 | * and access exclusion is the caller's responsibility. | ||
33 | * | ||
34 | * CONTEXT: | ||
35 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. | ||
36 | * Otherwise, don't care. | ||
37 | * | 29 | * |
38 | * RETURNS: | 30 | * RETURNS: |
39 | * Pointer to temp pages array on success, NULL on failure. | 31 | * Pointer to temp pages array on success. |
40 | */ | 32 | */ |
41 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | 33 | static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) |
42 | unsigned long **bitmapp, | ||
43 | bool may_alloc) | ||
44 | { | 34 | { |
45 | static struct page **pages; | 35 | static struct page **pages; |
46 | static unsigned long *bitmap; | ||
47 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); | 36 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); |
48 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * | ||
49 | sizeof(unsigned long); | ||
50 | |||
51 | if (!pages || !bitmap) { | ||
52 | if (may_alloc && !pages) | ||
53 | pages = pcpu_mem_zalloc(pages_size); | ||
54 | if (may_alloc && !bitmap) | ||
55 | bitmap = pcpu_mem_zalloc(bitmap_size); | ||
56 | if (!pages || !bitmap) | ||
57 | return NULL; | ||
58 | } | ||
59 | 37 | ||
60 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); | 38 | lockdep_assert_held(&pcpu_alloc_mutex); |
61 | 39 | ||
62 | *bitmapp = bitmap; | 40 | if (!pages) |
41 | pages = pcpu_mem_zalloc(pages_size); | ||
63 | return pages; | 42 | return pages; |
64 | } | 43 | } |
65 | 44 | ||
@@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | |||
67 | * pcpu_free_pages - free pages which were allocated for @chunk | 46 | * pcpu_free_pages - free pages which were allocated for @chunk |
68 | * @chunk: chunk pages were allocated for | 47 | * @chunk: chunk pages were allocated for |
69 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() | 48 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() |
70 | * @populated: populated bitmap | ||
71 | * @page_start: page index of the first page to be freed | 49 | * @page_start: page index of the first page to be freed |
72 | * @page_end: page index of the last page to be freed + 1 | 50 | * @page_end: page index of the last page to be freed + 1 |
73 | * | 51 | * |
@@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | |||
75 | * The pages were allocated for @chunk. | 53 | * The pages were allocated for @chunk. |
76 | */ | 54 | */ |
77 | static void pcpu_free_pages(struct pcpu_chunk *chunk, | 55 | static void pcpu_free_pages(struct pcpu_chunk *chunk, |
78 | struct page **pages, unsigned long *populated, | 56 | struct page **pages, int page_start, int page_end) |
79 | int page_start, int page_end) | ||
80 | { | 57 | { |
81 | unsigned int cpu; | 58 | unsigned int cpu; |
82 | int i; | 59 | int i; |
@@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, | |||
95 | * pcpu_alloc_pages - allocates pages for @chunk | 72 | * pcpu_alloc_pages - allocates pages for @chunk |
96 | * @chunk: target chunk | 73 | * @chunk: target chunk |
97 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() | 74 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() |
98 | * @populated: populated bitmap | ||
99 | * @page_start: page index of the first page to be allocated | 75 | * @page_start: page index of the first page to be allocated |
100 | * @page_end: page index of the last page to be allocated + 1 | 76 | * @page_end: page index of the last page to be allocated + 1 |
101 | * | 77 | * |
@@ -104,11 +80,10 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, | |||
104 | * content of @pages and will pass it verbatim to pcpu_map_pages(). | 80 | * content of @pages and will pass it verbatim to pcpu_map_pages(). |
105 | */ | 81 | */ |
106 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | 82 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, |
107 | struct page **pages, unsigned long *populated, | 83 | struct page **pages, int page_start, int page_end) |
108 | int page_start, int page_end) | ||
109 | { | 84 | { |
110 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; | 85 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; |
111 | unsigned int cpu; | 86 | unsigned int cpu, tcpu; |
112 | int i; | 87 | int i; |
113 | 88 | ||
114 | for_each_possible_cpu(cpu) { | 89 | for_each_possible_cpu(cpu) { |
@@ -116,14 +91,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | |||
116 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; | 91 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; |
117 | 92 | ||
118 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); | 93 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); |
119 | if (!*pagep) { | 94 | if (!*pagep) |
120 | pcpu_free_pages(chunk, pages, populated, | 95 | goto err; |
121 | page_start, page_end); | ||
122 | return -ENOMEM; | ||
123 | } | ||
124 | } | 96 | } |
125 | } | 97 | } |
126 | return 0; | 98 | return 0; |
99 | |||
100 | err: | ||
101 | while (--i >= page_start) | ||
102 | __free_page(pages[pcpu_page_idx(cpu, i)]); | ||
103 | |||
104 | for_each_possible_cpu(tcpu) { | ||
105 | if (tcpu == cpu) | ||
106 | break; | ||
107 | for (i = page_start; i < page_end; i++) | ||
108 | __free_page(pages[pcpu_page_idx(tcpu, i)]); | ||
109 | } | ||
110 | return -ENOMEM; | ||
127 | } | 111 | } |
128 | 112 | ||
129 | /** | 113 | /** |
@@ -155,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | |||
155 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk | 139 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk |
156 | * @chunk: chunk of interest | 140 | * @chunk: chunk of interest |
157 | * @pages: pages array which can be used to pass information to free | 141 | * @pages: pages array which can be used to pass information to free |
158 | * @populated: populated bitmap | ||
159 | * @page_start: page index of the first page to unmap | 142 | * @page_start: page index of the first page to unmap |
160 | * @page_end: page index of the last page to unmap + 1 | 143 | * @page_end: page index of the last page to unmap + 1 |
161 | * | 144 | * |
@@ -166,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | |||
166 | * proper pre/post flush functions. | 149 | * proper pre/post flush functions. |
167 | */ | 150 | */ |
168 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | 151 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, |
169 | struct page **pages, unsigned long *populated, | 152 | struct page **pages, int page_start, int page_end) |
170 | int page_start, int page_end) | ||
171 | { | 153 | { |
172 | unsigned int cpu; | 154 | unsigned int cpu; |
173 | int i; | 155 | int i; |
@@ -183,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | |||
183 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), | 165 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
184 | page_end - page_start); | 166 | page_end - page_start); |
185 | } | 167 | } |
186 | |||
187 | bitmap_clear(populated, page_start, page_end - page_start); | ||
188 | } | 168 | } |
189 | 169 | ||
190 | /** | 170 | /** |
@@ -219,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, | |||
219 | * pcpu_map_pages - map pages into a pcpu_chunk | 199 | * pcpu_map_pages - map pages into a pcpu_chunk |
220 | * @chunk: chunk of interest | 200 | * @chunk: chunk of interest |
221 | * @pages: pages array containing pages to be mapped | 201 | * @pages: pages array containing pages to be mapped |
222 | * @populated: populated bitmap | ||
223 | * @page_start: page index of the first page to map | 202 | * @page_start: page index of the first page to map |
224 | * @page_end: page index of the last page to map + 1 | 203 | * @page_end: page index of the last page to map + 1 |
225 | * | 204 | * |
@@ -227,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, | |||
227 | * caller is responsible for calling pcpu_post_map_flush() after all | 206 | * caller is responsible for calling pcpu_post_map_flush() after all |
228 | * mappings are complete. | 207 | * mappings are complete. |
229 | * | 208 | * |
230 | * This function is responsible for setting corresponding bits in | 209 | * This function is responsible for setting up whatever is necessary for |
231 | * @chunk->populated bitmap and whatever is necessary for reverse | 210 | * reverse lookup (addr -> chunk). |
232 | * lookup (addr -> chunk). | ||
233 | */ | 211 | */ |
234 | static int pcpu_map_pages(struct pcpu_chunk *chunk, | 212 | static int pcpu_map_pages(struct pcpu_chunk *chunk, |
235 | struct page **pages, unsigned long *populated, | 213 | struct page **pages, int page_start, int page_end) |
236 | int page_start, int page_end) | ||
237 | { | 214 | { |
238 | unsigned int cpu, tcpu; | 215 | unsigned int cpu, tcpu; |
239 | int i, err; | 216 | int i, err; |
@@ -244,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, | |||
244 | page_end - page_start); | 221 | page_end - page_start); |
245 | if (err < 0) | 222 | if (err < 0) |
246 | goto err; | 223 | goto err; |
247 | } | ||
248 | 224 | ||
249 | /* mapping successful, link chunk and mark populated */ | 225 | for (i = page_start; i < page_end; i++) |
250 | for (i = page_start; i < page_end; i++) { | ||
251 | for_each_possible_cpu(cpu) | ||
252 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], | 226 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], |
253 | chunk); | 227 | chunk); |
254 | __set_bit(i, populated); | ||
255 | } | 228 | } |
256 | |||
257 | return 0; | 229 | return 0; |
258 | |||
259 | err: | 230 | err: |
260 | for_each_possible_cpu(tcpu) { | 231 | for_each_possible_cpu(tcpu) { |
261 | if (tcpu == cpu) | 232 | if (tcpu == cpu) |
@@ -263,6 +234,7 @@ err: | |||
263 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), | 234 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), |
264 | page_end - page_start); | 235 | page_end - page_start); |
265 | } | 236 | } |
237 | pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); | ||
266 | return err; | 238 | return err; |
267 | } | 239 | } |
268 | 240 | ||
@@ -289,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | |||
289 | /** | 261 | /** |
290 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk | 262 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk |
291 | * @chunk: chunk of interest | 263 | * @chunk: chunk of interest |
292 | * @off: offset to the area to populate | 264 | * @page_start: the start page |
293 | * @size: size of the area to populate in bytes | 265 | * @page_end: the end page |
294 | * | 266 | * |
295 | * For each cpu, populate and map pages [@page_start,@page_end) into | 267 | * For each cpu, populate and map pages [@page_start,@page_end) into |
296 | * @chunk. The area is cleared on return. | 268 | * @chunk. |
297 | * | 269 | * |
298 | * CONTEXT: | 270 | * CONTEXT: |
299 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. | 271 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. |
300 | */ | 272 | */ |
301 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | 273 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, |
274 | int page_start, int page_end) | ||
302 | { | 275 | { |
303 | int page_start = PFN_DOWN(off); | ||
304 | int page_end = PFN_UP(off + size); | ||
305 | int free_end = page_start, unmap_end = page_start; | ||
306 | struct page **pages; | 276 | struct page **pages; |
307 | unsigned long *populated; | ||
308 | unsigned int cpu; | ||
309 | int rs, re, rc; | ||
310 | |||
311 | /* quick path, check whether all pages are already there */ | ||
312 | rs = page_start; | ||
313 | pcpu_next_pop(chunk, &rs, &re, page_end); | ||
314 | if (rs == page_start && re == page_end) | ||
315 | goto clear; | ||
316 | 277 | ||
317 | /* need to allocate and map pages, this chunk can't be immutable */ | 278 | pages = pcpu_get_pages(chunk); |
318 | WARN_ON(chunk->immutable); | ||
319 | |||
320 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); | ||
321 | if (!pages) | 279 | if (!pages) |
322 | return -ENOMEM; | 280 | return -ENOMEM; |
323 | 281 | ||
324 | /* alloc and map */ | 282 | if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) |
325 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 283 | return -ENOMEM; |
326 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); | ||
327 | if (rc) | ||
328 | goto err_free; | ||
329 | free_end = re; | ||
330 | } | ||
331 | 284 | ||
332 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 285 | if (pcpu_map_pages(chunk, pages, page_start, page_end)) { |
333 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); | 286 | pcpu_free_pages(chunk, pages, page_start, page_end); |
334 | if (rc) | 287 | return -ENOMEM; |
335 | goto err_unmap; | ||
336 | unmap_end = re; | ||
337 | } | 288 | } |
338 | pcpu_post_map_flush(chunk, page_start, page_end); | 289 | pcpu_post_map_flush(chunk, page_start, page_end); |
339 | 290 | ||
340 | /* commit new bitmap */ | ||
341 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
342 | clear: | ||
343 | for_each_possible_cpu(cpu) | ||
344 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
345 | return 0; | 291 | return 0; |
346 | |||
347 | err_unmap: | ||
348 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); | ||
349 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) | ||
350 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
351 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); | ||
352 | err_free: | ||
353 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) | ||
354 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
355 | return rc; | ||
356 | } | 292 | } |
357 | 293 | ||
358 | /** | 294 | /** |
359 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk | 295 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk |
360 | * @chunk: chunk to depopulate | 296 | * @chunk: chunk to depopulate |
361 | * @off: offset to the area to depopulate | 297 | * @page_start: the start page |
362 | * @size: size of the area to depopulate in bytes | 298 | * @page_end: the end page |
363 | * | 299 | * |
364 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 300 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
365 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 301 | * from @chunk. |
366 | * and tlb after. | ||
367 | * | 302 | * |
368 | * CONTEXT: | 303 | * CONTEXT: |
369 | * pcpu_alloc_mutex. | 304 | * pcpu_alloc_mutex. |
370 | */ | 305 | */ |
371 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | 306 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, |
307 | int page_start, int page_end) | ||
372 | { | 308 | { |
373 | int page_start = PFN_DOWN(off); | ||
374 | int page_end = PFN_UP(off + size); | ||
375 | struct page **pages; | 309 | struct page **pages; |
376 | unsigned long *populated; | ||
377 | int rs, re; | ||
378 | |||
379 | /* quick path, check whether it's empty already */ | ||
380 | rs = page_start; | ||
381 | pcpu_next_unpop(chunk, &rs, &re, page_end); | ||
382 | if (rs == page_start && re == page_end) | ||
383 | return; | ||
384 | |||
385 | /* immutable chunks can't be depopulated */ | ||
386 | WARN_ON(chunk->immutable); | ||
387 | 310 | ||
388 | /* | 311 | /* |
389 | * If control reaches here, there must have been at least one | 312 | * If control reaches here, there must have been at least one |
390 | * successful population attempt so the temp pages array must | 313 | * successful population attempt so the temp pages array must |
391 | * be available now. | 314 | * be available now. |
392 | */ | 315 | */ |
393 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); | 316 | pages = pcpu_get_pages(chunk); |
394 | BUG_ON(!pages); | 317 | BUG_ON(!pages); |
395 | 318 | ||
396 | /* unmap and free */ | 319 | /* unmap and free */ |
397 | pcpu_pre_unmap_flush(chunk, page_start, page_end); | 320 | pcpu_pre_unmap_flush(chunk, page_start, page_end); |
398 | 321 | ||
399 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | 322 | pcpu_unmap_pages(chunk, pages, page_start, page_end); |
400 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
401 | 323 | ||
402 | /* no need to flush tlb, vmalloc will handle it lazily */ | 324 | /* no need to flush tlb, vmalloc will handle it lazily */ |
403 | 325 | ||
404 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | 326 | pcpu_free_pages(chunk, pages, page_start, page_end); |
405 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
406 | |||
407 | /* commit new bitmap */ | ||
408 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
409 | } | 327 | } |
410 | 328 | ||
411 | static struct pcpu_chunk *pcpu_create_chunk(void) | 329 | static struct pcpu_chunk *pcpu_create_chunk(void) |
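pcpu_get_pages() now hands out a single static scratch array, serialized by pcpu_alloc_mutex and sized for every unit, instead of the old pages-plus-bitmap pair. A worked sizing example with made-up numbers (64-bit kernel, 4 KiB pages):

    /* illustrative: pcpu_nr_units = 8, pcpu_unit_pages = 16 (64 KiB units)  */
    /* pages_size = 8 * 16 * sizeof(struct page *) = 128 * 8 = 1024 bytes    */
    /* allocated once via pcpu_mem_zalloc() on first use, then reused for    */
    /* every populate/depopulate while pcpu_alloc_mutex is held              */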
diff --git a/mm/percpu.c b/mm/percpu.c index 2139e30a4b44..014bab65e0ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -76,6 +76,10 @@ | |||
76 | 76 | ||
77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ | 77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ |
78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ | 78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ |
79 | #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 | ||
80 | #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 | ||
81 | #define PCPU_EMPTY_POP_PAGES_LOW 2 | ||
82 | #define PCPU_EMPTY_POP_PAGES_HIGH 4 | ||
79 | 83 | ||
80 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
81 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ | 85 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ |
@@ -102,12 +106,16 @@ struct pcpu_chunk { | |||
102 | int free_size; /* free bytes in the chunk */ | 106 | int free_size; /* free bytes in the chunk */ |
103 | int contig_hint; /* max contiguous size hint */ | 107 | int contig_hint; /* max contiguous size hint */ |
104 | void *base_addr; /* base address of this chunk */ | 108 | void *base_addr; /* base address of this chunk */ |
109 | |||
105 | int map_used; /* # of map entries used before the sentry */ | 110 | int map_used; /* # of map entries used before the sentry */ |
106 | int map_alloc; /* # of map entries allocated */ | 111 | int map_alloc; /* # of map entries allocated */ |
107 | int *map; /* allocation map */ | 112 | int *map; /* allocation map */ |
113 | struct work_struct map_extend_work;/* async ->map[] extension */ | ||
114 | |||
108 | void *data; /* chunk data */ | 115 | void *data; /* chunk data */ |
109 | int first_free; /* no free below this */ | 116 | int first_free; /* no free below this */ |
110 | bool immutable; /* no [de]population allowed */ | 117 | bool immutable; /* no [de]population allowed */ |
118 | int nr_populated; /* # of populated pages */ | ||
111 | unsigned long populated[]; /* populated bitmap */ | 119 | unsigned long populated[]; /* populated bitmap */ |
112 | }; | 120 | }; |
113 | 121 | ||
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk; | |||
151 | static struct pcpu_chunk *pcpu_reserved_chunk; | 159 | static struct pcpu_chunk *pcpu_reserved_chunk; |
152 | static int pcpu_reserved_chunk_limit; | 160 | static int pcpu_reserved_chunk_limit; |
153 | 161 | ||
162 | static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ | ||
163 | static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ | ||
164 | |||
165 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | ||
166 | |||
154 | /* | 167 | /* |
155 | * Synchronization rules. | 168 | * The number of empty populated pages, protected by pcpu_lock. The |
156 | * | 169 | * reserved chunk doesn't contribute to the count. |
157 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former | ||
158 | * protects allocation/reclaim paths, chunks, populated bitmap and | ||
159 | * vmalloc mapping. The latter is a spinlock and protects the index | ||
160 | * data structures - chunk slots, chunks and area maps in chunks. | ||
161 | * | ||
162 | * During allocation, pcpu_alloc_mutex is kept locked all the time and | ||
163 | * pcpu_lock is grabbed and released as necessary. All actual memory | ||
164 | * allocations are done using GFP_KERNEL with pcpu_lock released. In | ||
165 | * general, percpu memory can't be allocated with irq off but | ||
166 | * irqsave/restore are still used in alloc path so that it can be used | ||
167 | * from early init path - sched_init() specifically. | ||
168 | * | ||
169 | * Free path accesses and alters only the index data structures, so it | ||
170 | * can be safely called from atomic context. When memory needs to be | ||
171 | * returned to the system, free path schedules reclaim_work which | ||
172 | * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be | ||
173 | * reclaimed, release both locks and frees the chunks. Note that it's | ||
174 | * necessary to grab both locks to remove a chunk from circulation as | ||
175 | * allocation path might be referencing the chunk with only | ||
176 | * pcpu_alloc_mutex locked. | ||
177 | */ | 170 | */ |
178 | static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ | 171 | static int pcpu_nr_empty_pop_pages; |
179 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ | ||
180 | 172 | ||
181 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | 173 | /* |
174 | * Balance work is used to populate or destroy chunks asynchronously. We | ||
175 | * try to keep the number of populated free pages between | ||
176 | * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one | ||
177 | * empty chunk. | ||
178 | */ | ||
179 | static void pcpu_balance_workfn(struct work_struct *work); | ||
180 | static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); | ||
181 | static bool pcpu_async_enabled __read_mostly; | ||
182 | static bool pcpu_atomic_alloc_failed; | ||
182 | 183 | ||
183 | /* reclaim work to release fully free chunks, scheduled from free path */ | 184 | static void pcpu_schedule_balance_work(void) |
184 | static void pcpu_reclaim(struct work_struct *work); | 185 | { |
185 | static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); | 186 | if (pcpu_async_enabled) |
187 | schedule_work(&pcpu_balance_work); | ||
188 | } | ||
186 | 189 | ||
187 | static bool pcpu_addr_in_first_chunk(void *addr) | 190 | static bool pcpu_addr_in_first_chunk(void *addr) |
188 | { | 191 | { |
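The long synchronization comment is condensed, but the division of labour is unchanged: pcpu_lock (a spinlock) protects the index data, that is chunk slots, area maps and the populated bitmaps, while pcpu_alloc_mutex serializes chunk creation/destruction and page [de]population. The nesting used throughout the rest of this patch looks roughly like this (sketch):

    mutex_lock(&pcpu_alloc_mutex);          /* may sleep: create/populate  */

    /* ... allocate pages, map them, flush, etc. ... */

    spin_lock_irq(&pcpu_lock);              /* short, atomic index updates */
    pcpu_chunk_populated(chunk, rs, re);    /* e.g. bitmap + counters      */
    spin_unlock_irq(&pcpu_lock);

    mutex_unlock(&pcpu_alloc_mutex);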
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size) | |||
315 | } | 318 | } |
316 | 319 | ||
317 | /** | 320 | /** |
321 | * pcpu_count_occupied_pages - count the number of pages an area occupies | ||
322 | * @chunk: chunk of interest | ||
323 | * @i: index of the area in question | ||
324 | * | ||
325 | * Count the number of pages chunk's @i'th area occupies. When the area's | ||
326 | * start and/or end address isn't aligned to page boundary, the straddled | ||
327 | * page is included in the count iff the rest of the page is free. | ||
328 | */ | ||
329 | static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) | ||
330 | { | ||
331 | int off = chunk->map[i] & ~1; | ||
332 | int end = chunk->map[i + 1] & ~1; | ||
333 | |||
334 | if (!PAGE_ALIGNED(off) && i > 0) { | ||
335 | int prev = chunk->map[i - 1]; | ||
336 | |||
337 | if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) | ||
338 | off = round_down(off, PAGE_SIZE); | ||
339 | } | ||
340 | |||
341 | if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { | ||
342 | int next = chunk->map[i + 1]; | ||
343 | int nend = chunk->map[i + 2] & ~1; | ||
344 | |||
345 | if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) | ||
346 | end = round_up(end, PAGE_SIZE); | ||
347 | } | ||
348 | |||
349 | return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); | ||
350 | } | ||
351 | |||
352 | /** | ||
318 | * pcpu_chunk_relocate - put chunk in the appropriate chunk slot | 353 | * pcpu_chunk_relocate - put chunk in the appropriate chunk slot |
319 | * @chunk: chunk of interest | 354 | * @chunk: chunk of interest |
320 | * @oslot: the previous slot it was on | 355 | * @oslot: the previous slot it was on |
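A worked example of the straddled-page rule above, assuming 4 KiB pages: for an area covering bytes [1024, 9216), PFN_UP(1024) = 1 and PFN_DOWN(9216) = 2, so only page 1 is charged. A partially covered boundary page is added only when the neighbouring map entry is free and extends past that page boundary, in which case the start or end is rounded out first:

    /* illustrative only, PAGE_SIZE == 4096, area = [1024, 9216)            */
    /* neighbours busy:  PFN_DOWN(9216) - PFN_UP(1024) = 2 - 1 = 1 page     */
    /* previous entry free down to (at least) byte 0:                       */
    /*     off = round_down(1024, PAGE_SIZE) = 0                            */
    /*     PFN_DOWN(9216) - PFN_UP(0) = 2 - 0 = 2 pages                     */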
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
342 | /** | 377 | /** |
343 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended | 378 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended |
344 | * @chunk: chunk of interest | 379 | * @chunk: chunk of interest |
380 | * @is_atomic: the allocation context | ||
345 | * | 381 | * |
346 | * Determine whether area map of @chunk needs to be extended to | 382 | * Determine whether area map of @chunk needs to be extended. If |
347 | * accommodate a new allocation. | 383 | * @is_atomic, only the amount necessary for a new allocation is |
384 | * considered; however, async extension is scheduled if the left amount is | ||
385 | * low. If !@is_atomic, it aims for more empty space. Combined, this | ||
386 | * ensures that the map is likely to have enough available space to | ||
387 | * accommodate atomic allocations which can't extend maps directly. | ||
348 | * | 388 | * |
349 | * CONTEXT: | 389 | * CONTEXT: |
350 | * pcpu_lock. | 390 | * pcpu_lock. |
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
353 | * New target map allocation length if extension is necessary, 0 | 393 | * New target map allocation length if extension is necessary, 0 |
354 | * otherwise. | 394 | * otherwise. |
355 | */ | 395 | */ |
356 | static int pcpu_need_to_extend(struct pcpu_chunk *chunk) | 396 | static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) |
357 | { | 397 | { |
358 | int new_alloc; | 398 | int margin, new_alloc; |
399 | |||
400 | if (is_atomic) { | ||
401 | margin = 3; | ||
402 | |||
403 | if (chunk->map_alloc < | ||
404 | chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && | ||
405 | pcpu_async_enabled) | ||
406 | schedule_work(&chunk->map_extend_work); | ||
407 | } else { | ||
408 | margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; | ||
409 | } | ||
359 | 410 | ||
360 | if (chunk->map_alloc >= chunk->map_used + 3) | 411 | if (chunk->map_alloc >= chunk->map_used + margin) |
361 | return 0; | 412 | return 0; |
362 | 413 | ||
363 | new_alloc = PCPU_DFL_MAP_ALLOC; | 414 | new_alloc = PCPU_DFL_MAP_ALLOC; |
364 | while (new_alloc < chunk->map_used + 3) | 415 | while (new_alloc < chunk->map_used + margin) |
365 | new_alloc *= 2; | 416 | new_alloc *= 2; |
366 | 417 | ||
367 | return new_alloc; | 418 | return new_alloc; |
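A short worked example of the new margins (the numbers are invented): take map_used = 61 and map_alloc = 64. On the atomic path the margin is 3, so 64 >= 61 + 3 means no synchronous resize, but 64 < 61 + PCPU_ATOMIC_MAP_MARGIN_LOW still schedules chunk->map_extend_work to grow the map in the background. On the sleepable path the margin is PCPU_ATOMIC_MAP_MARGIN_HIGH and a new size is found by doubling:

    /* illustrative: map_used = 61, map_alloc = 64                            */
    /* atomic:    margin = 3;  64 >= 64 -> return 0 (async extend scheduled)  */
    /* sleepable: margin = 64; target doubles 16 -> 32 -> 64 -> 128,          */
    /*            128 >= 61 + 64, so pcpu_need_to_extend() returns 128        */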
@@ -418,11 +469,76 @@ out_unlock: | |||
418 | return 0; | 469 | return 0; |
419 | } | 470 | } |
420 | 471 | ||
472 | static void pcpu_map_extend_workfn(struct work_struct *work) | ||
473 | { | ||
474 | struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, | ||
475 | map_extend_work); | ||
476 | int new_alloc; | ||
477 | |||
478 | spin_lock_irq(&pcpu_lock); | ||
479 | new_alloc = pcpu_need_to_extend(chunk, false); | ||
480 | spin_unlock_irq(&pcpu_lock); | ||
481 | |||
482 | if (new_alloc) | ||
483 | pcpu_extend_area_map(chunk, new_alloc); | ||
484 | } | ||
485 | |||
486 | /** | ||
487 | * pcpu_fit_in_area - try to fit the requested allocation in a candidate area | ||
488 | * @chunk: chunk the candidate area belongs to | ||
489 | * @off: the offset to the start of the candidate area | ||
490 | * @this_size: the size of the candidate area | ||
491 | * @size: the size of the target allocation | ||
492 | * @align: the alignment of the target allocation | ||
493 | * @pop_only: only allocate from already populated region | ||
494 | * | ||
495 | * We're trying to allocate @size bytes aligned at @align. @chunk's area | ||
496 | * at @off sized @this_size is a candidate. This function determines | ||
497 | * whether the target allocation fits in the candidate area and returns the | ||
498 | * number of bytes to pad after @off. If the target area doesn't fit, -1 | ||
499 | * is returned. | ||
500 | * | ||
501 | * If @pop_only is %true, this function only considers the already | ||
502 | * populated part of the candidate area. | ||
503 | */ | ||
504 | static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, | ||
505 | int size, int align, bool pop_only) | ||
506 | { | ||
507 | int cand_off = off; | ||
508 | |||
509 | while (true) { | ||
510 | int head = ALIGN(cand_off, align) - off; | ||
511 | int page_start, page_end, rs, re; | ||
512 | |||
513 | if (this_size < head + size) | ||
514 | return -1; | ||
515 | |||
516 | if (!pop_only) | ||
517 | return head; | ||
518 | |||
519 | /* | ||
520 | * If the first unpopulated page is beyond the end of the | ||
521 | * allocation, the whole allocation is populated; | ||
522 | * otherwise, retry from the end of the unpopulated area. | ||
523 | */ | ||
524 | page_start = PFN_DOWN(head + off); | ||
525 | page_end = PFN_UP(head + off + size); | ||
526 | |||
527 | rs = page_start; | ||
528 | pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); | ||
529 | if (rs >= page_end) | ||
530 | return head; | ||
531 | cand_off = re * PAGE_SIZE; | ||
532 | } | ||
533 | } | ||
534 | |||
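A worked pass through pcpu_fit_in_area() with @pop_only set (all numbers invented, 4 KiB pages): the candidate area covers bytes [0, 16384), i.e. pages 0..3, page 1 is not populated, and the request is size = 8192 with align = 4096. The first iteration would need pages 0..1, hits the hole at page 1 and restarts from the end of the unpopulated run; the second iteration fits entirely in populated pages:

    /* pass 1: head = 0,    needs pages [0, 2) -> page 1 is unpopulated,      */
    /*         so cand_off = re * PAGE_SIZE = 8192                            */
    /* pass 2: head = 8192, needs pages [2, 4) -> both populated,             */
    /*         pcpu_fit_in_area() returns 8192                                */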
421 | /** | 535 | /** |
422 | * pcpu_alloc_area - allocate area from a pcpu_chunk | 536 | * pcpu_alloc_area - allocate area from a pcpu_chunk |
423 | * @chunk: chunk of interest | 537 | * @chunk: chunk of interest |
424 | * @size: wanted size in bytes | 538 | * @size: wanted size in bytes |
425 | * @align: wanted align | 539 | * @align: wanted align |
540 | * @pop_only: allocate only from the populated area | ||
541 | * @occ_pages_p: out param for the number of pages the area occupies | ||
426 | * | 542 | * |
427 | * Try to allocate @size bytes area aligned at @align from @chunk. | 543 | * Try to allocate @size bytes area aligned at @align from @chunk. |
428 | * Note that this function only allocates the offset. It doesn't | 544 | * Note that this function only allocates the offset. It doesn't |
@@ -437,7 +553,8 @@ out_unlock: | |||
437 | * Allocated offset in @chunk on success, -1 if no matching area is | 553 | * Allocated offset in @chunk on success, -1 if no matching area is |
438 | * found. | 554 | * found. |
439 | */ | 555 | */ |
440 | static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | 556 | static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, |
557 | bool pop_only, int *occ_pages_p) | ||
441 | { | 558 | { |
442 | int oslot = pcpu_chunk_slot(chunk); | 559 | int oslot = pcpu_chunk_slot(chunk); |
443 | int max_contig = 0; | 560 | int max_contig = 0; |
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
453 | if (off & 1) | 570 | if (off & 1) |
454 | continue; | 571 | continue; |
455 | 572 | ||
456 | /* extra for alignment requirement */ | ||
457 | head = ALIGN(off, align) - off; | ||
458 | |||
459 | this_size = (p[1] & ~1) - off; | 573 | this_size = (p[1] & ~1) - off; |
460 | if (this_size < head + size) { | 574 | |
575 | head = pcpu_fit_in_area(chunk, off, this_size, size, align, | ||
576 | pop_only); | ||
577 | if (head < 0) { | ||
461 | if (!seen_free) { | 578 | if (!seen_free) { |
462 | chunk->first_free = i; | 579 | chunk->first_free = i; |
463 | seen_free = true; | 580 | seen_free = true; |
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
526 | chunk->free_size -= size; | 643 | chunk->free_size -= size; |
527 | *p |= 1; | 644 | *p |= 1; |
528 | 645 | ||
646 | *occ_pages_p = pcpu_count_occupied_pages(chunk, i); | ||
529 | pcpu_chunk_relocate(chunk, oslot); | 647 | pcpu_chunk_relocate(chunk, oslot); |
530 | return off; | 648 | return off; |
531 | } | 649 | } |
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
541 | * pcpu_free_area - free area to a pcpu_chunk | 659 | * pcpu_free_area - free area to a pcpu_chunk |
542 | * @chunk: chunk of interest | 660 | * @chunk: chunk of interest |
543 | * @freeme: offset of area to free | 661 | * @freeme: offset of area to free |
662 | * @occ_pages_p: out param for the number of pages the area occupies | ||
544 | * | 663 | * |
545 | * Free area starting from @freeme to @chunk. Note that this function | 664 | * Free area starting from @freeme to @chunk. Note that this function |
546 | * only modifies the allocation map. It doesn't depopulate or unmap | 665 | * only modifies the allocation map. It doesn't depopulate or unmap |
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
549 | * CONTEXT: | 668 | * CONTEXT: |
550 | * pcpu_lock. | 669 | * pcpu_lock. |
551 | */ | 670 | */ |
552 | static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) | 671 | static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, |
672 | int *occ_pages_p) | ||
553 | { | 673 | { |
554 | int oslot = pcpu_chunk_slot(chunk); | 674 | int oslot = pcpu_chunk_slot(chunk); |
555 | int off = 0; | 675 | int off = 0; |
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) | |||
580 | *p = off &= ~1; | 700 | *p = off &= ~1; |
581 | chunk->free_size += (p[1] & ~1) - off; | 701 | chunk->free_size += (p[1] & ~1) - off; |
582 | 702 | ||
703 | *occ_pages_p = pcpu_count_occupied_pages(chunk, i); | ||
704 | |||
583 | /* merge with next? */ | 705 | /* merge with next? */ |
584 | if (!(p[1] & 1)) | 706 | if (!(p[1] & 1)) |
585 | to_free++; | 707 | to_free++; |
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) | |||
620 | chunk->map_used = 1; | 742 | chunk->map_used = 1; |
621 | 743 | ||
622 | INIT_LIST_HEAD(&chunk->list); | 744 | INIT_LIST_HEAD(&chunk->list); |
745 | INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); | ||
623 | chunk->free_size = pcpu_unit_size; | 746 | chunk->free_size = pcpu_unit_size; |
624 | chunk->contig_hint = pcpu_unit_size; | 747 | chunk->contig_hint = pcpu_unit_size; |
625 | 748 | ||
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) | |||
634 | pcpu_mem_free(chunk, pcpu_chunk_struct_size); | 757 | pcpu_mem_free(chunk, pcpu_chunk_struct_size); |
635 | } | 758 | } |
636 | 759 | ||
760 | /** | ||
761 | * pcpu_chunk_populated - post-population bookkeeping | ||
762 | * @chunk: pcpu_chunk which got populated | ||
763 | * @page_start: the start page | ||
764 | * @page_end: the end page | ||
765 | * | ||
766 | * Pages in [@page_start,@page_end) have been populated to @chunk. Update | ||
767 | * the bookkeeping information accordingly. Must be called after each | ||
768 | * successful population. | ||
769 | */ | ||
770 | static void pcpu_chunk_populated(struct pcpu_chunk *chunk, | ||
771 | int page_start, int page_end) | ||
772 | { | ||
773 | int nr = page_end - page_start; | ||
774 | |||
775 | lockdep_assert_held(&pcpu_lock); | ||
776 | |||
777 | bitmap_set(chunk->populated, page_start, nr); | ||
778 | chunk->nr_populated += nr; | ||
779 | pcpu_nr_empty_pop_pages += nr; | ||
780 | } | ||
781 | |||
782 | /** | ||
783 | * pcpu_chunk_depopulated - post-depopulation bookkeeping | ||
784 | * @chunk: pcpu_chunk which got depopulated | ||
785 | * @page_start: the start page | ||
786 | * @page_end: the end page | ||
787 | * | ||
788 | * Pages in [@page_start,@page_end) have been depopulated from @chunk. | ||
789 | * Update the bookkeeping information accordingly. Must be called after | ||
790 | * each successful depopulation. | ||
791 | */ | ||
792 | static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, | ||
793 | int page_start, int page_end) | ||
794 | { | ||
795 | int nr = page_end - page_start; | ||
796 | |||
797 | lockdep_assert_held(&pcpu_lock); | ||
798 | |||
799 | bitmap_clear(chunk->populated, page_start, nr); | ||
800 | chunk->nr_populated -= nr; | ||
801 | pcpu_nr_empty_pop_pages -= nr; | ||
802 | } | ||
803 | |||
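These two helpers keep chunk->populated, chunk->nr_populated and the global pcpu_nr_empty_pop_pages consistent; both must run under pcpu_lock while the page work itself happens outside it. The calling pattern used by the allocation and balance paths later in this patch is, in sketch form:

    /* page work may sleep, so it runs outside the spinlock */
    ret = pcpu_populate_chunk(chunk, rs, re);

    spin_lock_irq(&pcpu_lock);
    if (!ret)
            pcpu_chunk_populated(chunk, rs, re);    /* bitmap + counters */
    spin_unlock_irq(&pcpu_lock);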
637 | /* | 804 | /* |
638 | * Chunk management implementation. | 805 | * Chunk management implementation. |
639 | * | 806 | * |
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | |||
695 | * @size: size of area to allocate in bytes | 862 | * @size: size of area to allocate in bytes |
696 | * @align: alignment of area (max PAGE_SIZE) | 863 | * @align: alignment of area (max PAGE_SIZE) |
697 | * @reserved: allocate from the reserved chunk if available | 864 | * @reserved: allocate from the reserved chunk if available |
865 | * @gfp: allocation flags | ||
698 | * | 866 | * |
699 | * Allocate percpu area of @size bytes aligned at @align. | 867 | * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't |
700 | * | 868 | * contain %GFP_KERNEL, the allocation is atomic. |
701 | * CONTEXT: | ||
702 | * Does GFP_KERNEL allocation. | ||
703 | * | 869 | * |
704 | * RETURNS: | 870 | * RETURNS: |
705 | * Percpu pointer to the allocated area on success, NULL on failure. | 871 | * Percpu pointer to the allocated area on success, NULL on failure. |
706 | */ | 872 | */ |
707 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) | 873 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, |
874 | gfp_t gfp) | ||
708 | { | 875 | { |
709 | static int warn_limit = 10; | 876 | static int warn_limit = 10; |
710 | struct pcpu_chunk *chunk; | 877 | struct pcpu_chunk *chunk; |
711 | const char *err; | 878 | const char *err; |
712 | int slot, off, new_alloc; | 879 | bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; |
880 | int occ_pages = 0; | ||
881 | int slot, off, new_alloc, cpu, ret; | ||
713 | unsigned long flags; | 882 | unsigned long flags; |
714 | void __percpu *ptr; | 883 | void __percpu *ptr; |
715 | 884 | ||
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
728 | return NULL; | 897 | return NULL; |
729 | } | 898 | } |
730 | 899 | ||
731 | mutex_lock(&pcpu_alloc_mutex); | ||
732 | spin_lock_irqsave(&pcpu_lock, flags); | 900 | spin_lock_irqsave(&pcpu_lock, flags); |
733 | 901 | ||
734 | /* serve reserved allocations from the reserved chunk if available */ | 902 | /* serve reserved allocations from the reserved chunk if available */ |
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
740 | goto fail_unlock; | 908 | goto fail_unlock; |
741 | } | 909 | } |
742 | 910 | ||
743 | while ((new_alloc = pcpu_need_to_extend(chunk))) { | 911 | while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { |
744 | spin_unlock_irqrestore(&pcpu_lock, flags); | 912 | spin_unlock_irqrestore(&pcpu_lock, flags); |
745 | if (pcpu_extend_area_map(chunk, new_alloc) < 0) { | 913 | if (is_atomic || |
914 | pcpu_extend_area_map(chunk, new_alloc) < 0) { | ||
746 | err = "failed to extend area map of reserved chunk"; | 915 | err = "failed to extend area map of reserved chunk"; |
747 | goto fail_unlock_mutex; | 916 | goto fail; |
748 | } | 917 | } |
749 | spin_lock_irqsave(&pcpu_lock, flags); | 918 | spin_lock_irqsave(&pcpu_lock, flags); |
750 | } | 919 | } |
751 | 920 | ||
752 | off = pcpu_alloc_area(chunk, size, align); | 921 | off = pcpu_alloc_area(chunk, size, align, is_atomic, |
922 | &occ_pages); | ||
753 | if (off >= 0) | 923 | if (off >= 0) |
754 | goto area_found; | 924 | goto area_found; |
755 | 925 | ||
@@ -764,13 +934,15 @@ restart: | |||
764 | if (size > chunk->contig_hint) | 934 | if (size > chunk->contig_hint) |
765 | continue; | 935 | continue; |
766 | 936 | ||
767 | new_alloc = pcpu_need_to_extend(chunk); | 937 | new_alloc = pcpu_need_to_extend(chunk, is_atomic); |
768 | if (new_alloc) { | 938 | if (new_alloc) { |
939 | if (is_atomic) | ||
940 | continue; | ||
769 | spin_unlock_irqrestore(&pcpu_lock, flags); | 941 | spin_unlock_irqrestore(&pcpu_lock, flags); |
770 | if (pcpu_extend_area_map(chunk, | 942 | if (pcpu_extend_area_map(chunk, |
771 | new_alloc) < 0) { | 943 | new_alloc) < 0) { |
772 | err = "failed to extend area map"; | 944 | err = "failed to extend area map"; |
773 | goto fail_unlock_mutex; | 945 | goto fail; |
774 | } | 946 | } |
775 | spin_lock_irqsave(&pcpu_lock, flags); | 947 | spin_lock_irqsave(&pcpu_lock, flags); |
776 | /* | 948 | /* |
@@ -780,74 +952,134 @@ restart: | |||
780 | goto restart; | 952 | goto restart; |
781 | } | 953 | } |
782 | 954 | ||
783 | off = pcpu_alloc_area(chunk, size, align); | 955 | off = pcpu_alloc_area(chunk, size, align, is_atomic, |
956 | &occ_pages); | ||
784 | if (off >= 0) | 957 | if (off >= 0) |
785 | goto area_found; | 958 | goto area_found; |
786 | } | 959 | } |
787 | } | 960 | } |
788 | 961 | ||
789 | /* hmmm... no space left, create a new chunk */ | ||
790 | spin_unlock_irqrestore(&pcpu_lock, flags); | 962 | spin_unlock_irqrestore(&pcpu_lock, flags); |
791 | 963 | ||
792 | chunk = pcpu_create_chunk(); | 964 | /* |
793 | if (!chunk) { | 965 | * No space left. Create a new chunk. We don't want multiple |
794 | err = "failed to allocate new chunk"; | 966 | * tasks to create chunks simultaneously. Serialize and create iff |
795 | goto fail_unlock_mutex; | 967 | * there's still no empty chunk after grabbing the mutex. |
968 | */ | ||
969 | if (is_atomic) | ||
970 | goto fail; | ||
971 | |||
972 | mutex_lock(&pcpu_alloc_mutex); | ||
973 | |||
974 | if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { | ||
975 | chunk = pcpu_create_chunk(); | ||
976 | if (!chunk) { | ||
977 | mutex_unlock(&pcpu_alloc_mutex); | ||
978 | err = "failed to allocate new chunk"; | ||
979 | goto fail; | ||
980 | } | ||
981 | |||
982 | spin_lock_irqsave(&pcpu_lock, flags); | ||
983 | pcpu_chunk_relocate(chunk, -1); | ||
984 | } else { | ||
985 | spin_lock_irqsave(&pcpu_lock, flags); | ||
796 | } | 986 | } |
797 | 987 | ||
798 | spin_lock_irqsave(&pcpu_lock, flags); | 988 | mutex_unlock(&pcpu_alloc_mutex); |
799 | pcpu_chunk_relocate(chunk, -1); | ||
800 | goto restart; | 989 | goto restart; |
801 | 990 | ||
802 | area_found: | 991 | area_found: |
803 | spin_unlock_irqrestore(&pcpu_lock, flags); | 992 | spin_unlock_irqrestore(&pcpu_lock, flags); |
804 | 993 | ||
805 | /* populate, map and clear the area */ | 994 | /* populate if not all pages are already there */ |
806 | if (pcpu_populate_chunk(chunk, off, size)) { | 995 | if (!is_atomic) { |
807 | spin_lock_irqsave(&pcpu_lock, flags); | 996 | int page_start, page_end, rs, re; |
808 | pcpu_free_area(chunk, off); | 997 | |
809 | err = "failed to populate"; | 998 | mutex_lock(&pcpu_alloc_mutex); |
810 | goto fail_unlock; | 999 | |
1000 | page_start = PFN_DOWN(off); | ||
1001 | page_end = PFN_UP(off + size); | ||
1002 | |||
1003 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | ||
1004 | WARN_ON(chunk->immutable); | ||
1005 | |||
1006 | ret = pcpu_populate_chunk(chunk, rs, re); | ||
1007 | |||
1008 | spin_lock_irqsave(&pcpu_lock, flags); | ||
1009 | if (ret) { | ||
1010 | mutex_unlock(&pcpu_alloc_mutex); | ||
1011 | pcpu_free_area(chunk, off, &occ_pages); | ||
1012 | err = "failed to populate"; | ||
1013 | goto fail_unlock; | ||
1014 | } | ||
1015 | pcpu_chunk_populated(chunk, rs, re); | ||
1016 | spin_unlock_irqrestore(&pcpu_lock, flags); | ||
1017 | } | ||
1018 | |||
1019 | mutex_unlock(&pcpu_alloc_mutex); | ||
811 | } | 1020 | } |
812 | 1021 | ||
813 | mutex_unlock(&pcpu_alloc_mutex); | 1022 | if (chunk != pcpu_reserved_chunk) |
1023 | pcpu_nr_empty_pop_pages -= occ_pages; | ||
1024 | |||
1025 | if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) | ||
1026 | pcpu_schedule_balance_work(); | ||
1027 | |||
1028 | /* clear the areas and return address relative to base address */ | ||
1029 | for_each_possible_cpu(cpu) | ||
1030 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
814 | 1031 | ||
815 | /* return address relative to base address */ | ||
816 | ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); | 1032 | ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); |
817 | kmemleak_alloc_percpu(ptr, size); | 1033 | kmemleak_alloc_percpu(ptr, size); |
818 | return ptr; | 1034 | return ptr; |
819 | 1035 | ||
820 | fail_unlock: | 1036 | fail_unlock: |
821 | spin_unlock_irqrestore(&pcpu_lock, flags); | 1037 | spin_unlock_irqrestore(&pcpu_lock, flags); |
822 | fail_unlock_mutex: | 1038 | fail: |
823 | mutex_unlock(&pcpu_alloc_mutex); | 1039 | if (!is_atomic && warn_limit) { |
824 | if (warn_limit) { | 1040 | pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", |
825 | pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " | 1041 | size, align, is_atomic, err); |
826 | "%s\n", size, align, err); | ||
827 | dump_stack(); | 1042 | dump_stack(); |
828 | if (!--warn_limit) | 1043 | if (!--warn_limit) |
829 | pr_info("PERCPU: limit reached, disable warning\n"); | 1044 | pr_info("PERCPU: limit reached, disable warning\n"); |
830 | } | 1045 | } |
1046 | if (is_atomic) { | ||
1047 | /* see the flag handling in pcpu_balance_workfn() */ | ||
1048 | pcpu_atomic_alloc_failed = true; | ||
1049 | pcpu_schedule_balance_work(); | ||
1050 | } | ||
831 | return NULL; | 1051 | return NULL; |
832 | } | 1052 | } |
833 | 1053 | ||
834 | /** | 1054 | /** |
835 | * __alloc_percpu - allocate dynamic percpu area | 1055 | * __alloc_percpu_gfp - allocate dynamic percpu area |
836 | * @size: size of area to allocate in bytes | 1056 | * @size: size of area to allocate in bytes |
837 | * @align: alignment of area (max PAGE_SIZE) | 1057 | * @align: alignment of area (max PAGE_SIZE) |
1058 | * @gfp: allocation flags | ||
838 | * | 1059 | * |
839 | * Allocate zero-filled percpu area of @size bytes aligned at @align. | 1060 | * Allocate zero-filled percpu area of @size bytes aligned at @align. If |
840 | * Might sleep. Might trigger writeouts. | 1061 | * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can |
841 | * | 1062 | * be called from any context but is a lot more likely to fail. |
842 | * CONTEXT: | ||
843 | * Does GFP_KERNEL allocation. | ||
844 | * | 1063 | * |
845 | * RETURNS: | 1064 | * RETURNS: |
846 | * Percpu pointer to the allocated area on success, NULL on failure. | 1065 | * Percpu pointer to the allocated area on success, NULL on failure. |
847 | */ | 1066 | */ |
1067 | void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) | ||
1068 | { | ||
1069 | return pcpu_alloc(size, align, false, gfp); | ||
1070 | } | ||
1071 | EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); | ||
1072 | |||
1073 | /** | ||
1074 | * __alloc_percpu - allocate dynamic percpu area | ||
1075 | * @size: size of area to allocate in bytes | ||
1076 | * @align: alignment of area (max PAGE_SIZE) | ||
1077 | * | ||
1078 | * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). | ||
1079 | */ | ||
848 | void __percpu *__alloc_percpu(size_t size, size_t align) | 1080 | void __percpu *__alloc_percpu(size_t size, size_t align) |
849 | { | 1081 | { |
850 | return pcpu_alloc(size, align, false); | 1082 | return pcpu_alloc(size, align, false, GFP_KERNEL); |
851 | } | 1083 | } |
852 | EXPORT_SYMBOL_GPL(__alloc_percpu); | 1084 | EXPORT_SYMBOL_GPL(__alloc_percpu); |
853 | 1085 | ||
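A minimal usage sketch of the new entry point (the names below are invented for the example): any gfp mask that does not include GFP_KERNEL, e.g. GFP_NOWAIT or GFP_ATOMIC, selects the atomic path, which only carves out of already populated pages and never extends area maps or creates chunks synchronously, so it may be called from contexts that cannot sleep, at the cost of a higher failure rate.

    #include <linux/percpu.h>
    #include <linux/gfp.h>

    static u64 __percpu *stats;

    /* e.g. called with a spinlock held or from softirq context */
    static int stats_create_atomic(void)
    {
            stats = __alloc_percpu_gfp(sizeof(u64), __alignof__(u64),
                                       GFP_NOWAIT);
            if (!stats)
                    return -ENOMEM;     /* more likely than with GFP_KERNEL */
            return 0;
    }

    static void stats_destroy(void)
    {
            free_percpu(stats);
    }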
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); | |||
869 | */ | 1101 | */ |
870 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) | 1102 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) |
871 | { | 1103 | { |
872 | return pcpu_alloc(size, align, true); | 1104 | return pcpu_alloc(size, align, true, GFP_KERNEL); |
873 | } | 1105 | } |
874 | 1106 | ||
875 | /** | 1107 | /** |
876 | * pcpu_reclaim - reclaim fully free chunks, workqueue function | 1108 | * pcpu_balance_workfn - manage the amount of free chunks and populated pages |
877 | * @work: unused | 1109 | * @work: unused |
878 | * | 1110 | * |
879 | * Reclaim all fully free chunks except for the first one. | 1111 | * Reclaim all fully free chunks except for the first one. |
880 | * | ||
881 | * CONTEXT: | ||
882 | * workqueue context. | ||
883 | */ | 1112 | */ |
884 | static void pcpu_reclaim(struct work_struct *work) | 1113 | static void pcpu_balance_workfn(struct work_struct *work) |
885 | { | 1114 | { |
886 | LIST_HEAD(todo); | 1115 | LIST_HEAD(to_free); |
887 | struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; | 1116 | struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; |
888 | struct pcpu_chunk *chunk, *next; | 1117 | struct pcpu_chunk *chunk, *next; |
1118 | int slot, nr_to_pop, ret; | ||
889 | 1119 | ||
1120 | /* | ||
1121 | * There's no reason to keep around multiple unused chunks and VM | ||
1122 | * areas can be scarce. Destroy all free chunks except for one. | ||
1123 | */ | ||
890 | mutex_lock(&pcpu_alloc_mutex); | 1124 | mutex_lock(&pcpu_alloc_mutex); |
891 | spin_lock_irq(&pcpu_lock); | 1125 | spin_lock_irq(&pcpu_lock); |
892 | 1126 | ||
893 | list_for_each_entry_safe(chunk, next, head, list) { | 1127 | list_for_each_entry_safe(chunk, next, free_head, list) { |
894 | WARN_ON(chunk->immutable); | 1128 | WARN_ON(chunk->immutable); |
895 | 1129 | ||
896 | /* spare the first one */ | 1130 | /* spare the first one */ |
897 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) | 1131 | if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) |
898 | continue; | 1132 | continue; |
899 | 1133 | ||
900 | list_move(&chunk->list, &todo); | 1134 | list_move(&chunk->list, &to_free); |
901 | } | 1135 | } |
902 | 1136 | ||
903 | spin_unlock_irq(&pcpu_lock); | 1137 | spin_unlock_irq(&pcpu_lock); |
904 | 1138 | ||
905 | list_for_each_entry_safe(chunk, next, &todo, list) { | 1139 | list_for_each_entry_safe(chunk, next, &to_free, list) { |
906 | pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); | 1140 | int rs, re; |
1141 | |||
1142 | pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { | ||
1143 | pcpu_depopulate_chunk(chunk, rs, re); | ||
1144 | spin_lock_irq(&pcpu_lock); | ||
1145 | pcpu_chunk_depopulated(chunk, rs, re); | ||
1146 | spin_unlock_irq(&pcpu_lock); | ||
1147 | } | ||
907 | pcpu_destroy_chunk(chunk); | 1148 | pcpu_destroy_chunk(chunk); |
908 | } | 1149 | } |
909 | 1150 | ||
1151 | /* | ||
1152 | * Ensure there are certain number of free populated pages for | ||
1153 | * atomic allocs. Fill up from the most packed so that atomic | ||
1154 | * allocs don't increase fragmentation. If atomic allocation | ||
1155 | * failed previously, always populate the maximum amount. This | ||
1156 | * should prevent atomic allocs larger than PAGE_SIZE from keeping | ||
1157 | * failing indefinitely; however, large atomic allocs are not | ||
1158 | * something we support properly and can be highly unreliable and | ||
1159 | * inefficient. | ||
1160 | */ | ||
1161 | retry_pop: | ||
1162 | if (pcpu_atomic_alloc_failed) { | ||
1163 | nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; | ||
1164 | /* best effort anyway, don't worry about synchronization */ | ||
1165 | pcpu_atomic_alloc_failed = false; | ||
1166 | } else { | ||
1167 | nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - | ||
1168 | pcpu_nr_empty_pop_pages, | ||
1169 | 0, PCPU_EMPTY_POP_PAGES_HIGH); | ||
1170 | } | ||
1171 | |||
1172 | for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { | ||
1173 | int nr_unpop = 0, rs, re; | ||
1174 | |||
1175 | if (!nr_to_pop) | ||
1176 | break; | ||
1177 | |||
1178 | spin_lock_irq(&pcpu_lock); | ||
1179 | list_for_each_entry(chunk, &pcpu_slot[slot], list) { | ||
1180 | nr_unpop = pcpu_unit_pages - chunk->nr_populated; | ||
1181 | if (nr_unpop) | ||
1182 | break; | ||
1183 | } | ||
1184 | spin_unlock_irq(&pcpu_lock); | ||
1185 | |||
1186 | if (!nr_unpop) | ||
1187 | continue; | ||
1188 | |||
1189 | /* @chunk can't go away while pcpu_alloc_mutex is held */ | ||
1190 | pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { | ||
1191 | int nr = min(re - rs, nr_to_pop); | ||
1192 | |||
1193 | ret = pcpu_populate_chunk(chunk, rs, rs + nr); | ||
1194 | if (!ret) { | ||
1195 | nr_to_pop -= nr; | ||
1196 | spin_lock_irq(&pcpu_lock); | ||
1197 | pcpu_chunk_populated(chunk, rs, rs + nr); | ||
1198 | spin_unlock_irq(&pcpu_lock); | ||
1199 | } else { | ||
1200 | nr_to_pop = 0; | ||
1201 | } | ||
1202 | |||
1203 | if (!nr_to_pop) | ||
1204 | break; | ||
1205 | } | ||
1206 | } | ||
1207 | |||
1208 | if (nr_to_pop) { | ||
1209 | /* ran out of chunks to populate, create a new one and retry */ | ||
1210 | chunk = pcpu_create_chunk(); | ||
1211 | if (chunk) { | ||
1212 | spin_lock_irq(&pcpu_lock); | ||
1213 | pcpu_chunk_relocate(chunk, -1); | ||
1214 | spin_unlock_irq(&pcpu_lock); | ||
1215 | goto retry_pop; | ||
1216 | } | ||
1217 | } | ||
1218 | |||
910 | mutex_unlock(&pcpu_alloc_mutex); | 1219 | mutex_unlock(&pcpu_alloc_mutex); |
911 | } | 1220 | } |
912 | 1221 | ||
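A worked example of the replenish target computed above (numbers invented): with PCPU_EMPTY_POP_PAGES_HIGH = 4, an empty-page count of 1 yields nr_to_pop = clamp(4 - 1, 0, 4) = 3; a count already at or above the watermark yields 0; and after a failed atomic allocation the worker ignores the current count and aims for the full watermark:

    /* pcpu_nr_empty_pop_pages == 1: nr_to_pop = clamp(4 - 1, 0, 4) = 3       */
    /* pcpu_nr_empty_pop_pages == 6: nr_to_pop = clamp(4 - 6, 0, 4) = 0       */
    /* pcpu_atomic_alloc_failed set: nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH = 4 */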
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr) | |||
924 | void *addr; | 1233 | void *addr; |
925 | struct pcpu_chunk *chunk; | 1234 | struct pcpu_chunk *chunk; |
926 | unsigned long flags; | 1235 | unsigned long flags; |
927 | int off; | 1236 | int off, occ_pages; |
928 | 1237 | ||
929 | if (!ptr) | 1238 | if (!ptr) |
930 | return; | 1239 | return; |
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr) | |||
938 | chunk = pcpu_chunk_addr_search(addr); | 1247 | chunk = pcpu_chunk_addr_search(addr); |
939 | off = addr - chunk->base_addr; | 1248 | off = addr - chunk->base_addr; |
940 | 1249 | ||
941 | pcpu_free_area(chunk, off); | 1250 | pcpu_free_area(chunk, off, &occ_pages); |
1251 | |||
1252 | if (chunk != pcpu_reserved_chunk) | ||
1253 | pcpu_nr_empty_pop_pages += occ_pages; | ||
942 | 1254 | ||
943 | /* if there are more than one fully free chunks, wake up grim reaper */ | 1255 | /* if there are more than one fully free chunks, wake up grim reaper */ |
944 | if (chunk->free_size == pcpu_unit_size) { | 1256 | if (chunk->free_size == pcpu_unit_size) { |
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr) | |||
946 | 1258 | ||
947 | list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) | 1259 | list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) |
948 | if (pos != chunk) { | 1260 | if (pos != chunk) { |
949 | schedule_work(&pcpu_reclaim_work); | 1261 | pcpu_schedule_balance_work(); |
950 | break; | 1262 | break; |
951 | } | 1263 | } |
952 | } | 1264 | } |
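The renamed pcpu_schedule_balance_work() call above keeps the old policy: reclaim is scheduled only when some other fully free chunk would remain, so one empty chunk stays cached for future allocations. A minimal userspace model of that check, with an invented chunk type:

#include <stdbool.h>
#include <stdio.h>

struct chunk {
        struct chunk *next;             /* toy stand-in for the slot list */
};

static bool should_schedule_balance(struct chunk *fully_free_list,
                                    struct chunk *just_freed)
{
        for (struct chunk *pos = fully_free_list; pos; pos = pos->next)
                if (pos != just_freed)
                        return true;    /* another fully free chunk exists */
        return false;                   /* keep the single free chunk cached */
}

int main(void)
{
        struct chunk a = { 0 }, b = { 0 };

        a.next = &b;
        printf("%d\n", should_schedule_balance(&a, &a));  /* 1: b is also free */
        printf("%d\n", should_schedule_balance(&b, &b));  /* 0: b is the only one */
        return 0;
}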
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1336 | */ | 1648 | */ |
1337 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); | 1649 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1338 | INIT_LIST_HEAD(&schunk->list); | 1650 | INIT_LIST_HEAD(&schunk->list); |
1651 | INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); | ||
1339 | schunk->base_addr = base_addr; | 1652 | schunk->base_addr = base_addr; |
1340 | schunk->map = smap; | 1653 | schunk->map = smap; |
1341 | schunk->map_alloc = ARRAY_SIZE(smap); | 1654 | schunk->map_alloc = ARRAY_SIZE(smap); |
1342 | schunk->immutable = true; | 1655 | schunk->immutable = true; |
1343 | bitmap_fill(schunk->populated, pcpu_unit_pages); | 1656 | bitmap_fill(schunk->populated, pcpu_unit_pages); |
1657 | schunk->nr_populated = pcpu_unit_pages; | ||
1344 | 1658 | ||
1345 | if (ai->reserved_size) { | 1659 | if (ai->reserved_size) { |
1346 | schunk->free_size = ai->reserved_size; | 1660 | schunk->free_size = ai->reserved_size; |
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1364 | if (dyn_size) { | 1678 | if (dyn_size) { |
1365 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); | 1679 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1366 | INIT_LIST_HEAD(&dchunk->list); | 1680 | INIT_LIST_HEAD(&dchunk->list); |
1681 | INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); | ||
1367 | dchunk->base_addr = base_addr; | 1682 | dchunk->base_addr = base_addr; |
1368 | dchunk->map = dmap; | 1683 | dchunk->map = dmap; |
1369 | dchunk->map_alloc = ARRAY_SIZE(dmap); | 1684 | dchunk->map_alloc = ARRAY_SIZE(dmap); |
1370 | dchunk->immutable = true; | 1685 | dchunk->immutable = true; |
1371 | bitmap_fill(dchunk->populated, pcpu_unit_pages); | 1686 | bitmap_fill(dchunk->populated, pcpu_unit_pages); |
1687 | dchunk->nr_populated = pcpu_unit_pages; | ||
1372 | 1688 | ||
1373 | dchunk->contig_hint = dchunk->free_size = dyn_size; | 1689 | dchunk->contig_hint = dchunk->free_size = dyn_size; |
1374 | dchunk->map[0] = 1; | 1690 | dchunk->map[0] = 1; |
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1379 | 1695 | ||
1380 | /* link the first chunk in */ | 1696 | /* link the first chunk in */ |
1381 | pcpu_first_chunk = dchunk ?: schunk; | 1697 | pcpu_first_chunk = dchunk ?: schunk; |
1698 | pcpu_nr_empty_pop_pages += | ||
1699 | pcpu_count_occupied_pages(pcpu_first_chunk, 1); | ||
1382 | pcpu_chunk_relocate(pcpu_first_chunk, -1); | 1700 | pcpu_chunk_relocate(pcpu_first_chunk, -1); |
1383 | 1701 | ||
1384 | /* we're done */ | 1702 | /* we're done */ |
@@ -1965,3 +2283,15 @@ void __init percpu_init_late(void) | |||
1965 | spin_unlock_irqrestore(&pcpu_lock, flags); | 2283 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1966 | } | 2284 | } |
1967 | } | 2285 | } |
2286 | |||
2287 | /* | ||
2288 | * Percpu allocator is initialized early during boot when neither slab or | ||
2289 | * workqueue is available. Plug async management until everything is up | ||
2290 | * and running. | ||
2291 | */ | ||
2292 | static int __init percpu_enable_async(void) | ||
2293 | { | ||
2294 | pcpu_async_enabled = true; | ||
2295 | return 0; | ||
2296 | } | ||
2297 | subsys_initcall(percpu_enable_async); | ||
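percpu_enable_async() only flips a flag; until the initcall runs, the allocator simply skips queueing async work. A userspace model of that gating (the function names and the pending flag here are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

static bool async_enabled;              /* pcpu_async_enabled in the kernel */
static bool balance_work_pending;       /* stand-in for the workqueue item */

static void schedule_balance_work(void)
{
        if (async_enabled)              /* no-op while the boot-time plug is in */
                balance_work_pending = true;
}

static int enable_async(void)           /* the subsys_initcall above */
{
        async_enabled = true;
        return 0;
}

int main(void)
{
        schedule_balance_work();                        /* too early: ignored */
        printf("pending=%d\n", balance_work_pending);   /* 0 */
        enable_async();
        schedule_balance_work();
        printf("pending=%d\n", balance_work_pending);   /* 1 */
        return 0;
}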
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index a8b919925934..dfb79e028ecb 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | |||
195 | pmd_t entry = *pmdp; | 195 | pmd_t entry = *pmdp; |
196 | if (pmd_numa(entry)) | 196 | if (pmd_numa(entry)) |
197 | entry = pmd_mknonnuma(entry); | 197 | entry = pmd_mknonnuma(entry); |
198 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); | 198 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); |
199 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 199 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
200 | } | 200 | } |
201 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 201 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
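The pmdp_invalidate() change above writes the locally adjusted entry instead of re-reading *pmdp, so the pmd_mknonnuma() fixup is not lost. A toy model of the difference, using invented flag bits rather than real pmd encodings:

#include <assert.h>

#define F_PRESENT 0x1                   /* invented bits, not real pmd flags */
#define F_NUMA    0x2

static int mknonnuma(int e)    { return e & ~F_NUMA; }
static int mknotpresent(int e) { return e & ~F_PRESENT; }

int main(void)
{
        int slot = F_PRESENT | F_NUMA;  /* value sitting in the page table */
        int entry = slot;

        if (entry & F_NUMA)
                entry = mknonnuma(entry);

        /* old code: mknotpresent(slot) would have kept F_NUMA set */
        int written = mknotpresent(entry);

        assert(!(written & F_NUMA) && !(written & F_PRESENT));
        return 0;
}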
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
527 | unsigned long address = __vma_address(page, vma); | 527 | unsigned long address = __vma_address(page, vma); |
528 | 528 | ||
529 | /* page should be within @vma mapping range */ | 529 | /* page should be within @vma mapping range */ |
530 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 530 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); |
531 | 531 | ||
532 | return address; | 532 | return address; |
533 | } | 533 | } |
@@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page, | |||
897 | struct anon_vma *anon_vma = vma->anon_vma; | 897 | struct anon_vma *anon_vma = vma->anon_vma; |
898 | 898 | ||
899 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 899 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
900 | VM_BUG_ON(!anon_vma); | 900 | VM_BUG_ON_VMA(!anon_vma, vma); |
901 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); | 901 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); |
902 | 902 | ||
903 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 903 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
@@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page, | |||
1024 | void page_add_new_anon_rmap(struct page *page, | 1024 | void page_add_new_anon_rmap(struct page *page, |
1025 | struct vm_area_struct *vma, unsigned long address) | 1025 | struct vm_area_struct *vma, unsigned long address) |
1026 | { | 1026 | { |
1027 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1027 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); |
1028 | SetPageSwapBacked(page); | 1028 | SetPageSwapBacked(page); |
1029 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1029 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
1030 | if (PageTransHuge(page)) | 1030 | if (PageTransHuge(page)) |
@@ -1355,7 +1355,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1355 | continue; /* don't unmap */ | 1355 | continue; /* don't unmap */ |
1356 | } | 1356 | } |
1357 | 1357 | ||
1358 | if (ptep_clear_flush_young_notify(vma, address, pte)) | 1358 | /* |
1359 | * No need for _notify because we're within an | ||
1360 | * mmu_notifier_invalidate_range_ {start|end} scope. | ||
1361 | */ | ||
1362 | if (ptep_clear_flush_young(vma, address, pte)) | ||
1359 | continue; | 1363 | continue; |
1360 | 1364 | ||
1361 | /* Nuke the page table entry. */ | 1365 | /* Nuke the page table entry. */ |
@@ -1666,7 +1670,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1666 | * structure at mapping cannot be freed and reused yet, | 1670 | * structure at mapping cannot be freed and reused yet, |
1667 | * so we can safely take mapping->i_mmap_mutex. | 1671 | * so we can safely take mapping->i_mmap_mutex. |
1668 | */ | 1672 | */ |
1669 | VM_BUG_ON(!PageLocked(page)); | 1673 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1670 | 1674 | ||
1671 | if (!mapping) | 1675 | if (!mapping) |
1672 | return ret; | 1676 | return ret; |
diff --git a/mm/shmem.c b/mm/shmem.c index 0e5fb225007c..cd6fc7590e54 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -2367,8 +2367,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc | |||
2367 | 2367 | ||
2368 | if (new_dentry->d_inode) { | 2368 | if (new_dentry->d_inode) { |
2369 | (void) shmem_unlink(new_dir, new_dentry); | 2369 | (void) shmem_unlink(new_dir, new_dentry); |
2370 | if (they_are_dirs) | 2370 | if (they_are_dirs) { |
2371 | drop_nlink(new_dentry->d_inode); | ||
2371 | drop_nlink(old_dir); | 2372 | drop_nlink(old_dir); |
2373 | } | ||
2372 | } else if (they_are_dirs) { | 2374 | } else if (they_are_dirs) { |
2373 | drop_nlink(old_dir); | 2375 | drop_nlink(old_dir); |
2374 | inc_nlink(new_dir); | 2376 | inc_nlink(new_dir); |
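The added drop_nlink() above accounts for the victim directory itself when a rename replaces it. A toy arithmetic check (not VFS code; it only tracks the count this hunk is concerned with):

#include <assert.h>

int main(void)
{
        int victim_nlink = 2;   /* empty dir: "." plus the entry in new_dir */

        victim_nlink--;         /* shmem_unlink(new_dir, new_dentry) */
        victim_nlink--;         /* the drop_nlink() this hunk adds */
        assert(victim_nlink == 0);
        return 0;
}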
@@ -2993,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2993 | #endif | 2995 | #endif |
2994 | 2996 | ||
2995 | spin_lock_init(&sbinfo->stat_lock); | 2997 | spin_lock_init(&sbinfo->stat_lock); |
2996 | if (percpu_counter_init(&sbinfo->used_blocks, 0)) | 2998 | if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) |
2997 | goto failed; | 2999 | goto failed; |
2998 | sbinfo->free_inodes = sbinfo->max_inodes; | 3000 | sbinfo->free_inodes = sbinfo->max_inodes; |
2999 | 3001 | ||
@@ -3075,7 +3077,9 @@ static const struct address_space_operations shmem_aops = { | |||
3075 | .write_begin = shmem_write_begin, | 3077 | .write_begin = shmem_write_begin, |
3076 | .write_end = shmem_write_end, | 3078 | .write_end = shmem_write_end, |
3077 | #endif | 3079 | #endif |
3080 | #ifdef CONFIG_MIGRATION | ||
3078 | .migratepage = migrate_page, | 3081 | .migratepage = migrate_page, |
3082 | #endif | ||
3079 | .error_remove_page = generic_error_remove_page, | 3083 | .error_remove_page = generic_error_remove_page, |
3080 | }; | 3084 | }; |
3081 | 3085 | ||
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -237,11 +237,10 @@ struct arraycache_init { | |||
237 | /* | 237 | /* |
238 | * Need this for bootstrapping a per node allocator. | 238 | * Need this for bootstrapping a per node allocator. |
239 | */ | 239 | */ |
240 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) | 240 | #define NUM_INIT_LISTS (2 * MAX_NUMNODES) |
241 | static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; | 241 | static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; |
242 | #define CACHE_CACHE 0 | 242 | #define CACHE_CACHE 0 |
243 | #define SIZE_AC MAX_NUMNODES | 243 | #define SIZE_NODE (MAX_NUMNODES) |
244 | #define SIZE_NODE (2 * MAX_NUMNODES) | ||
245 | 244 | ||
246 | static int drain_freelist(struct kmem_cache *cache, | 245 | static int drain_freelist(struct kmem_cache *cache, |
247 | struct kmem_cache_node *n, int tofree); | 246 | struct kmem_cache_node *n, int tofree); |
@@ -253,7 +252,6 @@ static void cache_reap(struct work_struct *unused); | |||
253 | 252 | ||
254 | static int slab_early_init = 1; | 253 | static int slab_early_init = 1; |
255 | 254 | ||
256 | #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) | ||
257 | #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) | 255 | #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) |
258 | 256 | ||
259 | static void kmem_cache_node_init(struct kmem_cache_node *parent) | 257 | static void kmem_cache_node_init(struct kmem_cache_node *parent) |
@@ -458,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, | |||
458 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); | 456 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); |
459 | } | 457 | } |
460 | 458 | ||
461 | static struct arraycache_init initarray_generic = | ||
462 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | ||
463 | |||
464 | /* internal cache of cache description objs */ | 459 | /* internal cache of cache description objs */ |
465 | static struct kmem_cache kmem_cache_boot = { | 460 | static struct kmem_cache kmem_cache_boot = { |
466 | .batchcount = 1, | 461 | .batchcount = 1, |
@@ -476,7 +471,7 @@ static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | |||
476 | 471 | ||
477 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 472 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
478 | { | 473 | { |
479 | return cachep->array[smp_processor_id()]; | 474 | return this_cpu_ptr(cachep->cpu_cache); |
480 | } | 475 | } |
481 | 476 | ||
482 | static size_t calculate_freelist_size(int nr_objs, size_t align) | 477 | static size_t calculate_freelist_size(int nr_objs, size_t align) |
@@ -785,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep, | |||
785 | return objp; | 780 | return objp; |
786 | } | 781 | } |
787 | 782 | ||
788 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | 783 | static noinline void *__ac_put_obj(struct kmem_cache *cachep, |
789 | void *objp) | 784 | struct array_cache *ac, void *objp) |
790 | { | 785 | { |
791 | if (unlikely(pfmemalloc_active)) { | 786 | if (unlikely(pfmemalloc_active)) { |
792 | /* Some pfmemalloc slabs exist, check if this is one */ | 787 | /* Some pfmemalloc slabs exist, check if this is one */ |
@@ -984,46 +979,50 @@ static void drain_alien_cache(struct kmem_cache *cachep, | |||
984 | } | 979 | } |
985 | } | 980 | } |
986 | 981 | ||
987 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | 982 | static int __cache_free_alien(struct kmem_cache *cachep, void *objp, |
983 | int node, int page_node) | ||
988 | { | 984 | { |
989 | int nodeid = page_to_nid(virt_to_page(objp)); | ||
990 | struct kmem_cache_node *n; | 985 | struct kmem_cache_node *n; |
991 | struct alien_cache *alien = NULL; | 986 | struct alien_cache *alien = NULL; |
992 | struct array_cache *ac; | 987 | struct array_cache *ac; |
993 | int node; | ||
994 | LIST_HEAD(list); | 988 | LIST_HEAD(list); |
995 | 989 | ||
996 | node = numa_mem_id(); | ||
997 | |||
998 | /* | ||
999 | * Make sure we are not freeing a object from another node to the array | ||
1000 | * cache on this cpu. | ||
1001 | */ | ||
1002 | if (likely(nodeid == node)) | ||
1003 | return 0; | ||
1004 | |||
1005 | n = get_node(cachep, node); | 990 | n = get_node(cachep, node); |
1006 | STATS_INC_NODEFREES(cachep); | 991 | STATS_INC_NODEFREES(cachep); |
1007 | if (n->alien && n->alien[nodeid]) { | 992 | if (n->alien && n->alien[page_node]) { |
1008 | alien = n->alien[nodeid]; | 993 | alien = n->alien[page_node]; |
1009 | ac = &alien->ac; | 994 | ac = &alien->ac; |
1010 | spin_lock(&alien->lock); | 995 | spin_lock(&alien->lock); |
1011 | if (unlikely(ac->avail == ac->limit)) { | 996 | if (unlikely(ac->avail == ac->limit)) { |
1012 | STATS_INC_ACOVERFLOW(cachep); | 997 | STATS_INC_ACOVERFLOW(cachep); |
1013 | __drain_alien_cache(cachep, ac, nodeid, &list); | 998 | __drain_alien_cache(cachep, ac, page_node, &list); |
1014 | } | 999 | } |
1015 | ac_put_obj(cachep, ac, objp); | 1000 | ac_put_obj(cachep, ac, objp); |
1016 | spin_unlock(&alien->lock); | 1001 | spin_unlock(&alien->lock); |
1017 | slabs_destroy(cachep, &list); | 1002 | slabs_destroy(cachep, &list); |
1018 | } else { | 1003 | } else { |
1019 | n = get_node(cachep, nodeid); | 1004 | n = get_node(cachep, page_node); |
1020 | spin_lock(&n->list_lock); | 1005 | spin_lock(&n->list_lock); |
1021 | free_block(cachep, &objp, 1, nodeid, &list); | 1006 | free_block(cachep, &objp, 1, page_node, &list); |
1022 | spin_unlock(&n->list_lock); | 1007 | spin_unlock(&n->list_lock); |
1023 | slabs_destroy(cachep, &list); | 1008 | slabs_destroy(cachep, &list); |
1024 | } | 1009 | } |
1025 | return 1; | 1010 | return 1; |
1026 | } | 1011 | } |
1012 | |||
1013 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1014 | { | ||
1015 | int page_node = page_to_nid(virt_to_page(objp)); | ||
1016 | int node = numa_mem_id(); | ||
1017 | /* | ||
1018 | * Make sure we are not freeing a object from another node to the array | ||
1019 | * cache on this cpu. | ||
1020 | */ | ||
1021 | if (likely(node == page_node)) | ||
1022 | return 0; | ||
1023 | |||
1024 | return __cache_free_alien(cachep, objp, node, page_node); | ||
1025 | } | ||
1027 | #endif | 1026 | #endif |
1028 | 1027 | ||
1029 | /* | 1028 | /* |
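The split above keeps only the cheap same-node comparison in the inline cache_free_alien() and moves the remote-node work out of line. A compilable userspace sketch of that shape (the remote path is a stub standing in for __cache_free_alien()):

#include <stdio.h>

/* Stub standing in for __cache_free_alien(): the out-of-line remote path. */
static int free_to_remote_node(void *objp, int node, int page_node)
{
        (void)objp; (void)node; (void)page_node;
        return 1;
}

/* Mirrors the new inline cache_free_alien(): only the cheap test stays here. */
static int cache_free_alien_model(void *objp, int cpu_node, int page_node)
{
        if (cpu_node == page_node)
                return 0;               /* caller frees to its local cache */
        return free_to_remote_node(objp, cpu_node, page_node);
}

int main(void)
{
        int obj;

        printf("%d\n", cache_free_alien_model(&obj, 0, 0));    /* 0: local */
        printf("%d\n", cache_free_alien_model(&obj, 0, 1));    /* 1: remote */
        return 0;
}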
@@ -1092,24 +1091,25 @@ static void cpuup_canceled(long cpu) | |||
1092 | struct alien_cache **alien; | 1091 | struct alien_cache **alien; |
1093 | LIST_HEAD(list); | 1092 | LIST_HEAD(list); |
1094 | 1093 | ||
1095 | /* cpu is dead; no one can alloc from it. */ | ||
1096 | nc = cachep->array[cpu]; | ||
1097 | cachep->array[cpu] = NULL; | ||
1098 | n = get_node(cachep, node); | 1094 | n = get_node(cachep, node); |
1099 | |||
1100 | if (!n) | 1095 | if (!n) |
1101 | goto free_array_cache; | 1096 | continue; |
1102 | 1097 | ||
1103 | spin_lock_irq(&n->list_lock); | 1098 | spin_lock_irq(&n->list_lock); |
1104 | 1099 | ||
1105 | /* Free limit for this kmem_cache_node */ | 1100 | /* Free limit for this kmem_cache_node */ |
1106 | n->free_limit -= cachep->batchcount; | 1101 | n->free_limit -= cachep->batchcount; |
1107 | if (nc) | 1102 | |
1103 | /* cpu is dead; no one can alloc from it. */ | ||
1104 | nc = per_cpu_ptr(cachep->cpu_cache, cpu); | ||
1105 | if (nc) { | ||
1108 | free_block(cachep, nc->entry, nc->avail, node, &list); | 1106 | free_block(cachep, nc->entry, nc->avail, node, &list); |
1107 | nc->avail = 0; | ||
1108 | } | ||
1109 | 1109 | ||
1110 | if (!cpumask_empty(mask)) { | 1110 | if (!cpumask_empty(mask)) { |
1111 | spin_unlock_irq(&n->list_lock); | 1111 | spin_unlock_irq(&n->list_lock); |
1112 | goto free_array_cache; | 1112 | goto free_slab; |
1113 | } | 1113 | } |
1114 | 1114 | ||
1115 | shared = n->shared; | 1115 | shared = n->shared; |
@@ -1129,9 +1129,9 @@ static void cpuup_canceled(long cpu) | |||
1129 | drain_alien_cache(cachep, alien); | 1129 | drain_alien_cache(cachep, alien); |
1130 | free_alien_cache(alien); | 1130 | free_alien_cache(alien); |
1131 | } | 1131 | } |
1132 | free_array_cache: | 1132 | |
1133 | free_slab: | ||
1133 | slabs_destroy(cachep, &list); | 1134 | slabs_destroy(cachep, &list); |
1134 | kfree(nc); | ||
1135 | } | 1135 | } |
1136 | /* | 1136 | /* |
1137 | * In the previous loop, all the objects were freed to | 1137 | * In the previous loop, all the objects were freed to |
@@ -1168,32 +1168,23 @@ static int cpuup_prepare(long cpu) | |||
1168 | * array caches | 1168 | * array caches |
1169 | */ | 1169 | */ |
1170 | list_for_each_entry(cachep, &slab_caches, list) { | 1170 | list_for_each_entry(cachep, &slab_caches, list) { |
1171 | struct array_cache *nc; | ||
1172 | struct array_cache *shared = NULL; | 1171 | struct array_cache *shared = NULL; |
1173 | struct alien_cache **alien = NULL; | 1172 | struct alien_cache **alien = NULL; |
1174 | 1173 | ||
1175 | nc = alloc_arraycache(node, cachep->limit, | ||
1176 | cachep->batchcount, GFP_KERNEL); | ||
1177 | if (!nc) | ||
1178 | goto bad; | ||
1179 | if (cachep->shared) { | 1174 | if (cachep->shared) { |
1180 | shared = alloc_arraycache(node, | 1175 | shared = alloc_arraycache(node, |
1181 | cachep->shared * cachep->batchcount, | 1176 | cachep->shared * cachep->batchcount, |
1182 | 0xbaadf00d, GFP_KERNEL); | 1177 | 0xbaadf00d, GFP_KERNEL); |
1183 | if (!shared) { | 1178 | if (!shared) |
1184 | kfree(nc); | ||
1185 | goto bad; | 1179 | goto bad; |
1186 | } | ||
1187 | } | 1180 | } |
1188 | if (use_alien_caches) { | 1181 | if (use_alien_caches) { |
1189 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); | 1182 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); |
1190 | if (!alien) { | 1183 | if (!alien) { |
1191 | kfree(shared); | 1184 | kfree(shared); |
1192 | kfree(nc); | ||
1193 | goto bad; | 1185 | goto bad; |
1194 | } | 1186 | } |
1195 | } | 1187 | } |
1196 | cachep->array[cpu] = nc; | ||
1197 | n = get_node(cachep, node); | 1188 | n = get_node(cachep, node); |
1198 | BUG_ON(!n); | 1189 | BUG_ON(!n); |
1199 | 1190 | ||
@@ -1385,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) | |||
1385 | } | 1376 | } |
1386 | 1377 | ||
1387 | /* | 1378 | /* |
1388 | * The memory after the last cpu cache pointer is used for the | ||
1389 | * the node pointer. | ||
1390 | */ | ||
1391 | static void setup_node_pointer(struct kmem_cache *cachep) | ||
1392 | { | ||
1393 | cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; | ||
1394 | } | ||
1395 | |||
1396 | /* | ||
1397 | * Initialisation. Called after the page allocator have been initialised and | 1379 | * Initialisation. Called after the page allocator have been initialised and |
1398 | * before smp_init(). | 1380 | * before smp_init(). |
1399 | */ | 1381 | */ |
@@ -1404,7 +1386,6 @@ void __init kmem_cache_init(void) | |||
1404 | BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < | 1386 | BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < |
1405 | sizeof(struct rcu_head)); | 1387 | sizeof(struct rcu_head)); |
1406 | kmem_cache = &kmem_cache_boot; | 1388 | kmem_cache = &kmem_cache_boot; |
1407 | setup_node_pointer(kmem_cache); | ||
1408 | 1389 | ||
1409 | if (num_possible_nodes() == 1) | 1390 | if (num_possible_nodes() == 1) |
1410 | use_alien_caches = 0; | 1391 | use_alien_caches = 0; |
@@ -1412,8 +1393,6 @@ void __init kmem_cache_init(void) | |||
1412 | for (i = 0; i < NUM_INIT_LISTS; i++) | 1393 | for (i = 0; i < NUM_INIT_LISTS; i++) |
1413 | kmem_cache_node_init(&init_kmem_cache_node[i]); | 1394 | kmem_cache_node_init(&init_kmem_cache_node[i]); |
1414 | 1395 | ||
1415 | set_up_node(kmem_cache, CACHE_CACHE); | ||
1416 | |||
1417 | /* | 1396 | /* |
1418 | * Fragmentation resistance on low memory - only use bigger | 1397 | * Fragmentation resistance on low memory - only use bigger |
1419 | * page orders on machines with more than 32MB of memory if | 1398 | * page orders on machines with more than 32MB of memory if |
@@ -1448,49 +1427,22 @@ void __init kmem_cache_init(void) | |||
1448 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1427 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1449 | */ | 1428 | */ |
1450 | create_boot_cache(kmem_cache, "kmem_cache", | 1429 | create_boot_cache(kmem_cache, "kmem_cache", |
1451 | offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1430 | offsetof(struct kmem_cache, node) + |
1452 | nr_node_ids * sizeof(struct kmem_cache_node *), | 1431 | nr_node_ids * sizeof(struct kmem_cache_node *), |
1453 | SLAB_HWCACHE_ALIGN); | 1432 | SLAB_HWCACHE_ALIGN); |
1454 | list_add(&kmem_cache->list, &slab_caches); | 1433 | list_add(&kmem_cache->list, &slab_caches); |
1455 | 1434 | slab_state = PARTIAL; | |
1456 | /* 2+3) create the kmalloc caches */ | ||
1457 | 1435 | ||
1458 | /* | 1436 | /* |
1459 | * Initialize the caches that provide memory for the array cache and the | 1437 | * Initialize the caches that provide memory for the kmem_cache_node |
1460 | * kmem_cache_node structures first. Without this, further allocations will | 1438 | * structures first. Without this, further allocations will bug. |
1461 | * bug. | ||
1462 | */ | 1439 | */ |
1463 | 1440 | kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", | |
1464 | kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", | ||
1465 | kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); | ||
1466 | |||
1467 | if (INDEX_AC != INDEX_NODE) | ||
1468 | kmalloc_caches[INDEX_NODE] = | ||
1469 | create_kmalloc_cache("kmalloc-node", | ||
1470 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); | 1441 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); |
1442 | slab_state = PARTIAL_NODE; | ||
1471 | 1443 | ||
1472 | slab_early_init = 0; | 1444 | slab_early_init = 0; |
1473 | 1445 | ||
1474 | /* 4) Replace the bootstrap head arrays */ | ||
1475 | { | ||
1476 | struct array_cache *ptr; | ||
1477 | |||
1478 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | ||
1479 | |||
1480 | memcpy(ptr, cpu_cache_get(kmem_cache), | ||
1481 | sizeof(struct arraycache_init)); | ||
1482 | |||
1483 | kmem_cache->array[smp_processor_id()] = ptr; | ||
1484 | |||
1485 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | ||
1486 | |||
1487 | BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) | ||
1488 | != &initarray_generic.cache); | ||
1489 | memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), | ||
1490 | sizeof(struct arraycache_init)); | ||
1491 | |||
1492 | kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; | ||
1493 | } | ||
1494 | /* 5) Replace the bootstrap kmem_cache_node */ | 1446 | /* 5) Replace the bootstrap kmem_cache_node */ |
1495 | { | 1447 | { |
1496 | int nid; | 1448 | int nid; |
@@ -1498,13 +1450,8 @@ void __init kmem_cache_init(void) | |||
1498 | for_each_online_node(nid) { | 1450 | for_each_online_node(nid) { |
1499 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); | 1451 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); |
1500 | 1452 | ||
1501 | init_list(kmalloc_caches[INDEX_AC], | 1453 | init_list(kmalloc_caches[INDEX_NODE], |
1502 | &init_kmem_cache_node[SIZE_AC + nid], nid); | ||
1503 | |||
1504 | if (INDEX_AC != INDEX_NODE) { | ||
1505 | init_list(kmalloc_caches[INDEX_NODE], | ||
1506 | &init_kmem_cache_node[SIZE_NODE + nid], nid); | 1454 | &init_kmem_cache_node[SIZE_NODE + nid], nid); |
1507 | } | ||
1508 | } | 1455 | } |
1509 | } | 1456 | } |
1510 | 1457 | ||
@@ -2037,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2037 | return left_over; | 1984 | return left_over; |
2038 | } | 1985 | } |
2039 | 1986 | ||
1987 | static struct array_cache __percpu *alloc_kmem_cache_cpus( | ||
1988 | struct kmem_cache *cachep, int entries, int batchcount) | ||
1989 | { | ||
1990 | int cpu; | ||
1991 | size_t size; | ||
1992 | struct array_cache __percpu *cpu_cache; | ||
1993 | |||
1994 | size = sizeof(void *) * entries + sizeof(struct array_cache); | ||
1995 | cpu_cache = __alloc_percpu(size, 0); | ||
1996 | |||
1997 | if (!cpu_cache) | ||
1998 | return NULL; | ||
1999 | |||
2000 | for_each_possible_cpu(cpu) { | ||
2001 | init_arraycache(per_cpu_ptr(cpu_cache, cpu), | ||
2002 | entries, batchcount); | ||
2003 | } | ||
2004 | |||
2005 | return cpu_cache; | ||
2006 | } | ||
2007 | |||
2040 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | 2008 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2041 | { | 2009 | { |
2042 | if (slab_state >= FULL) | 2010 | if (slab_state >= FULL) |
2043 | return enable_cpucache(cachep, gfp); | 2011 | return enable_cpucache(cachep, gfp); |
2044 | 2012 | ||
2013 | cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); | ||
2014 | if (!cachep->cpu_cache) | ||
2015 | return 1; | ||
2016 | |||
2045 | if (slab_state == DOWN) { | 2017 | if (slab_state == DOWN) { |
2046 | /* | 2018 | /* Creation of first cache (kmem_cache). */ |
2047 | * Note: Creation of first cache (kmem_cache). | 2019 | set_up_node(kmem_cache, CACHE_CACHE); |
2048 | * The setup_node is taken care | ||
2049 | * of by the caller of __kmem_cache_create | ||
2050 | */ | ||
2051 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
2052 | slab_state = PARTIAL; | ||
2053 | } else if (slab_state == PARTIAL) { | 2020 | } else if (slab_state == PARTIAL) { |
2054 | /* | 2021 | /* For kmem_cache_node */ |
2055 | * Note: the second kmem_cache_create must create the cache | 2022 | set_up_node(cachep, SIZE_NODE); |
2056 | * that's used by kmalloc(24), otherwise the creation of | ||
2057 | * further caches will BUG(). | ||
2058 | */ | ||
2059 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
2060 | |||
2061 | /* | ||
2062 | * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is | ||
2063 | * the second cache, then we need to set up all its node/, | ||
2064 | * otherwise the creation of further caches will BUG(). | ||
2065 | */ | ||
2066 | set_up_node(cachep, SIZE_AC); | ||
2067 | if (INDEX_AC == INDEX_NODE) | ||
2068 | slab_state = PARTIAL_NODE; | ||
2069 | else | ||
2070 | slab_state = PARTIAL_ARRAYCACHE; | ||
2071 | } else { | 2023 | } else { |
2072 | /* Remaining boot caches */ | 2024 | int node; |
2073 | cachep->array[smp_processor_id()] = | ||
2074 | kmalloc(sizeof(struct arraycache_init), gfp); | ||
2075 | 2025 | ||
2076 | if (slab_state == PARTIAL_ARRAYCACHE) { | 2026 | for_each_online_node(node) { |
2077 | set_up_node(cachep, SIZE_NODE); | 2027 | cachep->node[node] = kmalloc_node( |
2078 | slab_state = PARTIAL_NODE; | 2028 | sizeof(struct kmem_cache_node), gfp, node); |
2079 | } else { | 2029 | BUG_ON(!cachep->node[node]); |
2080 | int node; | 2030 | kmem_cache_node_init(cachep->node[node]); |
2081 | for_each_online_node(node) { | ||
2082 | cachep->node[node] = | ||
2083 | kmalloc_node(sizeof(struct kmem_cache_node), | ||
2084 | gfp, node); | ||
2085 | BUG_ON(!cachep->node[node]); | ||
2086 | kmem_cache_node_init(cachep->node[node]); | ||
2087 | } | ||
2088 | } | 2031 | } |
2089 | } | 2032 | } |
2033 | |||
2090 | cachep->node[numa_mem_id()]->next_reap = | 2034 | cachep->node[numa_mem_id()]->next_reap = |
2091 | jiffies + REAPTIMEOUT_NODE + | 2035 | jiffies + REAPTIMEOUT_NODE + |
2092 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; | 2036 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; |
@@ -2100,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2100 | return 0; | 2044 | return 0; |
2101 | } | 2045 | } |
2102 | 2046 | ||
2047 | unsigned long kmem_cache_flags(unsigned long object_size, | ||
2048 | unsigned long flags, const char *name, | ||
2049 | void (*ctor)(void *)) | ||
2050 | { | ||
2051 | return flags; | ||
2052 | } | ||
2053 | |||
2054 | struct kmem_cache * | ||
2055 | __kmem_cache_alias(const char *name, size_t size, size_t align, | ||
2056 | unsigned long flags, void (*ctor)(void *)) | ||
2057 | { | ||
2058 | struct kmem_cache *cachep; | ||
2059 | |||
2060 | cachep = find_mergeable(size, align, flags, name, ctor); | ||
2061 | if (cachep) { | ||
2062 | cachep->refcount++; | ||
2063 | |||
2064 | /* | ||
2065 | * Adjust the object sizes so that we clear | ||
2066 | * the complete object on kzalloc. | ||
2067 | */ | ||
2068 | cachep->object_size = max_t(int, cachep->object_size, size); | ||
2069 | } | ||
2070 | return cachep; | ||
2071 | } | ||
2072 | |||
2103 | /** | 2073 | /** |
2104 | * __kmem_cache_create - Create a cache. | 2074 | * __kmem_cache_create - Create a cache. |
2105 | * @cachep: cache management descriptor | 2075 | * @cachep: cache management descriptor |
@@ -2124,7 +2094,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2124 | int | 2094 | int |
2125 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | 2095 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) |
2126 | { | 2096 | { |
2127 | size_t left_over, freelist_size, ralign; | 2097 | size_t left_over, freelist_size; |
2098 | size_t ralign = BYTES_PER_WORD; | ||
2128 | gfp_t gfp; | 2099 | gfp_t gfp; |
2129 | int err; | 2100 | int err; |
2130 | size_t size = cachep->size; | 2101 | size_t size = cachep->size; |
@@ -2157,14 +2128,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2157 | size &= ~(BYTES_PER_WORD - 1); | 2128 | size &= ~(BYTES_PER_WORD - 1); |
2158 | } | 2129 | } |
2159 | 2130 | ||
2160 | /* | ||
2161 | * Redzoning and user store require word alignment or possibly larger. | ||
2162 | * Note this will be overridden by architecture or caller mandated | ||
2163 | * alignment if either is greater than BYTES_PER_WORD. | ||
2164 | */ | ||
2165 | if (flags & SLAB_STORE_USER) | ||
2166 | ralign = BYTES_PER_WORD; | ||
2167 | |||
2168 | if (flags & SLAB_RED_ZONE) { | 2131 | if (flags & SLAB_RED_ZONE) { |
2169 | ralign = REDZONE_ALIGN; | 2132 | ralign = REDZONE_ALIGN; |
2170 | /* If redzoning, ensure that the second redzone is suitably | 2133 | /* If redzoning, ensure that the second redzone is suitably |
@@ -2190,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2190 | else | 2153 | else |
2191 | gfp = GFP_NOWAIT; | 2154 | gfp = GFP_NOWAIT; |
2192 | 2155 | ||
2193 | setup_node_pointer(cachep); | ||
2194 | #if DEBUG | 2156 | #if DEBUG |
2195 | 2157 | ||
2196 | /* | 2158 | /* |
@@ -2447,8 +2409,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) | |||
2447 | if (rc) | 2409 | if (rc) |
2448 | return rc; | 2410 | return rc; |
2449 | 2411 | ||
2450 | for_each_online_cpu(i) | 2412 | free_percpu(cachep->cpu_cache); |
2451 | kfree(cachep->array[i]); | ||
2452 | 2413 | ||
2453 | /* NUMA: free the node structures */ | 2414 | /* NUMA: free the node structures */ |
2454 | for_each_kmem_cache_node(cachep, i, n) { | 2415 | for_each_kmem_cache_node(cachep, i, n) { |
@@ -2994,7 +2955,7 @@ out: | |||
2994 | 2955 | ||
2995 | #ifdef CONFIG_NUMA | 2956 | #ifdef CONFIG_NUMA |
2996 | /* | 2957 | /* |
2997 | * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. | 2958 | * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. |
2998 | * | 2959 | * |
2999 | * If we are in_interrupt, then process context, including cpusets and | 2960 | * If we are in_interrupt, then process context, including cpusets and |
3000 | * mempolicy, may not apply and should not be used for allocation policy. | 2961 | * mempolicy, may not apply and should not be used for allocation policy. |
@@ -3226,7 +3187,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3226 | { | 3187 | { |
3227 | void *objp; | 3188 | void *objp; |
3228 | 3189 | ||
3229 | if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { | 3190 | if (current->mempolicy || cpuset_do_slab_mem_spread()) { |
3230 | objp = alternate_node_alloc(cache, flags); | 3191 | objp = alternate_node_alloc(cache, flags); |
3231 | if (objp) | 3192 | if (objp) |
3232 | goto out; | 3193 | goto out; |
@@ -3406,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3406 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) | 3367 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
3407 | return; | 3368 | return; |
3408 | 3369 | ||
3409 | if (likely(ac->avail < ac->limit)) { | 3370 | if (ac->avail < ac->limit) { |
3410 | STATS_INC_FREEHIT(cachep); | 3371 | STATS_INC_FREEHIT(cachep); |
3411 | } else { | 3372 | } else { |
3412 | STATS_INC_FREEMISS(cachep); | 3373 | STATS_INC_FREEMISS(cachep); |
@@ -3503,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) | |||
3503 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); | 3464 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); |
3504 | } | 3465 | } |
3505 | 3466 | ||
3506 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | ||
3507 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3467 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3508 | { | 3468 | { |
3509 | return __do_kmalloc_node(size, flags, node, _RET_IP_); | 3469 | return __do_kmalloc_node(size, flags, node, _RET_IP_); |
@@ -3516,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | |||
3516 | return __do_kmalloc_node(size, flags, node, caller); | 3476 | return __do_kmalloc_node(size, flags, node, caller); |
3517 | } | 3477 | } |
3518 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | 3478 | EXPORT_SYMBOL(__kmalloc_node_track_caller); |
3519 | #else | ||
3520 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3521 | { | ||
3522 | return __do_kmalloc_node(size, flags, node, 0); | ||
3523 | } | ||
3524 | EXPORT_SYMBOL(__kmalloc_node); | ||
3525 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ | ||
3526 | #endif /* CONFIG_NUMA */ | 3479 | #endif /* CONFIG_NUMA */ |
3527 | 3480 | ||
3528 | /** | 3481 | /** |
@@ -3548,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3548 | return ret; | 3501 | return ret; |
3549 | } | 3502 | } |
3550 | 3503 | ||
3551 | |||
3552 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | ||
3553 | void *__kmalloc(size_t size, gfp_t flags) | 3504 | void *__kmalloc(size_t size, gfp_t flags) |
3554 | { | 3505 | { |
3555 | return __do_kmalloc(size, flags, _RET_IP_); | 3506 | return __do_kmalloc(size, flags, _RET_IP_); |
@@ -3562,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) | |||
3562 | } | 3513 | } |
3563 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3514 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3564 | 3515 | ||
3565 | #else | ||
3566 | void *__kmalloc(size_t size, gfp_t flags) | ||
3567 | { | ||
3568 | return __do_kmalloc(size, flags, 0); | ||
3569 | } | ||
3570 | EXPORT_SYMBOL(__kmalloc); | ||
3571 | #endif | ||
3572 | |||
3573 | /** | 3516 | /** |
3574 | * kmem_cache_free - Deallocate an object | 3517 | * kmem_cache_free - Deallocate an object |
3575 | * @cachep: The cache the allocation was from. | 3518 | * @cachep: The cache the allocation was from. |
@@ -3714,72 +3657,45 @@ fail: | |||
3714 | return -ENOMEM; | 3657 | return -ENOMEM; |
3715 | } | 3658 | } |
3716 | 3659 | ||
3717 | struct ccupdate_struct { | ||
3718 | struct kmem_cache *cachep; | ||
3719 | struct array_cache *new[0]; | ||
3720 | }; | ||
3721 | |||
3722 | static void do_ccupdate_local(void *info) | ||
3723 | { | ||
3724 | struct ccupdate_struct *new = info; | ||
3725 | struct array_cache *old; | ||
3726 | |||
3727 | check_irq_off(); | ||
3728 | old = cpu_cache_get(new->cachep); | ||
3729 | |||
3730 | new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; | ||
3731 | new->new[smp_processor_id()] = old; | ||
3732 | } | ||
3733 | |||
3734 | /* Always called with the slab_mutex held */ | 3660 | /* Always called with the slab_mutex held */ |
3735 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3661 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3736 | int batchcount, int shared, gfp_t gfp) | 3662 | int batchcount, int shared, gfp_t gfp) |
3737 | { | 3663 | { |
3738 | struct ccupdate_struct *new; | 3664 | struct array_cache __percpu *cpu_cache, *prev; |
3739 | int i; | 3665 | int cpu; |
3740 | 3666 | ||
3741 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), | 3667 | cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); |
3742 | gfp); | 3668 | if (!cpu_cache) |
3743 | if (!new) | ||
3744 | return -ENOMEM; | 3669 | return -ENOMEM; |
3745 | 3670 | ||
3746 | for_each_online_cpu(i) { | 3671 | prev = cachep->cpu_cache; |
3747 | new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, | 3672 | cachep->cpu_cache = cpu_cache; |
3748 | batchcount, gfp); | 3673 | kick_all_cpus_sync(); |
3749 | if (!new->new[i]) { | ||
3750 | for (i--; i >= 0; i--) | ||
3751 | kfree(new->new[i]); | ||
3752 | kfree(new); | ||
3753 | return -ENOMEM; | ||
3754 | } | ||
3755 | } | ||
3756 | new->cachep = cachep; | ||
3757 | |||
3758 | on_each_cpu(do_ccupdate_local, (void *)new, 1); | ||
3759 | 3674 | ||
3760 | check_irq_on(); | 3675 | check_irq_on(); |
3761 | cachep->batchcount = batchcount; | 3676 | cachep->batchcount = batchcount; |
3762 | cachep->limit = limit; | 3677 | cachep->limit = limit; |
3763 | cachep->shared = shared; | 3678 | cachep->shared = shared; |
3764 | 3679 | ||
3765 | for_each_online_cpu(i) { | 3680 | if (!prev) |
3681 | goto alloc_node; | ||
3682 | |||
3683 | for_each_online_cpu(cpu) { | ||
3766 | LIST_HEAD(list); | 3684 | LIST_HEAD(list); |
3767 | struct array_cache *ccold = new->new[i]; | ||
3768 | int node; | 3685 | int node; |
3769 | struct kmem_cache_node *n; | 3686 | struct kmem_cache_node *n; |
3687 | struct array_cache *ac = per_cpu_ptr(prev, cpu); | ||
3770 | 3688 | ||
3771 | if (!ccold) | 3689 | node = cpu_to_mem(cpu); |
3772 | continue; | ||
3773 | |||
3774 | node = cpu_to_mem(i); | ||
3775 | n = get_node(cachep, node); | 3690 | n = get_node(cachep, node); |
3776 | spin_lock_irq(&n->list_lock); | 3691 | spin_lock_irq(&n->list_lock); |
3777 | free_block(cachep, ccold->entry, ccold->avail, node, &list); | 3692 | free_block(cachep, ac->entry, ac->avail, node, &list); |
3778 | spin_unlock_irq(&n->list_lock); | 3693 | spin_unlock_irq(&n->list_lock); |
3779 | slabs_destroy(cachep, &list); | 3694 | slabs_destroy(cachep, &list); |
3780 | kfree(ccold); | ||
3781 | } | 3695 | } |
3782 | kfree(new); | 3696 | free_percpu(prev); |
3697 | |||
3698 | alloc_node: | ||
3783 | return alloc_kmem_cache_node(cachep, gfp); | 3699 | return alloc_kmem_cache_node(cachep, gfp); |
3784 | } | 3700 | } |
3785 | 3701 | ||
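The rewritten __do_tune_cpucache() above replaces the on_each_cpu() pointer swap with a simpler sequence: allocate a whole new percpu array_cache set, publish it, synchronize, then drain and free the previous one. A userspace sketch of that resize pattern (a fixed CPU count and plain pointers stand in for the percpu machinery and kick_all_cpus_sync()):

#include <stdlib.h>

#define NCPUS 4                         /* illustrative, fixed CPU count */

struct array_cache {
        int avail;
        int limit;
};

static struct array_cache *cpu_cache;   /* one slot per CPU */

static void drain(struct array_cache *ac)
{
        ac->avail = 0;                  /* free_block() in the kernel */
}

static int tune_cpu_cache(int limit)
{
        struct array_cache *next = calloc(NCPUS, sizeof(*next));
        struct array_cache *prev;

        if (!next)
                return -1;
        for (int cpu = 0; cpu < NCPUS; cpu++)
                next[cpu].limit = limit;

        prev = cpu_cache;
        cpu_cache = next;               /* publish; kernel then syncs all CPUs */

        if (prev) {                     /* "goto alloc_node" skips this part */
                for (int cpu = 0; cpu < NCPUS; cpu++)
                        drain(&prev[cpu]);
                free(prev);
        }
        return 0;
}

int main(void)
{
        tune_cpu_cache(8);              /* first generation: nothing to drain */
        tune_cpu_cache(32);             /* resize: previous caches drained, freed */
        free(cpu_cache);
        return 0;
}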
@@ -4262,19 +4178,15 @@ static const struct seq_operations slabstats_op = { | |||
4262 | 4178 | ||
4263 | static int slabstats_open(struct inode *inode, struct file *file) | 4179 | static int slabstats_open(struct inode *inode, struct file *file) |
4264 | { | 4180 | { |
4265 | unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); | 4181 | unsigned long *n; |
4266 | int ret = -ENOMEM; | 4182 | |
4267 | if (n) { | 4183 | n = __seq_open_private(file, &slabstats_op, PAGE_SIZE); |
4268 | ret = seq_open(file, &slabstats_op); | 4184 | if (!n) |
4269 | if (!ret) { | 4185 | return -ENOMEM; |
4270 | struct seq_file *m = file->private_data; | 4186 | |
4271 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); | 4187 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); |
4272 | m->private = n; | 4188 | |
4273 | n = NULL; | 4189 | return 0; |
4274 | } | ||
4275 | kfree(n); | ||
4276 | } | ||
4277 | return ret; | ||
4278 | } | 4190 | } |
4279 | 4191 | ||
4280 | static const struct file_operations proc_slabstats_operations = { | 4192 | static const struct file_operations proc_slabstats_operations = { |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -4,6 +4,41 @@ | |||
4 | * Internal slab definitions | 4 | * Internal slab definitions |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #ifdef CONFIG_SLOB | ||
8 | /* | ||
9 | * Common fields provided in kmem_cache by all slab allocators | ||
10 | * This struct is either used directly by the allocator (SLOB) | ||
11 | * or the allocator must include definitions for all fields | ||
12 | * provided in kmem_cache_common in their definition of kmem_cache. | ||
13 | * | ||
14 | * Once we can do anonymous structs (C11 standard) we could put a | ||
15 | * anonymous struct definition in these allocators so that the | ||
16 | * separate allocations in the kmem_cache structure of SLAB and | ||
17 | * SLUB is no longer needed. | ||
18 | */ | ||
19 | struct kmem_cache { | ||
20 | unsigned int object_size;/* The original size of the object */ | ||
21 | unsigned int size; /* The aligned/padded/added on size */ | ||
22 | unsigned int align; /* Alignment as calculated */ | ||
23 | unsigned long flags; /* Active flags on the slab */ | ||
24 | const char *name; /* Slab name for sysfs */ | ||
25 | int refcount; /* Use counter */ | ||
26 | void (*ctor)(void *); /* Called on object slot creation */ | ||
27 | struct list_head list; /* List of all slab caches on the system */ | ||
28 | }; | ||
29 | |||
30 | #endif /* CONFIG_SLOB */ | ||
31 | |||
32 | #ifdef CONFIG_SLAB | ||
33 | #include <linux/slab_def.h> | ||
34 | #endif | ||
35 | |||
36 | #ifdef CONFIG_SLUB | ||
37 | #include <linux/slub_def.h> | ||
38 | #endif | ||
39 | |||
40 | #include <linux/memcontrol.h> | ||
41 | |||
7 | /* | 42 | /* |
8 | * State of the slab allocator. | 43 | * State of the slab allocator. |
9 | * | 44 | * |
@@ -15,7 +50,6 @@ | |||
15 | enum slab_state { | 50 | enum slab_state { |
16 | DOWN, /* No slab functionality yet */ | 51 | DOWN, /* No slab functionality yet */ |
17 | PARTIAL, /* SLUB: kmem_cache_node available */ | 52 | PARTIAL, /* SLUB: kmem_cache_node available */ |
18 | PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ | ||
19 | PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ | 53 | PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ |
20 | UP, /* Slab caches usable but not all extras yet */ | 54 | UP, /* Slab caches usable but not all extras yet */ |
21 | FULL /* Everything is working */ | 55 | FULL /* Everything is working */ |
@@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, | |||
53 | size_t size, unsigned long flags); | 87 | size_t size, unsigned long flags); |
54 | 88 | ||
55 | struct mem_cgroup; | 89 | struct mem_cgroup; |
56 | #ifdef CONFIG_SLUB | 90 | |
91 | int slab_unmergeable(struct kmem_cache *s); | ||
92 | struct kmem_cache *find_mergeable(size_t size, size_t align, | ||
93 | unsigned long flags, const char *name, void (*ctor)(void *)); | ||
94 | #ifndef CONFIG_SLOB | ||
57 | struct kmem_cache * | 95 | struct kmem_cache * |
58 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 96 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
59 | unsigned long flags, void (*ctor)(void *)); | 97 | unsigned long flags, void (*ctor)(void *)); |
98 | |||
99 | unsigned long kmem_cache_flags(unsigned long object_size, | ||
100 | unsigned long flags, const char *name, | ||
101 | void (*ctor)(void *)); | ||
60 | #else | 102 | #else |
61 | static inline struct kmem_cache * | 103 | static inline struct kmem_cache * |
62 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 104 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
63 | unsigned long flags, void (*ctor)(void *)) | 105 | unsigned long flags, void (*ctor)(void *)) |
64 | { return NULL; } | 106 | { return NULL; } |
107 | |||
108 | static inline unsigned long kmem_cache_flags(unsigned long object_size, | ||
109 | unsigned long flags, const char *name, | ||
110 | void (*ctor)(void *)) | ||
111 | { | ||
112 | return flags; | ||
113 | } | ||
65 | #endif | 114 | #endif |
66 | 115 | ||
67 | 116 | ||
@@ -303,8 +352,8 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
303 | * a kmem_cache_node structure allocated (which is true for all online nodes) | 352 | * a kmem_cache_node structure allocated (which is true for all online nodes) |
304 | */ | 353 | */ |
305 | #define for_each_kmem_cache_node(__s, __node, __n) \ | 354 | #define for_each_kmem_cache_node(__s, __node, __n) \ |
306 | for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ | 355 | for (__node = 0; __node < nr_node_ids; __node++) \ |
307 | if (__n) | 356 | if ((__n = get_node(__s, __node))) |
308 | 357 | ||
309 | #endif | 358 | #endif |
310 | 359 | ||
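The for_each_kmem_cache_node() fix above moves the get_node() call inside the bounds check, so the old form's out-of-range lookup on the terminating iteration can no longer happen, and the body still runs only for non-NULL nodes. A userspace model of the corrected shape (the table and its size are invented):

#include <stdio.h>

#define NR_IDS 4                        /* plays the role of nr_node_ids */

static const char *table[NR_IDS] = { "n0", NULL, "n2", NULL };

/* Same shape as the fixed macro: bounds check first, fetch inside the guard. */
#define for_each_present(__i, __p)                              \
        for (__i = 0; __i < NR_IDS; __i++)                      \
                if ((__p = table[__i]))

int main(void)
{
        int i;
        const char *p;

        for_each_present(i, p)
                printf("%d: %s\n", i, p);       /* prints entries 0 and 2 only */
        return 0;
}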
diff --git a/mm/slab_common.c b/mm/slab_common.c index d319502b2403..3a6e0cfdf03a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -30,6 +30,43 @@ LIST_HEAD(slab_caches); | |||
30 | DEFINE_MUTEX(slab_mutex); | 30 | DEFINE_MUTEX(slab_mutex); |
31 | struct kmem_cache *kmem_cache; | 31 | struct kmem_cache *kmem_cache; |
32 | 32 | ||
33 | /* | ||
34 | * Set of flags that will prevent slab merging | ||
35 | */ | ||
36 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
37 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | ||
38 | SLAB_FAILSLAB) | ||
39 | |||
40 | #define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | ||
41 | SLAB_CACHE_DMA | SLAB_NOTRACK) | ||
42 | |||
43 | /* | ||
44 | * Merge control. If this is set then no merging of slab caches will occur. | ||
45 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
46 | */ | ||
47 | static int slab_nomerge; | ||
48 | |||
49 | static int __init setup_slab_nomerge(char *str) | ||
50 | { | ||
51 | slab_nomerge = 1; | ||
52 | return 1; | ||
53 | } | ||
54 | |||
55 | #ifdef CONFIG_SLUB | ||
56 | __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); | ||
57 | #endif | ||
58 | |||
59 | __setup("slab_nomerge", setup_slab_nomerge); | ||
60 | |||
61 | /* | ||
62 | * Determine the size of a slab object | ||
63 | */ | ||
64 | unsigned int kmem_cache_size(struct kmem_cache *s) | ||
65 | { | ||
66 | return s->object_size; | ||
67 | } | ||
68 | EXPORT_SYMBOL(kmem_cache_size); | ||
69 | |||
33 | #ifdef CONFIG_DEBUG_VM | 70 | #ifdef CONFIG_DEBUG_VM |
34 | static int kmem_cache_sanity_check(const char *name, size_t size) | 71 | static int kmem_cache_sanity_check(const char *name, size_t size) |
35 | { | 72 | { |
@@ -79,6 +116,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
79 | #endif | 116 | #endif |
80 | 117 | ||
81 | #ifdef CONFIG_MEMCG_KMEM | 118 | #ifdef CONFIG_MEMCG_KMEM |
119 | static int memcg_alloc_cache_params(struct mem_cgroup *memcg, | ||
120 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
121 | { | ||
122 | size_t size; | ||
123 | |||
124 | if (!memcg_kmem_enabled()) | ||
125 | return 0; | ||
126 | |||
127 | if (!memcg) { | ||
128 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
129 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
130 | } else | ||
131 | size = sizeof(struct memcg_cache_params); | ||
132 | |||
133 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
134 | if (!s->memcg_params) | ||
135 | return -ENOMEM; | ||
136 | |||
137 | if (memcg) { | ||
138 | s->memcg_params->memcg = memcg; | ||
139 | s->memcg_params->root_cache = root_cache; | ||
140 | } else | ||
141 | s->memcg_params->is_root_cache = true; | ||
142 | |||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | static void memcg_free_cache_params(struct kmem_cache *s) | ||
147 | { | ||
148 | kfree(s->memcg_params); | ||
149 | } | ||
150 | |||
151 | static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) | ||
152 | { | ||
153 | int size; | ||
154 | struct memcg_cache_params *new_params, *cur_params; | ||
155 | |||
156 | BUG_ON(!is_root_cache(s)); | ||
157 | |||
158 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
159 | size += num_memcgs * sizeof(void *); | ||
160 | |||
161 | new_params = kzalloc(size, GFP_KERNEL); | ||
162 | if (!new_params) | ||
163 | return -ENOMEM; | ||
164 | |||
165 | cur_params = s->memcg_params; | ||
166 | memcpy(new_params->memcg_caches, cur_params->memcg_caches, | ||
167 | memcg_limited_groups_array_size * sizeof(void *)); | ||
168 | |||
169 | new_params->is_root_cache = true; | ||
170 | |||
171 | rcu_assign_pointer(s->memcg_params, new_params); | ||
172 | if (cur_params) | ||
173 | kfree_rcu(cur_params, rcu_head); | ||
174 | |||
175 | return 0; | ||
176 | } | ||
177 | |||
82 | int memcg_update_all_caches(int num_memcgs) | 178 | int memcg_update_all_caches(int num_memcgs) |
83 | { | 179 | { |
84 | struct kmem_cache *s; | 180 | struct kmem_cache *s; |
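memcg_update_cache_params() above grows the root cache's memcg array by building a larger copy, publishing it with rcu_assign_pointer(), and retiring the old one via kfree_rcu(). A simplified userspace model of that grow-copy-publish step (the RCU deferral is collapsed into an immediate free, and the struct is a stand-in):

#include <stdlib.h>
#include <string.h>

struct params {
        int nr;
        void *caches[];                 /* like memcg_caches[] */
};

static struct params *cur;

static int grow_params(int new_nr)
{
        struct params *next = calloc(1, sizeof(*next) + new_nr * sizeof(void *));
        struct params *old;

        if (!next)
                return -1;
        next->nr = new_nr;
        if (cur)
                memcpy(next->caches, cur->caches, cur->nr * sizeof(void *));

        old = cur;
        cur = next;                     /* rcu_assign_pointer() in the kernel */
        free(old);                      /* kfree_rcu() in the kernel */
        return 0;
}

int main(void)
{
        grow_params(4);
        grow_params(8);                 /* old entries copied, old array retired */
        free(cur);
        return 0;
}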
@@ -89,9 +185,8 @@ int memcg_update_all_caches(int num_memcgs) | |||
89 | if (!is_root_cache(s)) | 185 | if (!is_root_cache(s)) |
90 | continue; | 186 | continue; |
91 | 187 | ||
92 | ret = memcg_update_cache_size(s, num_memcgs); | 188 | ret = memcg_update_cache_params(s, num_memcgs); |
93 | /* | 189 | /* |
94 | * See comment in memcontrol.c, memcg_update_cache_size: | ||
95 | * Instead of freeing the memory, we'll just leave the caches | 190 | * Instead of freeing the memory, we'll just leave the caches |
96 | * up to this point in an updated state. | 191 | * up to this point in an updated state. |
97 | */ | 192 | */ |
@@ -104,7 +199,80 @@ out: | |||
104 | mutex_unlock(&slab_mutex); | 199 | mutex_unlock(&slab_mutex); |
105 | return ret; | 200 | return ret; |
106 | } | 201 | } |
107 | #endif | 202 | #else |
203 | static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, | ||
204 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
205 | { | ||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | static inline void memcg_free_cache_params(struct kmem_cache *s) | ||
210 | { | ||
211 | } | ||
212 | #endif /* CONFIG_MEMCG_KMEM */ | ||
213 | |||
214 | /* | ||
215 | * Find a mergeable slab cache | ||
216 | */ | ||
217 | int slab_unmergeable(struct kmem_cache *s) | ||
218 | { | ||
219 | if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE)) | ||
220 | return 1; | ||
221 | |||
222 | if (!is_root_cache(s)) | ||
223 | return 1; | ||
224 | |||
225 | if (s->ctor) | ||
226 | return 1; | ||
227 | |||
228 | /* | ||
229 | * We may have set a slab to be unmergeable during bootstrap. | ||
230 | */ | ||
231 | if (s->refcount < 0) | ||
232 | return 1; | ||
233 | |||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct kmem_cache *find_mergeable(size_t size, size_t align, | ||
238 | unsigned long flags, const char *name, void (*ctor)(void *)) | ||
239 | { | ||
240 | struct kmem_cache *s; | ||
241 | |||
242 | if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) | ||
243 | return NULL; | ||
244 | |||
245 | if (ctor) | ||
246 | return NULL; | ||
247 | |||
248 | size = ALIGN(size, sizeof(void *)); | ||
249 | align = calculate_alignment(flags, align, size); | ||
250 | size = ALIGN(size, align); | ||
251 | flags = kmem_cache_flags(size, flags, name, NULL); | ||
252 | |||
253 | list_for_each_entry(s, &slab_caches, list) { | ||
254 | if (slab_unmergeable(s)) | ||
255 | continue; | ||
256 | |||
257 | if (size > s->size) | ||
258 | continue; | ||
259 | |||
260 | if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME)) | ||
261 | continue; | ||
262 | /* | ||
263 | * Check if alignment is compatible. | ||
264 | * Courtesy of Adrian Drzewiecki | ||
265 | */ | ||
266 | if ((s->size & ~(align - 1)) != s->size) | ||
267 | continue; | ||
268 | |||
269 | if (s->size - size >= sizeof(void *)) | ||
270 | continue; | ||
271 | |||
272 | return s; | ||
273 | } | ||
274 | return NULL; | ||
275 | } | ||
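find_mergeable() above accepts an existing cache only when it is at least as large as the request, wastes less than one pointer of slack, agrees on the SLAB_MERGE_SAME flags, and is suitably aligned. A distilled userspace predicate for those checks (the flag mask and sizes are illustrative, align is assumed to be a power of two, and the initial ALIGN/calculate_alignment step is omitted):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MERGE_SAME_MASK 0x3u            /* stand-in for SLAB_MERGE_SAME */

struct cache {
        size_t size;
        unsigned int flags;
};

static bool can_merge(const struct cache *s, size_t size, size_t align,
                      unsigned int flags)
{
        if (size > s->size)
                return false;           /* candidate too small */
        if ((flags & MERGE_SAME_MASK) != (s->flags & MERGE_SAME_MASK))
                return false;           /* incompatible flags */
        if (s->size & (align - 1))
                return false;           /* alignment not compatible */
        if (s->size - size >= sizeof(void *))
                return false;           /* too much wasted slack */
        return true;
}

int main(void)
{
        struct cache c = { .size = 32, .flags = 0 };

        printf("%d\n", can_merge(&c, 30, 8, 0));  /* 1: slack under one pointer */
        printf("%d\n", can_merge(&c, 16, 8, 0));  /* 0: would waste 16 bytes */
        return 0;
}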
108 | 276 | ||
109 | /* | 277 | /* |
110 | * Figure out what the alignment of the objects will be given a set of | 278 | * Figure out what the alignment of the objects will be given a set of |
@@ -211,8 +379,10 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
211 | mutex_lock(&slab_mutex); | 379 | mutex_lock(&slab_mutex); |
212 | 380 | ||
213 | err = kmem_cache_sanity_check(name, size); | 381 | err = kmem_cache_sanity_check(name, size); |
214 | if (err) | 382 | if (err) { |
383 | s = NULL; /* suppress uninit var warning */ | ||
215 | goto out_unlock; | 384 | goto out_unlock; |
385 | } | ||
216 | 386 | ||
217 | /* | 387 | /* |
218 | * Some allocators will constraint the set of valid flags to a subset | 388 | * Some allocators will constraint the set of valid flags to a subset |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp) | |||
468 | } | 468 | } |
469 | EXPORT_SYMBOL(__kmalloc); | 469 | EXPORT_SYMBOL(__kmalloc); |
470 | 470 | ||
471 | #ifdef CONFIG_TRACING | ||
472 | void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) | 471 | void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) |
473 | { | 472 | { |
474 | return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); | 473 | return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); |
@@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, | |||
481 | return __do_kmalloc_node(size, gfp, node, caller); | 480 | return __do_kmalloc_node(size, gfp, node, caller); |
482 | } | 481 | } |
483 | #endif | 482 | #endif |
484 | #endif | ||
485 | 483 | ||
486 | void kfree(const void *block) | 484 | void kfree(const void *block) |
487 | { | 485 | { |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) | |||
169 | */ | 169 | */ |
170 | #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | 170 | #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) |
171 | 171 | ||
172 | /* | ||
173 | * Set of flags that will prevent slab merging | ||
174 | */ | ||
175 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
176 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | ||
177 | SLAB_FAILSLAB) | ||
178 | |||
179 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | ||
180 | SLAB_CACHE_DMA | SLAB_NOTRACK) | ||
181 | |||
182 | #define OO_SHIFT 16 | 172 | #define OO_SHIFT 16 |
183 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 173 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
184 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ | 174 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
@@ -1176,7 +1166,7 @@ out: | |||
1176 | 1166 | ||
1177 | __setup("slub_debug", setup_slub_debug); | 1167 | __setup("slub_debug", setup_slub_debug); |
1178 | 1168 | ||
1179 | static unsigned long kmem_cache_flags(unsigned long object_size, | 1169 | unsigned long kmem_cache_flags(unsigned long object_size, |
1180 | unsigned long flags, const char *name, | 1170 | unsigned long flags, const char *name, |
1181 | void (*ctor)(void *)) | 1171 | void (*ctor)(void *)) |
1182 | { | 1172 | { |
@@ -1208,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1208 | struct page *page) {} | 1198 | struct page *page) {} |
1209 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1199 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1210 | struct page *page) {} | 1200 | struct page *page) {} |
1211 | static inline unsigned long kmem_cache_flags(unsigned long object_size, | 1201 | unsigned long kmem_cache_flags(unsigned long object_size, |
1212 | unsigned long flags, const char *name, | 1202 | unsigned long flags, const char *name, |
1213 | void (*ctor)(void *)) | 1203 | void (*ctor)(void *)) |
1214 | { | 1204 | { |
@@ -1699,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, | |||
1699 | struct kmem_cache_cpu *c) | 1689 | struct kmem_cache_cpu *c) |
1700 | { | 1690 | { |
1701 | void *object; | 1691 | void *object; |
1702 | int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; | 1692 | int searchnode = node; |
1693 | |||
1694 | if (node == NUMA_NO_NODE) | ||
1695 | searchnode = numa_mem_id(); | ||
1696 | else if (!node_present_pages(node)) | ||
1697 | searchnode = node_to_mem_node(node); | ||
1703 | 1698 | ||
1704 | object = get_partial_node(s, get_node(s, searchnode), c, flags); | 1699 | object = get_partial_node(s, get_node(s, searchnode), c, flags); |
1705 | if (object || node != NUMA_NO_NODE) | 1700 | if (object || node != NUMA_NO_NODE) |
@@ -2280,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
2280 | redo: | 2275 | redo: |
2281 | 2276 | ||
2282 | if (unlikely(!node_match(page, node))) { | 2277 | if (unlikely(!node_match(page, node))) { |
2283 | stat(s, ALLOC_NODE_MISMATCH); | 2278 | int searchnode = node; |
2284 | deactivate_slab(s, page, c->freelist); | 2279 | |
2285 | c->page = NULL; | 2280 | if (node != NUMA_NO_NODE && !node_present_pages(node)) |
2286 | c->freelist = NULL; | 2281 | searchnode = node_to_mem_node(node); |
2287 | goto new_slab; | 2282 | |
2283 | if (unlikely(!node_match(page, searchnode))) { | ||
2284 | stat(s, ALLOC_NODE_MISMATCH); | ||
2285 | deactivate_slab(s, page, c->freelist); | ||
2286 | c->page = NULL; | ||
2287 | c->freelist = NULL; | ||
2288 | goto new_slab; | ||
2289 | } | ||
2288 | } | 2290 | } |
2289 | 2291 | ||
2290 | /* | 2292 | /* |
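Both SLUB hunks above handle memoryless NUMA nodes: when the requested node has no present pages, allocation falls back to node_to_mem_node() instead of repeatedly deactivating the per-cpu slab on a node mismatch that can never be satisfied. A minimal userspace sketch of the node-selection decision follows; node_present_pages(), node_to_mem_node() and numa_mem_id() are stubbed out purely to make the example runnable, they are not the kernel implementations.

#include <stdio.h>

#define NUMA_NO_NODE (-1)

/* Stand-ins for the kernel helpers, for illustration only. */
static int node_present_pages(int node)  { return node != 2; } /* pretend node 2 is memoryless */
static int node_to_mem_node(int node)    { return 0; }         /* pretend node 0 backs node 2 */
static int numa_mem_id(void)             { return 0; }

/* Pick the node whose partial lists should actually be searched. */
static int pick_search_node(int requested)
{
    if (requested == NUMA_NO_NODE)
        return numa_mem_id();               /* no preference: use the local memory node */
    if (!node_present_pages(requested))
        return node_to_mem_node(requested); /* memoryless node: use its fallback node */
    return requested;
}

int main(void)
{
    printf("request node 2 -> search node %d\n", pick_search_node(2));
    printf("request NUMA_NO_NODE -> search node %d\n", pick_search_node(NUMA_NO_NODE));
    return 0;
}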
@@ -2707,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; | |||
2707 | static int slub_min_objects; | 2709 | static int slub_min_objects; |
2708 | 2710 | ||
2709 | /* | 2711 | /* |
2710 | * Merge control. If this is set then no merging of slab caches will occur. | ||
2711 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
2712 | */ | ||
2713 | static int slub_nomerge; | ||
2714 | |||
2715 | /* | ||
2716 | * Calculate the order of allocation given an slab object size. | 2712 | * Calculate the order of allocation given an slab object size. |
2717 | * | 2713 | * |
2718 | * The order of allocation has significant impact on performance and other | 2714 | * The order of allocation has significant impact on performance and other |
@@ -3240,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str) | |||
3240 | 3236 | ||
3241 | __setup("slub_min_objects=", setup_slub_min_objects); | 3237 | __setup("slub_min_objects=", setup_slub_min_objects); |
3242 | 3238 | ||
3243 | static int __init setup_slub_nomerge(char *str) | ||
3244 | { | ||
3245 | slub_nomerge = 1; | ||
3246 | return 1; | ||
3247 | } | ||
3248 | |||
3249 | __setup("slub_nomerge", setup_slub_nomerge); | ||
3250 | |||
3251 | void *__kmalloc(size_t size, gfp_t flags) | 3239 | void *__kmalloc(size_t size, gfp_t flags) |
3252 | { | 3240 | { |
3253 | struct kmem_cache *s; | 3241 | struct kmem_cache *s; |
@@ -3625,69 +3613,6 @@ void __init kmem_cache_init_late(void) | |||
3625 | { | 3613 | { |
3626 | } | 3614 | } |
3627 | 3615 | ||
3628 | /* | ||
3629 | * Find a mergeable slab cache | ||
3630 | */ | ||
3631 | static int slab_unmergeable(struct kmem_cache *s) | ||
3632 | { | ||
3633 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) | ||
3634 | return 1; | ||
3635 | |||
3636 | if (!is_root_cache(s)) | ||
3637 | return 1; | ||
3638 | |||
3639 | if (s->ctor) | ||
3640 | return 1; | ||
3641 | |||
3642 | /* | ||
3643 | * We may have set a slab to be unmergeable during bootstrap. | ||
3644 | */ | ||
3645 | if (s->refcount < 0) | ||
3646 | return 1; | ||
3647 | |||
3648 | return 0; | ||
3649 | } | ||
3650 | |||
3651 | static struct kmem_cache *find_mergeable(size_t size, size_t align, | ||
3652 | unsigned long flags, const char *name, void (*ctor)(void *)) | ||
3653 | { | ||
3654 | struct kmem_cache *s; | ||
3655 | |||
3656 | if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) | ||
3657 | return NULL; | ||
3658 | |||
3659 | if (ctor) | ||
3660 | return NULL; | ||
3661 | |||
3662 | size = ALIGN(size, sizeof(void *)); | ||
3663 | align = calculate_alignment(flags, align, size); | ||
3664 | size = ALIGN(size, align); | ||
3665 | flags = kmem_cache_flags(size, flags, name, NULL); | ||
3666 | |||
3667 | list_for_each_entry(s, &slab_caches, list) { | ||
3668 | if (slab_unmergeable(s)) | ||
3669 | continue; | ||
3670 | |||
3671 | if (size > s->size) | ||
3672 | continue; | ||
3673 | |||
3674 | if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) | ||
3675 | continue; | ||
3676 | /* | ||
3677 | * Check if alignment is compatible. | ||
3678 | * Courtesy of Adrian Drzewiecki | ||
3679 | */ | ||
3680 | if ((s->size & ~(align - 1)) != s->size) | ||
3681 | continue; | ||
3682 | |||
3683 | if (s->size - size >= sizeof(void *)) | ||
3684 | continue; | ||
3685 | |||
3686 | return s; | ||
3687 | } | ||
3688 | return NULL; | ||
3689 | } | ||
3690 | |||
3691 | struct kmem_cache * | 3616 | struct kmem_cache * |
3692 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 3617 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
3693 | unsigned long flags, void (*ctor)(void *)) | 3618 | unsigned long flags, void (*ctor)(void *)) |
@@ -4604,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf) | |||
4604 | static ssize_t trace_store(struct kmem_cache *s, const char *buf, | 4529 | static ssize_t trace_store(struct kmem_cache *s, const char *buf, |
4605 | size_t length) | 4530 | size_t length) |
4606 | { | 4531 | { |
4532 | /* | ||
4533 | * Tracing a merged cache is going to give confusing results | ||
4534 | * as well as cause other issues like converting a mergeable | ||
4535 | * cache into an unmergeable one. | ||
4536 | */ | ||
4537 | if (s->refcount > 1) | ||
4538 | return -EINVAL; | ||
4539 | |||
4607 | s->flags &= ~SLAB_TRACE; | 4540 | s->flags &= ~SLAB_TRACE; |
4608 | if (buf[0] == '1') { | 4541 | if (buf[0] == '1') { |
4609 | s->flags &= ~__CMPXCHG_DOUBLE; | 4542 | s->flags &= ~__CMPXCHG_DOUBLE; |
@@ -4721,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) | |||
4721 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | 4654 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, |
4722 | size_t length) | 4655 | size_t length) |
4723 | { | 4656 | { |
4657 | if (s->refcount > 1) | ||
4658 | return -EINVAL; | ||
4659 | |||
4724 | s->flags &= ~SLAB_FAILSLAB; | 4660 | s->flags &= ~SLAB_FAILSLAB; |
4725 | if (buf[0] == '1') | 4661 | if (buf[0] == '1') |
4726 | s->flags |= SLAB_FAILSLAB; | 4662 | s->flags |= SLAB_FAILSLAB; |
@@ -887,18 +887,14 @@ void lru_add_drain_all(void) | |||
887 | mutex_unlock(&lock); | 887 | mutex_unlock(&lock); |
888 | } | 888 | } |
889 | 889 | ||
890 | /* | 890 | /** |
891 | * Batched page_cache_release(). Decrement the reference count on all the | 891 | * release_pages - batched page_cache_release() |
892 | * passed pages. If it fell to zero then remove the page from the LRU and | 892 | * @pages: array of pages to release |
893 | * free it. | 893 | * @nr: number of pages |
894 | * | 894 | * @cold: whether the pages are cache cold |
895 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it | ||
896 | * for the remainder of the operation. | ||
897 | * | 895 | * |
898 | * The locking in this function is against shrink_inactive_list(): we recheck | 896 | * Decrement the reference count on all the pages in @pages. If it |
899 | * the page count inside the lock to see whether shrink_inactive_list() | 897 | * fell to zero, remove the page from the LRU and free it. |
900 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() | ||
901 | * will free it. | ||
902 | */ | 898 | */ |
903 | void release_pages(struct page **pages, int nr, bool cold) | 899 | void release_pages(struct page **pages, int nr, bool cold) |
904 | { | 900 | { |
@@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
907 | struct zone *zone = NULL; | 903 | struct zone *zone = NULL; |
908 | struct lruvec *lruvec; | 904 | struct lruvec *lruvec; |
909 | unsigned long uninitialized_var(flags); | 905 | unsigned long uninitialized_var(flags); |
906 | unsigned int uninitialized_var(lock_batch); | ||
910 | 907 | ||
911 | for (i = 0; i < nr; i++) { | 908 | for (i = 0; i < nr; i++) { |
912 | struct page *page = pages[i]; | 909 | struct page *page = pages[i]; |
@@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
920 | continue; | 917 | continue; |
921 | } | 918 | } |
922 | 919 | ||
920 | /* | ||
921 | * Make sure the IRQ-safe lock-holding time does not get | ||
922 | * excessive with a continuous string of pages from the | ||
923 | * same zone. The lock is held only if zone != NULL. | ||
924 | */ | ||
925 | if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { | ||
926 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
927 | zone = NULL; | ||
928 | } | ||
929 | |||
923 | if (!put_page_testzero(page)) | 930 | if (!put_page_testzero(page)) |
924 | continue; | 931 | continue; |
925 | 932 | ||
@@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
930 | if (zone) | 937 | if (zone) |
931 | spin_unlock_irqrestore(&zone->lru_lock, | 938 | spin_unlock_irqrestore(&zone->lru_lock, |
932 | flags); | 939 | flags); |
940 | lock_batch = 0; | ||
933 | zone = pagezone; | 941 | zone = pagezone; |
934 | spin_lock_irqsave(&zone->lru_lock, flags); | 942 | spin_lock_irqsave(&zone->lru_lock, flags); |
935 | } | 943 | } |
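The release_pages() hunks above cap how long the IRQ-disabled zone->lru_lock is held: after SWAP_CLUSTER_MAX consecutive pages from the same zone the lock is dropped, and the batch counter is reset whenever the lock is re-taken for a (possibly different) zone. Here is a hedged userspace sketch of that batching shape using a pthread mutex in place of the zone lock; the struct and BATCH_MAX constant are stand-ins, not kernel definitions.

#include <pthread.h>
#include <stdio.h>

#define BATCH_MAX 32  /* stands in for SWAP_CLUSTER_MAX */

struct zone { pthread_mutex_t lru_lock; int id; };

/* Release a run of items, never holding one zone's lock for more than
 * BATCH_MAX consecutive items. */
static void release_batched(struct zone **zones, int nr)
{
    struct zone *locked = NULL;
    unsigned int lock_batch = 0;

    for (int i = 0; i < nr; i++) {
        struct zone *z = zones[i];

        /* Cap the hold time even when every item is from the same zone. */
        if (locked && ++lock_batch == BATCH_MAX) {
            pthread_mutex_unlock(&locked->lru_lock);
            locked = NULL;
        }

        if (z != locked) {
            if (locked)
                pthread_mutex_unlock(&locked->lru_lock);
            lock_batch = 0;
            locked = z;
            pthread_mutex_lock(&locked->lru_lock);
        }

        /* ... per-item work under the zone lock would go here ... */
    }
    if (locked)
        pthread_mutex_unlock(&locked->lru_lock);
}

int main(void)
{
    struct zone a = { PTHREAD_MUTEX_INITIALIZER, 0 };
    struct zone *pages[100];
    for (int i = 0; i < 100; i++)
        pages[i] = &a;
    release_batched(pages, 100);
    puts("done");
    return 0;
}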
diff --git a/mm/swap_state.c b/mm/swap_state.c index 3e0ec83d000c..154444918685 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -28,7 +28,9 @@ | |||
28 | static const struct address_space_operations swap_aops = { | 28 | static const struct address_space_operations swap_aops = { |
29 | .writepage = swap_writepage, | 29 | .writepage = swap_writepage, |
30 | .set_page_dirty = swap_set_page_dirty, | 30 | .set_page_dirty = swap_set_page_dirty, |
31 | #ifdef CONFIG_MIGRATION | ||
31 | .migratepage = migrate_page, | 32 | .migratepage = migrate_page, |
33 | #endif | ||
32 | }; | 34 | }; |
33 | 35 | ||
34 | static struct backing_dev_info swap_backing_dev_info = { | 36 | static struct backing_dev_info swap_backing_dev_info = { |
@@ -263,18 +265,12 @@ void free_page_and_swap_cache(struct page *page) | |||
263 | void free_pages_and_swap_cache(struct page **pages, int nr) | 265 | void free_pages_and_swap_cache(struct page **pages, int nr) |
264 | { | 266 | { |
265 | struct page **pagep = pages; | 267 | struct page **pagep = pages; |
268 | int i; | ||
266 | 269 | ||
267 | lru_add_drain(); | 270 | lru_add_drain(); |
268 | while (nr) { | 271 | for (i = 0; i < nr; i++) |
269 | int todo = min(nr, PAGEVEC_SIZE); | 272 | free_swap_cache(pagep[i]); |
270 | int i; | 273 | release_pages(pagep, nr, false); |
271 | |||
272 | for (i = 0; i < todo; i++) | ||
273 | free_swap_cache(pagep[i]); | ||
274 | release_pages(pagep, todo, false); | ||
275 | pagep += todo; | ||
276 | nr -= todo; | ||
277 | } | ||
278 | } | 274 | } |
279 | 275 | ||
280 | /* | 276 | /* |
@@ -170,32 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t, | |||
170 | /* | 170 | /* |
171 | * Check if the vma is being used as a stack. | 171 | * Check if the vma is being used as a stack. |
172 | * If is_group is non-zero, check in the entire thread group or else | 172 | * If is_group is non-zero, check in the entire thread group or else |
173 | * just check in the current task. Returns the pid of the task that | 173 | * just check in the current task. Returns the task_struct of the task |
174 | * the vma is stack for. | 174 | * that the vma is stack for. Must be called under rcu_read_lock(). |
175 | */ | 175 | */ |
176 | pid_t vm_is_stack(struct task_struct *task, | 176 | struct task_struct *task_of_stack(struct task_struct *task, |
177 | struct vm_area_struct *vma, int in_group) | 177 | struct vm_area_struct *vma, bool in_group) |
178 | { | 178 | { |
179 | pid_t ret = 0; | ||
180 | |||
181 | if (vm_is_stack_for_task(task, vma)) | 179 | if (vm_is_stack_for_task(task, vma)) |
182 | return task->pid; | 180 | return task; |
183 | 181 | ||
184 | if (in_group) { | 182 | if (in_group) { |
185 | struct task_struct *t; | 183 | struct task_struct *t; |
186 | 184 | ||
187 | rcu_read_lock(); | ||
188 | for_each_thread(task, t) { | 185 | for_each_thread(task, t) { |
189 | if (vm_is_stack_for_task(t, vma)) { | 186 | if (vm_is_stack_for_task(t, vma)) |
190 | ret = t->pid; | 187 | return t; |
191 | goto done; | ||
192 | } | ||
193 | } | 188 | } |
194 | done: | ||
195 | rcu_read_unlock(); | ||
196 | } | 189 | } |
197 | 190 | ||
198 | return ret; | 191 | return NULL; |
199 | } | 192 | } |
200 | 193 | ||
201 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | 194 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
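The util.c change converts vm_is_stack() into task_of_stack(): instead of copying out a pid under its own rcu_read_lock(), it now returns a struct task_struct pointer and requires the caller to hold rcu_read_lock() across both the call and any use of the result. The userspace analogy below models that contract with a reader lock; all names are invented and it is only meant to show where the locking responsibility now lives.

#include <pthread.h>
#include <stdio.h>

/* Toy "task table" protected by a reader/writer lock; this stands in for
 * RCU-protected task structures and is purely illustrative. */
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static const char *tasks[] = { "init", "kthreadd", "kswapd0" };

/* Like task_of_stack() after the patch: returns a pointer that is only
 * valid while the caller holds the read lock; the helper takes no lock. */
static const char *lookup_task(int idx)
{
    return (idx >= 0 && idx < 3) ? tasks[idx] : NULL;
}

int main(void)
{
    pthread_rwlock_rdlock(&table_lock);   /* caller's job, like rcu_read_lock() */
    const char *name = lookup_task(2);
    if (name)
        printf("vma belongs to: %s\n", name);
    pthread_rwlock_unlock(&table_lock);   /* result must not be used past here */
    return 0;
}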
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2b0aa5486092..90520af7f186 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = { | |||
2646 | 2646 | ||
2647 | static int vmalloc_open(struct inode *inode, struct file *file) | 2647 | static int vmalloc_open(struct inode *inode, struct file *file) |
2648 | { | 2648 | { |
2649 | unsigned int *ptr = NULL; | 2649 | if (IS_ENABLED(CONFIG_NUMA)) |
2650 | int ret; | 2650 | return seq_open_private(file, &vmalloc_op, |
2651 | 2651 | nr_node_ids * sizeof(unsigned int)); | |
2652 | if (IS_ENABLED(CONFIG_NUMA)) { | 2652 | else |
2653 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | 2653 | return seq_open(file, &vmalloc_op); |
2654 | if (ptr == NULL) | ||
2655 | return -ENOMEM; | ||
2656 | } | ||
2657 | ret = seq_open(file, &vmalloc_op); | ||
2658 | if (!ret) { | ||
2659 | struct seq_file *m = file->private_data; | ||
2660 | m->private = ptr; | ||
2661 | } else | ||
2662 | kfree(ptr); | ||
2663 | return ret; | ||
2664 | } | 2654 | } |
2665 | 2655 | ||
2666 | static const struct file_operations proc_vmalloc_operations = { | 2656 | static const struct file_operations proc_vmalloc_operations = { |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 2836b5373b2e..dcb47074ae03 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
920 | /* Case 1 above */ | 920 | /* Case 1 above */ |
921 | if (current_is_kswapd() && | 921 | if (current_is_kswapd() && |
922 | PageReclaim(page) && | 922 | PageReclaim(page) && |
923 | zone_is_reclaim_writeback(zone)) { | 923 | test_bit(ZONE_WRITEBACK, &zone->flags)) { |
924 | nr_immediate++; | 924 | nr_immediate++; |
925 | goto keep_locked; | 925 | goto keep_locked; |
926 | 926 | ||
@@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1002 | */ | 1002 | */ |
1003 | if (page_is_file_cache(page) && | 1003 | if (page_is_file_cache(page) && |
1004 | (!current_is_kswapd() || | 1004 | (!current_is_kswapd() || |
1005 | !zone_is_reclaim_dirty(zone))) { | 1005 | !test_bit(ZONE_DIRTY, &zone->flags))) { |
1006 | /* | 1006 | /* |
1007 | * Immediately reclaim when written back. | 1007 | * Immediately reclaim when written back. |
1008 | * Similar in principal to deactivate_page() | 1008 | * Similar in principal to deactivate_page() |
@@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1563 | * are encountered in the nr_immediate check below. | 1563 | * are encountered in the nr_immediate check below. |
1564 | */ | 1564 | */ |
1565 | if (nr_writeback && nr_writeback == nr_taken) | 1565 | if (nr_writeback && nr_writeback == nr_taken) |
1566 | zone_set_flag(zone, ZONE_WRITEBACK); | 1566 | set_bit(ZONE_WRITEBACK, &zone->flags); |
1567 | 1567 | ||
1568 | /* | 1568 | /* |
1569 | * memcg will stall in page writeback so only consider forcibly | 1569 | * memcg will stall in page writeback so only consider forcibly |
@@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1575 | * backed by a congested BDI and wait_iff_congested will stall. | 1575 | * backed by a congested BDI and wait_iff_congested will stall. |
1576 | */ | 1576 | */ |
1577 | if (nr_dirty && nr_dirty == nr_congested) | 1577 | if (nr_dirty && nr_dirty == nr_congested) |
1578 | zone_set_flag(zone, ZONE_CONGESTED); | 1578 | set_bit(ZONE_CONGESTED, &zone->flags); |
1579 | 1579 | ||
1580 | /* | 1580 | /* |
1581 | * If dirty pages are scanned that are not queued for IO, it | 1581 | * If dirty pages are scanned that are not queued for IO, it |
1582 | * implies that flushers are not keeping up. In this case, flag | 1582 | * implies that flushers are not keeping up. In this case, flag |
1583 | * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing | 1583 | * the zone ZONE_DIRTY and kswapd will start writing pages from |
1584 | * pages from reclaim context. | 1584 | * reclaim context. |
1585 | */ | 1585 | */ |
1586 | if (nr_unqueued_dirty == nr_taken) | 1586 | if (nr_unqueued_dirty == nr_taken) |
1587 | zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); | 1587 | set_bit(ZONE_DIRTY, &zone->flags); |
1588 | 1588 | ||
1589 | /* | 1589 | /* |
1590 | * If kswapd scans pages marked for immediate | 1590 |
@@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2315 | return reclaimable; | 2315 | return reclaimable; |
2316 | } | 2316 | } |
2317 | 2317 | ||
2318 | /* Returns true if compaction should go ahead for a high-order request */ | 2318 | /* |
2319 | * Returns true if compaction should go ahead for a high-order request, or | ||
2320 | * the high-order allocation would succeed without compaction. | ||
2321 | */ | ||
2319 | static inline bool compaction_ready(struct zone *zone, int order) | 2322 | static inline bool compaction_ready(struct zone *zone, int order) |
2320 | { | 2323 | { |
2321 | unsigned long balance_gap, watermark; | 2324 | unsigned long balance_gap, watermark; |
@@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order) | |||
2339 | if (compaction_deferred(zone, order)) | 2342 | if (compaction_deferred(zone, order)) |
2340 | return watermark_ok; | 2343 | return watermark_ok; |
2341 | 2344 | ||
2342 | /* If compaction is not ready to start, keep reclaiming */ | 2345 | /* |
2343 | if (!compaction_suitable(zone, order)) | 2346 | * If compaction is not ready to start and allocation is not likely |
2347 | * to succeed without it, then keep reclaiming. | ||
2348 | */ | ||
2349 | if (compaction_suitable(zone, order) == COMPACT_SKIPPED) | ||
2344 | return false; | 2350 | return false; |
2345 | 2351 | ||
2346 | return watermark_ok; | 2352 | return watermark_ok; |
@@ -2753,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2753 | } | 2759 | } |
2754 | 2760 | ||
2755 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | 2761 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, |
2762 | unsigned long nr_pages, | ||
2756 | gfp_t gfp_mask, | 2763 | gfp_t gfp_mask, |
2757 | bool noswap) | 2764 | bool may_swap) |
2758 | { | 2765 | { |
2759 | struct zonelist *zonelist; | 2766 | struct zonelist *zonelist; |
2760 | unsigned long nr_reclaimed; | 2767 | unsigned long nr_reclaimed; |
2761 | int nid; | 2768 | int nid; |
2762 | struct scan_control sc = { | 2769 | struct scan_control sc = { |
2763 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2770 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
2764 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2771 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2765 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2772 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
2766 | .target_mem_cgroup = memcg, | 2773 | .target_mem_cgroup = memcg, |
2767 | .priority = DEF_PRIORITY, | 2774 | .priority = DEF_PRIORITY, |
2768 | .may_writepage = !laptop_mode, | 2775 | .may_writepage = !laptop_mode, |
2769 | .may_unmap = 1, | 2776 | .may_unmap = 1, |
2770 | .may_swap = !noswap, | 2777 | .may_swap = may_swap, |
2771 | }; | 2778 | }; |
2772 | 2779 | ||
2773 | /* | 2780 | /* |
@@ -2818,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2818 | return false; | 2825 | return false; |
2819 | 2826 | ||
2820 | if (IS_ENABLED(CONFIG_COMPACTION) && order && | 2827 | if (IS_ENABLED(CONFIG_COMPACTION) && order && |
2821 | !compaction_suitable(zone, order)) | 2828 | compaction_suitable(zone, order) == COMPACT_SKIPPED) |
2822 | return false; | 2829 | return false; |
2823 | 2830 | ||
2824 | return true; | 2831 | return true; |
@@ -2978,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2978 | /* Account for the number of pages attempted to reclaim */ | 2985 | /* Account for the number of pages attempted to reclaim */ |
2979 | *nr_attempted += sc->nr_to_reclaim; | 2986 | *nr_attempted += sc->nr_to_reclaim; |
2980 | 2987 | ||
2981 | zone_clear_flag(zone, ZONE_WRITEBACK); | 2988 | clear_bit(ZONE_WRITEBACK, &zone->flags); |
2982 | 2989 | ||
2983 | /* | 2990 | /* |
2984 | * If a zone reaches its high watermark, consider it to be no longer | 2991 | * If a zone reaches its high watermark, consider it to be no longer |
@@ -2988,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2988 | */ | 2995 | */ |
2989 | if (zone_reclaimable(zone) && | 2996 | if (zone_reclaimable(zone) && |
2990 | zone_balanced(zone, testorder, 0, classzone_idx)) { | 2997 | zone_balanced(zone, testorder, 0, classzone_idx)) { |
2991 | zone_clear_flag(zone, ZONE_CONGESTED); | 2998 | clear_bit(ZONE_CONGESTED, &zone->flags); |
2992 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | 2999 | clear_bit(ZONE_DIRTY, &zone->flags); |
2993 | } | 3000 | } |
2994 | 3001 | ||
2995 | return sc->nr_scanned >= sc->nr_to_reclaim; | 3002 | return sc->nr_scanned >= sc->nr_to_reclaim; |
@@ -3080,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3080 | * If balanced, clear the dirty and congested | 3087 | * If balanced, clear the dirty and congested |
3081 | * flags | 3088 | * flags |
3082 | */ | 3089 | */ |
3083 | zone_clear_flag(zone, ZONE_CONGESTED); | 3090 | clear_bit(ZONE_CONGESTED, &zone->flags); |
3084 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | 3091 | clear_bit(ZONE_DIRTY, &zone->flags); |
3085 | } | 3092 | } |
3086 | } | 3093 | } |
3087 | 3094 | ||
@@ -3708,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3708 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 3715 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
3709 | return ZONE_RECLAIM_NOSCAN; | 3716 | return ZONE_RECLAIM_NOSCAN; |
3710 | 3717 | ||
3711 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 3718 | if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) |
3712 | return ZONE_RECLAIM_NOSCAN; | 3719 | return ZONE_RECLAIM_NOSCAN; |
3713 | 3720 | ||
3714 | ret = __zone_reclaim(zone, gfp_mask, order); | 3721 | ret = __zone_reclaim(zone, gfp_mask, order); |
3715 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 3722 | clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); |
3716 | 3723 | ||
3717 | if (!ret) | 3724 | if (!ret) |
3718 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | 3725 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); |
@@ -3791,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3791 | } | 3798 | } |
3792 | } | 3799 | } |
3793 | #endif /* CONFIG_SHMEM */ | 3800 | #endif /* CONFIG_SHMEM */ |
3794 | |||
3795 | static void warn_scan_unevictable_pages(void) | ||
3796 | { | ||
3797 | printk_once(KERN_WARNING | ||
3798 | "%s: The scan_unevictable_pages sysctl/node-interface has been " | ||
3799 | "disabled for lack of a legitimate use case. If you have " | ||
3800 | "one, please send an email to linux-mm@kvack.org.\n", | ||
3801 | current->comm); | ||
3802 | } | ||
3803 | |||
3804 | /* | ||
3805 | * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of | ||
3806 | * all nodes' unevictable lists for evictable pages | ||
3807 | */ | ||
3808 | unsigned long scan_unevictable_pages; | ||
3809 | |||
3810 | int scan_unevictable_handler(struct ctl_table *table, int write, | ||
3811 | void __user *buffer, | ||
3812 | size_t *length, loff_t *ppos) | ||
3813 | { | ||
3814 | warn_scan_unevictable_pages(); | ||
3815 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | ||
3816 | scan_unevictable_pages = 0; | ||
3817 | return 0; | ||
3818 | } | ||
3819 | |||
3820 | #ifdef CONFIG_NUMA | ||
3821 | /* | ||
3822 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of | ||
3823 | * a specified node's per zone unevictable lists for evictable pages. | ||
3824 | */ | ||
3825 | |||
3826 | static ssize_t read_scan_unevictable_node(struct device *dev, | ||
3827 | struct device_attribute *attr, | ||
3828 | char *buf) | ||
3829 | { | ||
3830 | warn_scan_unevictable_pages(); | ||
3831 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | ||
3832 | } | ||
3833 | |||
3834 | static ssize_t write_scan_unevictable_node(struct device *dev, | ||
3835 | struct device_attribute *attr, | ||
3836 | const char *buf, size_t count) | ||
3837 | { | ||
3838 | warn_scan_unevictable_pages(); | ||
3839 | return 1; | ||
3840 | } | ||
3841 | |||
3842 | |||
3843 | static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, | ||
3844 | read_scan_unevictable_node, | ||
3845 | write_scan_unevictable_node); | ||
3846 | |||
3847 | int scan_unevictable_register_node(struct node *node) | ||
3848 | { | ||
3849 | return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); | ||
3850 | } | ||
3851 | |||
3852 | void scan_unevictable_unregister_node(struct node *node) | ||
3853 | { | ||
3854 | device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); | ||
3855 | } | ||
3856 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index e9ab104b956f..1b12d390dc68 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -7,6 +7,7 @@ | |||
7 | * zoned VM statistics | 7 | * zoned VM statistics |
8 | * Copyright (C) 2006 Silicon Graphics, Inc., | 8 | * Copyright (C) 2006 Silicon Graphics, Inc., |
9 | * Christoph Lameter <christoph@lameter.com> | 9 | * Christoph Lameter <christoph@lameter.com> |
10 | * Copyright (C) 2008-2014 Christoph Lameter | ||
10 | */ | 11 | */ |
11 | #include <linux/fs.h> | 12 | #include <linux/fs.h> |
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
@@ -14,6 +15,7 @@ | |||
14 | #include <linux/module.h> | 15 | #include <linux/module.h> |
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
18 | #include <linux/cpumask.h> | ||
17 | #include <linux/vmstat.h> | 19 | #include <linux/vmstat.h> |
18 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
19 | #include <linux/math64.h> | 21 | #include <linux/math64.h> |
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
419 | EXPORT_SYMBOL(dec_zone_page_state); | 421 | EXPORT_SYMBOL(dec_zone_page_state); |
420 | #endif | 422 | #endif |
421 | 423 | ||
422 | static inline void fold_diff(int *diff) | 424 | |
425 | /* | ||
426 | * Fold a differential into the global counters. | ||
427 | * Returns the number of counters updated. | ||
428 | */ | ||
429 | static int fold_diff(int *diff) | ||
423 | { | 430 | { |
424 | int i; | 431 | int i; |
432 | int changes = 0; | ||
425 | 433 | ||
426 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 434 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
427 | if (diff[i]) | 435 | if (diff[i]) { |
428 | atomic_long_add(diff[i], &vm_stat[i]); | 436 | atomic_long_add(diff[i], &vm_stat[i]); |
437 | changes++; | ||
438 | } | ||
439 | return changes; | ||
429 | } | 440 | } |
430 | 441 | ||
431 | /* | 442 | /* |
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff) | |||
441 | * statistics in the remote zone struct as well as the global cachelines | 452 | * statistics in the remote zone struct as well as the global cachelines |
442 | * with the global counters. These could cause remote node cache line | 453 | * with the global counters. These could cause remote node cache line |
443 | * bouncing and will have to be only done when necessary. | 454 | * bouncing and will have to be only done when necessary. |
455 | * | ||
456 | * The function returns the number of global counters updated. | ||
444 | */ | 457 | */ |
445 | static void refresh_cpu_vm_stats(void) | 458 | static int refresh_cpu_vm_stats(void) |
446 | { | 459 | { |
447 | struct zone *zone; | 460 | struct zone *zone; |
448 | int i; | 461 | int i; |
449 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; | 462 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; |
463 | int changes = 0; | ||
450 | 464 | ||
451 | for_each_populated_zone(zone) { | 465 | for_each_populated_zone(zone) { |
452 | struct per_cpu_pageset __percpu *p = zone->pageset; | 466 | struct per_cpu_pageset __percpu *p = zone->pageset; |
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void) | |||
486 | continue; | 500 | continue; |
487 | } | 501 | } |
488 | 502 | ||
489 | |||
490 | if (__this_cpu_dec_return(p->expire)) | 503 | if (__this_cpu_dec_return(p->expire)) |
491 | continue; | 504 | continue; |
492 | 505 | ||
493 | if (__this_cpu_read(p->pcp.count)) | 506 | if (__this_cpu_read(p->pcp.count)) { |
494 | drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); | 507 | drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); |
508 | changes++; | ||
509 | } | ||
495 | #endif | 510 | #endif |
496 | } | 511 | } |
497 | fold_diff(global_diff); | 512 | changes += fold_diff(global_diff); |
513 | return changes; | ||
498 | } | 514 | } |
499 | 515 | ||
500 | /* | 516 | /* |
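fold_diff() and refresh_cpu_vm_stats() now report how many counters actually changed, which later lets vmstat_update() decide whether to keep its per-cpu work scheduled. A small self-contained sketch of that fold-and-count idea, with a made-up four-entry stat array:

#include <stdio.h>

#define NR_ITEMS 4

static long global_stat[NR_ITEMS];

/* Fold a differential array into the globals; return how many entries changed. */
static int fold_diff(int *diff)
{
    int changes = 0;
    for (int i = 0; i < NR_ITEMS; i++)
        if (diff[i]) {
            global_stat[i] += diff[i];
            diff[i] = 0;   /* zeroed so the demo's second call reports no activity */
            changes++;
        }
    return changes;
}

int main(void)
{
    int diff[NR_ITEMS] = { 0, 3, 0, -1 };
    /* A nonzero return tells the caller that activity is still happening
     * and the periodic work should stay armed. */
    printf("changes: %d\n", fold_diff(diff));   /* 2 */
    printf("changes: %d\n", fold_diff(diff));   /* 0: quiesced */
    return 0;
}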
@@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
735 | TEXT_FOR_HIGHMEM(xx) xx "_movable", | 751 | TEXT_FOR_HIGHMEM(xx) xx "_movable", |
736 | 752 | ||
737 | const char * const vmstat_text[] = { | 753 | const char * const vmstat_text[] = { |
738 | /* Zoned VM counters */ | 754 | /* enum zone_stat_item counters */ |
739 | "nr_free_pages", | 755 | "nr_free_pages", |
740 | "nr_alloc_batch", | 756 | "nr_alloc_batch", |
741 | "nr_inactive_anon", | 757 | "nr_inactive_anon", |
@@ -778,10 +794,13 @@ const char * const vmstat_text[] = { | |||
778 | "workingset_nodereclaim", | 794 | "workingset_nodereclaim", |
779 | "nr_anon_transparent_hugepages", | 795 | "nr_anon_transparent_hugepages", |
780 | "nr_free_cma", | 796 | "nr_free_cma", |
797 | |||
798 | /* enum writeback_stat_item counters */ | ||
781 | "nr_dirty_threshold", | 799 | "nr_dirty_threshold", |
782 | "nr_dirty_background_threshold", | 800 | "nr_dirty_background_threshold", |
783 | 801 | ||
784 | #ifdef CONFIG_VM_EVENT_COUNTERS | 802 | #ifdef CONFIG_VM_EVENT_COUNTERS |
803 | /* enum vm_event_item counters */ | ||
785 | "pgpgin", | 804 | "pgpgin", |
786 | "pgpgout", | 805 | "pgpgout", |
787 | "pswpin", | 806 | "pswpin", |
@@ -860,6 +879,13 @@ const char * const vmstat_text[] = { | |||
860 | "thp_zero_page_alloc", | 879 | "thp_zero_page_alloc", |
861 | "thp_zero_page_alloc_failed", | 880 | "thp_zero_page_alloc_failed", |
862 | #endif | 881 | #endif |
882 | #ifdef CONFIG_MEMORY_BALLOON | ||
883 | "balloon_inflate", | ||
884 | "balloon_deflate", | ||
885 | #ifdef CONFIG_BALLOON_COMPACTION | ||
886 | "balloon_migrate", | ||
887 | #endif | ||
888 | #endif /* CONFIG_MEMORY_BALLOON */ | ||
863 | #ifdef CONFIG_DEBUG_TLBFLUSH | 889 | #ifdef CONFIG_DEBUG_TLBFLUSH |
864 | #ifdef CONFIG_SMP | 890 | #ifdef CONFIG_SMP |
865 | "nr_tlb_remote_flush", | 891 | "nr_tlb_remote_flush", |
@@ -1229,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = { | |||
1229 | #ifdef CONFIG_SMP | 1255 | #ifdef CONFIG_SMP |
1230 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); | 1256 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); |
1231 | int sysctl_stat_interval __read_mostly = HZ; | 1257 | int sysctl_stat_interval __read_mostly = HZ; |
1258 | static cpumask_var_t cpu_stat_off; | ||
1232 | 1259 | ||
1233 | static void vmstat_update(struct work_struct *w) | 1260 | static void vmstat_update(struct work_struct *w) |
1234 | { | 1261 | { |
1235 | refresh_cpu_vm_stats(); | 1262 | if (refresh_cpu_vm_stats()) |
1236 | schedule_delayed_work(this_cpu_ptr(&vmstat_work), | 1263 | /* |
1264 | * Counters were updated so we expect more updates | ||
1265 | * to occur in the future. Keep on running the | ||
1266 | * update worker thread. | ||
1267 | */ | ||
1268 | schedule_delayed_work(this_cpu_ptr(&vmstat_work), | ||
1269 | round_jiffies_relative(sysctl_stat_interval)); | ||
1270 | else { | ||
1271 | /* | ||
1272 | * We did not update any counters so the app may be in | ||
1273 | * a mode where it does not cause counter updates. | ||
1274 | * We may be uselessly running vmstat_update. | ||
1275 | * Defer the checking for differentials to the | ||
1276 | * shepherd thread on a different processor. | ||
1277 | */ | ||
1278 | int r; | ||
1279 | /* | ||
1280 | * Shepherd work thread does not race since it never | ||
1281 | * changes the bit if it is zero, but the cpu | ||
1282 | * online/offline code may race if | ||
1283 | * worker threads are still allowed during | ||
1284 | * shutdown / startup. | ||
1285 | */ | ||
1286 | r = cpumask_test_and_set_cpu(smp_processor_id(), | ||
1287 | cpu_stat_off); | ||
1288 | VM_BUG_ON(r); | ||
1289 | } | ||
1290 | } | ||
1291 | |||
1292 | /* | ||
1293 | * Check if the diffs for a certain cpu indicate that | ||
1294 | * an update is needed. | ||
1295 | */ | ||
1296 | static bool need_update(int cpu) | ||
1297 | { | ||
1298 | struct zone *zone; | ||
1299 | |||
1300 | for_each_populated_zone(zone) { | ||
1301 | struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); | ||
1302 | |||
1303 | BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); | ||
1304 | /* | ||
1305 | * The fast way of checking if there are any vmstat diffs. | ||
1306 | * This works because the diffs are byte sized items. | ||
1307 | */ | ||
1308 | if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) | ||
1309 | return true; | ||
1310 | |||
1311 | } | ||
1312 | return false; | ||
1313 | } | ||
1314 | |||
1315 | |||
1316 | /* | ||
1317 | * Shepherd worker thread that checks the | ||
1318 | * differentials of processors that have their worker | ||
1319 | * threads for vm statistics updates disabled because of | ||
1320 | * inactivity. | ||
1321 | */ | ||
1322 | static void vmstat_shepherd(struct work_struct *w); | ||
1323 | |||
1324 | static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); | ||
1325 | |||
1326 | static void vmstat_shepherd(struct work_struct *w) | ||
1327 | { | ||
1328 | int cpu; | ||
1329 | |||
1330 | get_online_cpus(); | ||
1331 | /* Check processors whose vmstat worker threads have been disabled */ | ||
1332 | for_each_cpu(cpu, cpu_stat_off) | ||
1333 | if (need_update(cpu) && | ||
1334 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) | ||
1335 | |||
1336 | schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), | ||
1337 | __round_jiffies_relative(sysctl_stat_interval, cpu)); | ||
1338 | |||
1339 | put_online_cpus(); | ||
1340 | |||
1341 | schedule_delayed_work(&shepherd, | ||
1237 | round_jiffies_relative(sysctl_stat_interval)); | 1342 | round_jiffies_relative(sysctl_stat_interval)); |
1343 | |||
1238 | } | 1344 | } |
1239 | 1345 | ||
1240 | static void start_cpu_timer(int cpu) | 1346 | static void __init start_shepherd_timer(void) |
1241 | { | 1347 | { |
1242 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); | 1348 | int cpu; |
1349 | |||
1350 | for_each_possible_cpu(cpu) | ||
1351 | INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), | ||
1352 | vmstat_update); | ||
1353 | |||
1354 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) | ||
1355 | BUG(); | ||
1356 | cpumask_copy(cpu_stat_off, cpu_online_mask); | ||
1243 | 1357 | ||
1244 | INIT_DEFERRABLE_WORK(work, vmstat_update); | 1358 | schedule_delayed_work(&shepherd, |
1245 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); | 1359 | round_jiffies_relative(sysctl_stat_interval)); |
1246 | } | 1360 | } |
1247 | 1361 | ||
1248 | static void vmstat_cpu_dead(int node) | 1362 | static void vmstat_cpu_dead(int node) |
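need_update() above exploits the fact that each per-cpu vm_stat_diff entry is a single byte, so one memchr_inv() scan answers "does this CPU have any pending differential?" without walking the items. memchr_inv() is a kernel helper; the sketch below models the same all-zero check in portable C with memcmp() against a zero buffer, and the array size is an assumed stand-in for NR_VM_ZONE_STAT_ITEMS.

#include <stdio.h>
#include <string.h>

#define NR_ITEMS 41  /* stand-in for NR_VM_ZONE_STAT_ITEMS; the real value is config dependent */

/* Model of a per-cpu stat-diff array: one signed byte per counter. */
static signed char vm_stat_diff[NR_ITEMS];

/* Portable equivalent of the kernel's memchr_inv(buf, 0, len) != NULL:
 * returns 1 if any byte differs from 0. */
static int any_nonzero(const void *buf, size_t len)
{
    static const signed char zeros[NR_ITEMS];
    return memcmp(buf, zeros, len) != 0;
}

int main(void)
{
    printf("needs update? %d\n", any_nonzero(vm_stat_diff, sizeof(vm_stat_diff)));
    vm_stat_diff[7] = -3;   /* a counter drifted on this cpu */
    printf("needs update? %d\n", any_nonzero(vm_stat_diff, sizeof(vm_stat_diff)));
    return 0;
}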
@@ -1273,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, | |||
1273 | case CPU_ONLINE: | 1387 | case CPU_ONLINE: |
1274 | case CPU_ONLINE_FROZEN: | 1388 | case CPU_ONLINE_FROZEN: |
1275 | refresh_zone_stat_thresholds(); | 1389 | refresh_zone_stat_thresholds(); |
1276 | start_cpu_timer(cpu); | ||
1277 | node_set_state(cpu_to_node(cpu), N_CPU); | 1390 | node_set_state(cpu_to_node(cpu), N_CPU); |
1391 | cpumask_set_cpu(cpu, cpu_stat_off); | ||
1278 | break; | 1392 | break; |
1279 | case CPU_DOWN_PREPARE: | 1393 | case CPU_DOWN_PREPARE: |
1280 | case CPU_DOWN_PREPARE_FROZEN: | 1394 | case CPU_DOWN_PREPARE_FROZEN: |
1281 | cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); | 1395 | cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); |
1282 | per_cpu(vmstat_work, cpu).work.func = NULL; | 1396 | cpumask_clear_cpu(cpu, cpu_stat_off); |
1283 | break; | 1397 | break; |
1284 | case CPU_DOWN_FAILED: | 1398 | case CPU_DOWN_FAILED: |
1285 | case CPU_DOWN_FAILED_FROZEN: | 1399 | case CPU_DOWN_FAILED_FROZEN: |
1286 | start_cpu_timer(cpu); | 1400 | cpumask_set_cpu(cpu, cpu_stat_off); |
1287 | break; | 1401 | break; |
1288 | case CPU_DEAD: | 1402 | case CPU_DEAD: |
1289 | case CPU_DEAD_FROZEN: | 1403 | case CPU_DEAD_FROZEN: |
@@ -1303,15 +1417,10 @@ static struct notifier_block vmstat_notifier = | |||
1303 | static int __init setup_vmstat(void) | 1417 | static int __init setup_vmstat(void) |
1304 | { | 1418 | { |
1305 | #ifdef CONFIG_SMP | 1419 | #ifdef CONFIG_SMP |
1306 | int cpu; | ||
1307 | |||
1308 | cpu_notifier_register_begin(); | 1420 | cpu_notifier_register_begin(); |
1309 | __register_cpu_notifier(&vmstat_notifier); | 1421 | __register_cpu_notifier(&vmstat_notifier); |
1310 | 1422 | ||
1311 | for_each_online_cpu(cpu) { | 1423 | start_shepherd_timer(); |
1312 | start_cpu_timer(cpu); | ||
1313 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
1314 | } | ||
1315 | cpu_notifier_register_done(); | 1424 | cpu_notifier_register_done(); |
1316 | #endif | 1425 | #endif |
1317 | #ifdef CONFIG_PROC_FS | 1426 | #ifdef CONFIG_PROC_FS |
@@ -60,15 +60,17 @@ | |||
60 | * NCHUNKS_ORDER determines the internal allocation granularity, effectively | 60 | * NCHUNKS_ORDER determines the internal allocation granularity, effectively |
61 | * adjusting internal fragmentation. It also determines the number of | 61 | * adjusting internal fragmentation. It also determines the number of |
62 | * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the | 62 | * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the |
63 | * allocation granularity will be in chunks of size PAGE_SIZE/64, and there | 63 | * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk |
64 | will be 64 freelists per pool. | 64 | in each allocated page is occupied by the zbud header, NCHUNKS works out to |
65 | 63, the maximum number of free chunks in a zbud page; there will also be | ||
66 | * 63 freelists per pool. | ||
65 | */ | 67 | */ |
66 | #define NCHUNKS_ORDER 6 | 68 | #define NCHUNKS_ORDER 6 |
67 | 69 | ||
68 | #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) | 70 | #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) |
69 | #define CHUNK_SIZE (1 << CHUNK_SHIFT) | 71 | #define CHUNK_SIZE (1 << CHUNK_SHIFT) |
70 | #define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) | ||
71 | #define ZHDR_SIZE_ALIGNED CHUNK_SIZE | 72 | #define ZHDR_SIZE_ALIGNED CHUNK_SIZE |
73 | #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) | ||
72 | 74 | ||
73 | /** | 75 | /** |
74 | * struct zbud_pool - stores metadata for each zbud pool | 76 | * struct zbud_pool - stores metadata for each zbud pool |
@@ -195,6 +197,7 @@ static struct zpool_driver zbud_zpool_driver = { | |||
195 | .total_size = zbud_zpool_total_size, | 197 | .total_size = zbud_zpool_total_size, |
196 | }; | 198 | }; |
197 | 199 | ||
200 | MODULE_ALIAS("zpool-zbud"); | ||
198 | #endif /* CONFIG_ZPOOL */ | 201 | #endif /* CONFIG_ZPOOL */ |
199 | 202 | ||
200 | /***************** | 203 | /***************** |
@@ -267,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr) | |||
267 | { | 270 | { |
268 | /* | 271 | /* |
269 | * Rather than branch for different situations, just use the fact that | 272 | * Rather than branch for different situations, just use the fact that |
270 | * free buddies have a length of zero to simplify everything. -1 at the | 273 | * free buddies have a length of zero to simplify everything. |
271 | * end for the zbud header. | ||
272 | */ | 274 | */ |
273 | return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; | 275 | return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; |
274 | } | 276 | } |
275 | 277 | ||
276 | /***************** | 278 | /***************** |
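For a 4 KiB page (an assumption for this example, not something the patch fixes), the reworked zbud macros evaluate as follows, which is why num_free_chunks() no longer subtracts one for the header:

#include <stdio.h>

#define PAGE_SHIFT 12                 /* assume 4 KiB pages for this example */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

#define NCHUNKS_ORDER 6
#define CHUNK_SHIFT   (PAGE_SHIFT - NCHUNKS_ORDER)               /* 12 - 6 = 6 */
#define CHUNK_SIZE    (1UL << CHUNK_SHIFT)                       /* 64 bytes   */
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE                             /* header uses one chunk */
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) /* (4096-64)/64 = 63 */

int main(void)
{
    printf("CHUNK_SIZE = %lu\n", CHUNK_SIZE);   /* 64 */
    printf("NCHUNKS    = %lu\n", NCHUNKS);      /* 63: the header chunk is already excluded */
    return 0;
}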
diff --git a/mm/zpool.c b/mm/zpool.c index e40612a1df00..739cdf0d183a 100644 --- a/mm/zpool.c +++ b/mm/zpool.c | |||
@@ -150,7 +150,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | |||
150 | driver = zpool_get_driver(type); | 150 | driver = zpool_get_driver(type); |
151 | 151 | ||
152 | if (!driver) { | 152 | if (!driver) { |
153 | request_module(type); | 153 | request_module("zpool-%s", type); |
154 | driver = zpool_get_driver(type); | 154 | driver = zpool_get_driver(type); |
155 | } | 155 | } |
156 | 156 | ||
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 4e2fc83cb394..839a48c3ca27 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -175,7 +175,7 @@ enum fullness_group { | |||
175 | * n <= N / f, where | 175 | * n <= N / f, where |
176 | * n = number of allocated objects | 176 | * n = number of allocated objects |
177 | * N = total number of objects zspage can store | 177 | * N = total number of objects zspage can store |
178 | * f = 1/fullness_threshold_frac | 178 | * f = fullness_threshold_frac |
179 | * | 179 | * |
180 | * Similarly, we assign zspage to: | 180 | * Similarly, we assign zspage to: |
181 | * ZS_ALMOST_FULL when n > N / f | 181 | * ZS_ALMOST_FULL when n > N / f |
@@ -199,9 +199,6 @@ struct size_class { | |||
199 | 199 | ||
200 | spinlock_t lock; | 200 | spinlock_t lock; |
201 | 201 | ||
202 | /* stats */ | ||
203 | u64 pages_allocated; | ||
204 | |||
205 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | 202 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; |
206 | }; | 203 | }; |
207 | 204 | ||
@@ -220,6 +217,7 @@ struct zs_pool { | |||
220 | struct size_class size_class[ZS_SIZE_CLASSES]; | 217 | struct size_class size_class[ZS_SIZE_CLASSES]; |
221 | 218 | ||
222 | gfp_t flags; /* allocation flags used when growing pool */ | 219 | gfp_t flags; /* allocation flags used when growing pool */ |
220 | atomic_long_t pages_allocated; | ||
223 | }; | 221 | }; |
224 | 222 | ||
225 | /* | 223 | /* |
@@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle) | |||
299 | 297 | ||
300 | static u64 zs_zpool_total_size(void *pool) | 298 | static u64 zs_zpool_total_size(void *pool) |
301 | { | 299 | { |
302 | return zs_get_total_size_bytes(pool); | 300 | return zs_get_total_pages(pool) << PAGE_SHIFT; |
303 | } | 301 | } |
304 | 302 | ||
305 | static struct zpool_driver zs_zpool_driver = { | 303 | static struct zpool_driver zs_zpool_driver = { |
@@ -315,6 +313,7 @@ static struct zpool_driver zs_zpool_driver = { | |||
315 | .total_size = zs_zpool_total_size, | 313 | .total_size = zs_zpool_total_size, |
316 | }; | 314 | }; |
317 | 315 | ||
316 | MODULE_ALIAS("zpool-zsmalloc"); | ||
318 | #endif /* CONFIG_ZPOOL */ | 317 | #endif /* CONFIG_ZPOOL */ |
319 | 318 | ||
320 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | 319 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ |
@@ -629,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
629 | while (page) { | 628 | while (page) { |
630 | struct page *next_page; | 629 | struct page *next_page; |
631 | struct link_free *link; | 630 | struct link_free *link; |
632 | unsigned int i, objs_on_page; | 631 | unsigned int i = 1; |
633 | 632 | ||
634 | /* | 633 | /* |
635 | * page->index stores offset of first object starting | 634 | * page->index stores offset of first object starting |
@@ -642,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
642 | 641 | ||
643 | link = (struct link_free *)kmap_atomic(page) + | 642 | link = (struct link_free *)kmap_atomic(page) + |
644 | off / sizeof(*link); | 643 | off / sizeof(*link); |
645 | objs_on_page = (PAGE_SIZE - off) / class->size; | ||
646 | 644 | ||
647 | for (i = 1; i <= objs_on_page; i++) { | 645 | while ((off += class->size) < PAGE_SIZE) { |
648 | off += class->size; | 646 | link->next = obj_location_to_handle(page, i++); |
649 | if (off < PAGE_SIZE) { | 647 | link += class->size / sizeof(*link); |
650 | link->next = obj_location_to_handle(page, i); | ||
651 | link += class->size / sizeof(*link); | ||
652 | } | ||
653 | } | 648 | } |
654 | 649 | ||
655 | /* | 650 | /* |
@@ -661,7 +656,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
661 | link->next = obj_location_to_handle(next_page, 0); | 656 | link->next = obj_location_to_handle(next_page, 0); |
662 | kunmap_atomic(link); | 657 | kunmap_atomic(link); |
663 | page = next_page; | 658 | page = next_page; |
664 | off = (off + class->size) % PAGE_SIZE; | 659 | off %= PAGE_SIZE; |
665 | } | 660 | } |
666 | } | 661 | } |
667 | 662 | ||
@@ -1027,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1027 | return 0; | 1022 | return 0; |
1028 | 1023 | ||
1029 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1024 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
1025 | atomic_long_add(class->pages_per_zspage, | ||
1026 | &pool->pages_allocated); | ||
1030 | spin_lock(&class->lock); | 1027 | spin_lock(&class->lock); |
1031 | class->pages_allocated += class->pages_per_zspage; | ||
1032 | } | 1028 | } |
1033 | 1029 | ||
1034 | obj = (unsigned long)first_page->freelist; | 1030 | obj = (unsigned long)first_page->freelist; |
@@ -1081,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
1081 | 1077 | ||
1082 | first_page->inuse--; | 1078 | first_page->inuse--; |
1083 | fullness = fix_fullness_group(pool, first_page); | 1079 | fullness = fix_fullness_group(pool, first_page); |
1084 | |||
1085 | if (fullness == ZS_EMPTY) | ||
1086 | class->pages_allocated -= class->pages_per_zspage; | ||
1087 | |||
1088 | spin_unlock(&class->lock); | 1080 | spin_unlock(&class->lock); |
1089 | 1081 | ||
1090 | if (fullness == ZS_EMPTY) | 1082 | if (fullness == ZS_EMPTY) { |
1083 | atomic_long_sub(class->pages_per_zspage, | ||
1084 | &pool->pages_allocated); | ||
1091 | free_zspage(first_page); | 1085 | free_zspage(first_page); |
1086 | } | ||
1092 | } | 1087 | } |
1093 | EXPORT_SYMBOL_GPL(zs_free); | 1088 | EXPORT_SYMBOL_GPL(zs_free); |
1094 | 1089 | ||
@@ -1182,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1182 | } | 1177 | } |
1183 | EXPORT_SYMBOL_GPL(zs_unmap_object); | 1178 | EXPORT_SYMBOL_GPL(zs_unmap_object); |
1184 | 1179 | ||
1185 | u64 zs_get_total_size_bytes(struct zs_pool *pool) | 1180 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
1186 | { | 1181 | { |
1187 | int i; | 1182 | return atomic_long_read(&pool->pages_allocated); |
1188 | u64 npages = 0; | ||
1189 | |||
1190 | for (i = 0; i < ZS_SIZE_CLASSES; i++) | ||
1191 | npages += pool->size_class[i].pages_allocated; | ||
1192 | |||
1193 | return npages << PAGE_SHIFT; | ||
1194 | } | 1183 | } |
1195 | EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); | 1184 | EXPORT_SYMBOL_GPL(zs_get_total_pages); |
1196 | 1185 | ||
1197 | module_init(zs_init); | 1186 | module_init(zs_init); |
1198 | module_exit(zs_exit); | 1187 | module_exit(zs_exit); |
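The zsmalloc change replaces the per-class, lock-protected pages_allocated counters with a single pool-wide atomic_long_t, so zs_get_total_pages() becomes one atomic read with no class locks taken. Below is a rough userspace sketch of that accounting pattern using C11 atomics; the pool struct and the 4 KiB page-size shift are assumptions for illustration only.

#include <stdatomic.h>
#include <stdio.h>

/* Toy pool; the real struct zs_pool carries much more state. */
struct pool {
    atomic_long pages_allocated;  /* pool-wide counter, readable without locks */
};

/* Account backing pages when a new zspage is created (zs_malloc path). */
static void account_grow(struct pool *p, long pages_per_zspage)
{
    atomic_fetch_add(&p->pages_allocated, pages_per_zspage);
}

/* Drop the accounting when an empty zspage is freed (zs_free path). */
static void account_shrink(struct pool *p, long pages_per_zspage)
{
    atomic_fetch_sub(&p->pages_allocated, pages_per_zspage);
}

/* Mirrors zs_get_total_pages(): a single atomic read, no class locks. */
static long total_pages(struct pool *p)
{
    return atomic_load(&p->pages_allocated);
}

int main(void)
{
    struct pool p;
    atomic_init(&p.pages_allocated, 0);

    account_grow(&p, 4);
    account_grow(&p, 4);
    account_shrink(&p, 4);

    /* zs_zpool_total_size() shifts by PAGE_SHIFT; 12 assumes 4 KiB pages. */
    printf("pages: %ld, bytes: %ld\n", total_pages(&p), total_pages(&p) << 12);
    return 0;
}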