Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile5
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/balloon_compaction.c123
-rw-r--r--mm/bootmem.c4
-rw-r--r--mm/cma.c21
-rw-r--r--mm/compaction.c674
-rw-r--r--mm/debug.c237
-rw-r--r--mm/dmapool.c58
-rw-r--r--mm/filemap.c27
-rw-r--r--mm/gup.c358
-rw-r--r--mm/huge_memory.c35
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h26
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/iov_iter.c14
-rw-r--r--mm/kmemcheck.c1
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/memblock.c7
-rw-r--r--mm/memcontrol.c421
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c11
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c134
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mlock.c6
-rw-r--r--mm/mmap.c84
-rw-r--r--mm/mmu_notifier.c5
-rw-r--r--mm/mremap.c5
-rw-r--r--mm/nobootmem.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/page_alloc.c355
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu-km.c16
-rw-r--r--mm/percpu-vm.c184
-rw-r--r--mm/percpu.c524
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.c364
-rw-r--r--mm/slab.h57
-rw-r--r--mm/slab_common.c178
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c126
-rw-r--r--mm/swap.c30
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/util.c23
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c112
-rw-r--r--mm/vmstat.c153
-rw-r--r--mm/zbud.c14
-rw-r--r--mm/zpool.c2
-rw-r--r--mm/zsmalloc.c47
56 files changed, 2793 insertions(+), 1795 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..1d1ae6b078fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
 config HAVE_MEMBLOCK_PHYS_MAP
 	boolean
 
+config HAVE_GENERIC_RCU_GUP
+	boolean
+
 config ARCH_DISCARD_MEMBLOCK
 	boolean
 
@@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
 	boolean
 
 #
+# support for memory balloon
+config MEMORY_BALLOON
+	boolean
+
+#
 # support for memory balloon compaction
 config BALLOON_COMPACTION
 	bool "Allow for balloon memory compaction/migration"
 	def_bool y
-	depends on COMPACTION && VIRTIO_BALLOON
+	depends on COMPACTION && MEMORY_BALLOON
 	help
 	  Memory fragmentation introduced by ballooning might reduce
 	  significantly the number of 2MB contiguous memory blocks that can be
diff --git a/mm/Makefile b/mm/Makefile
index 2ad574d1d12d..8405eb0023a9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,9 +16,9 @@ obj-y := filemap.o mempool.o oom_kill.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o balloon_compaction.o vmacache.o \
+			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   iov_iter.o $(mmu-y)
+			   iov_iter.o debug.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -68,3 +68,4 @@ obj-$(CONFIG_ZBUD) += zbud.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
+obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1706cbbdf5f0..12a992b62576 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi_wb_init(&bdi->wb, bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
 		if (err)
 			goto err;
 	}
@@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
-	err = fprop_local_init_percpu(&bdi->completions);
+	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
 
 	if (err) {
 err:
@@ -631,7 +631,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 	 * of sleeping on the congestion queue
 	 */
 	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
-	    !zone_is_reclaim_congested(zone)) {
+	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
 		cond_resched();
 
 		/* In case we scheduled, work out time remaining */
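
The two mm/backing-dev.c hunks above only thread a gfp_t through the existing per-cpu initializers. Below is a minimal sketch of a caller under the new prototypes, assuming the 3.18-era signatures percpu_counter_init(fbc, amount, gfp) and fprop_local_init_percpu(pl, gfp); the helper name example_writeback_stats_init is made up for illustration.

#include <linux/percpu_counter.h>
#include <linux/flex_proportions.h>

/* Hypothetical helper, not part of this patch: callers in atomic context
 * would pass GFP_NOWAIT or GFP_ATOMIC instead of GFP_KERNEL. */
static int example_writeback_stats_init(struct percpu_counter *pc,
					struct fprop_local_percpu *fp)
{
	int err;

	err = percpu_counter_init(pc, 0, GFP_KERNEL);
	if (err)
		return err;

	err = fprop_local_init_percpu(fp, GFP_KERNEL);
	if (err)
		percpu_counter_destroy(pc);

	return err;
}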
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a5074bf0..b3cbe19f71b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -11,32 +11,6 @@
 #include <linux/balloon_compaction.h>
 
 /*
- * balloon_devinfo_alloc - allocates a balloon device information descriptor.
- * @balloon_dev_descriptor: pointer to reference the balloon device which
- *                          this struct balloon_dev_info will be servicing.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct balloon_dev_info which will be used to reference a balloon device
- * as well as to keep track of the balloon device page list.
- */
-struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
-{
-	struct balloon_dev_info *b_dev_info;
-	b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
-	if (!b_dev_info)
-		return ERR_PTR(-ENOMEM);
-
-	b_dev_info->balloon_device = balloon_dev_descriptor;
-	b_dev_info->mapping = NULL;
-	b_dev_info->isolated_pages = 0;
-	spin_lock_init(&b_dev_info->pages_lock);
-	INIT_LIST_HEAD(&b_dev_info->pages);
-
-	return b_dev_info;
-}
-EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
-
-/*
  * balloon_page_enqueue - allocates a new page and inserts it into the balloon
  *			  page list.
  * @b_dev_info: balloon device decriptor where we will insert a new page to
@@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
 	 */
 	BUG_ON(!trylock_page(page));
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+	balloon_page_insert(b_dev_info, page);
+	__count_vm_event(BALLOON_INFLATE);
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 	unlock_page(page);
 	return page;
@@ -93,18 +68,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 		 * to be released by the balloon driver.
 		 */
 		if (trylock_page(page)) {
+			if (!PagePrivate(page)) {
+				/* raced with isolation */
+				unlock_page(page);
+				continue;
+			}
 			spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-			/*
-			 * Raise the page refcount here to prevent any wrong
-			 * attempt to isolate this page, in case of coliding
-			 * with balloon_page_isolate() just after we release
-			 * the page lock.
-			 *
-			 * balloon_page_free() will take care of dropping
-			 * this extra refcount later.
-			 */
-			get_page(page);
 			balloon_page_delete(page);
+			__count_vm_event(BALLOON_DEFLATE);
 			spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 			unlock_page(page);
 			dequeued_page = true;
@@ -132,62 +103,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 EXPORT_SYMBOL_GPL(balloon_page_dequeue);
 
 #ifdef CONFIG_BALLOON_COMPACTION
-/*
- * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
- * @b_dev_info: holds the balloon device information descriptor.
- * @a_ops: balloon_mapping address_space_operations descriptor.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct address_space which will be used as the special page->mapping for
- * balloon device enlisted page instances.
- */
-struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
-		const struct address_space_operations *a_ops)
-{
-	struct address_space *mapping;
-
-	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
-	if (!mapping)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Give a clean 'zeroed' status to all elements of this special
-	 * balloon page->mapping struct address_space instance.
-	 */
-	address_space_init_once(mapping);
-
-	/*
-	 * Set mapping->flags appropriately, to allow balloon pages
-	 * ->mapping identification.
-	 */
-	mapping_set_balloon(mapping);
-	mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
-
-	/* balloon's page->mapping->a_ops callback descriptor */
-	mapping->a_ops = a_ops;
-
-	/*
-	 * Establish a pointer reference back to the balloon device descriptor
-	 * this particular page->mapping will be servicing.
-	 * This is used by compaction / migration procedures to identify and
-	 * access the balloon device pageset while isolating / migrating pages.
-	 *
-	 * As some balloon drivers can register multiple balloon devices
-	 * for a single guest, this also helps compaction / migration to
-	 * properly deal with multiple balloon pagesets, when required.
-	 */
-	mapping->private_data = b_dev_info;
-	b_dev_info->mapping = mapping;
-
-	return mapping;
-}
-EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
 
 static inline void __isolate_balloon_page(struct page *page)
 {
-	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	ClearPagePrivate(page);
 	list_del(&page->lru);
 	b_dev_info->isolated_pages++;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
@@ -195,20 +118,16 @@ static inline void __isolate_balloon_page(struct page *page)
 
 static inline void __putback_balloon_page(struct page *page)
 {
-	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	SetPagePrivate(page);
 	list_add(&page->lru, &b_dev_info->pages);
 	b_dev_info->isolated_pages--;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 }
 
-static inline int __migrate_balloon_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode)
-{
-	return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
-}
-
 /* __isolate_lru_page() counterpart for a ballooned page */
 bool balloon_page_isolate(struct page *page)
 {
@@ -235,12 +154,11 @@ bool balloon_page_isolate(struct page *page)
 		 */
 		if (likely(trylock_page(page))) {
 			/*
-			 * A ballooned page, by default, has just one refcount.
+			 * A ballooned page, by default, has PagePrivate set.
 			 * Prevent concurrent compaction threads from isolating
-			 * an already isolated balloon page by refcount check.
+			 * an already isolated balloon page by clearing it.
 			 */
-			if (__is_movable_balloon_page(page) &&
-					page_count(page) == 2) {
+			if (balloon_page_movable(page)) {
 				__isolate_balloon_page(page);
 				unlock_page(page);
 				return true;
@@ -276,7 +194,7 @@ void balloon_page_putback(struct page *page)
 int balloon_page_migrate(struct page *newpage,
 			struct page *page, enum migrate_mode mode)
 {
-	struct address_space *mapping;
+	struct balloon_dev_info *balloon = balloon_page_device(page);
 	int rc = -EAGAIN;
 
 	/*
@@ -292,9 +210,8 @@ int balloon_page_migrate(struct page *newpage,
 		return rc;
 	}
 
-	mapping = page->mapping;
-	if (mapping)
-		rc = __migrate_balloon_page(mapping, newpage, page, mode);
+	if (balloon && balloon->migratepage)
+		rc = balloon->migratepage(balloon, newpage, page, mode);
 
 	unlock_page(newpage);
 	return rc;
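
With the special page->mapping machinery removed, a balloon driver is expected to embed a struct balloon_dev_info and hook up a migratepage callback directly. The following is a hedged sketch of the driver side under the reworked API: balloon_devinfo_init() comes from the include/linux/balloon_compaction.h half of this series (not shown in this diff), and the my_balloon/my_migratepage/my_probe names are hypothetical.

#include <linux/migrate.h>
#include <linux/balloon_compaction.h>

static struct balloon_dev_info my_balloon;	/* hypothetical driver state */

static int my_migratepage(struct balloon_dev_info *b_dev_info,
			  struct page *newpage, struct page *page,
			  enum migrate_mode mode)
{
	/* A real driver would ask the host to exchange the two pages here,
	 * insert newpage via balloon_page_insert() under pages_lock and
	 * release the old page; elided in this sketch. */
	return MIGRATEPAGE_SUCCESS;
}

static int my_probe(void)
{
	balloon_devinfo_init(&my_balloon);	/* pages list, lock, counters */
#ifdef CONFIG_BALLOON_COMPACTION
	my_balloon.migratepage = my_migratepage;
#endif
	/* inflate/deflate paths then use balloon_page_enqueue()/dequeue() */
	return 0;
}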
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd3507b413..8a000cebb0d7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -16,9 +16,9 @@
 #include <linux/kmemleak.h>
 #include <linux/range.h>
 #include <linux/memblock.h>
+#include <linux/bug.h>
+#include <linux/io.h>
 
-#include <asm/bug.h>
-#include <asm/io.h>
 #include <asm/processor.h>
 
 #include "internal.h"
diff --git a/mm/cma.c b/mm/cma.c
index c17751c0dcaf..474c644a0dc6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,6 +32,7 @@
 #include <linux/slab.h>
 #include <linux/log2.h>
 #include <linux/cma.h>
+#include <linux/highmem.h>
 
 struct cma {
 	unsigned long	base_pfn;
@@ -163,6 +164,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
 			bool fixed, struct cma **res_cma)
 {
 	struct cma *cma;
+	phys_addr_t memblock_end = memblock_end_of_DRAM();
+	phys_addr_t highmem_start = __pa(high_memory);
 	int ret = 0;
 
 	pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
@@ -196,6 +199,24 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
 		return -EINVAL;
 
+	/*
+	 * adjust limit to avoid crossing low/high memory boundary for
+	 * automatically allocated regions
+	 */
+	if (((limit == 0 || limit > memblock_end) &&
+	     (memblock_end - size < highmem_start &&
+	      memblock_end > highmem_start)) ||
+	    (!fixed && limit > highmem_start && limit - size < highmem_start)) {
+		limit = highmem_start;
+	}
+
+	if (fixed && base < highmem_start && base+size > highmem_start) {
+		ret = -EINVAL;
+		pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
+			(unsigned long)base, (unsigned long)highmem_start);
+		goto err;
+	}
+
 	/* Reserve memory */
 	if (base && fixed) {
 		if (memblock_is_region_reserved(base, size) ||
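
To make the new boundary handling concrete, here is a small standalone illustration with made-up numbers (800 MiB of RAM with lowmem ending at 768 MiB): an automatically placed region whose default limit would let it straddle highmem_start gets its limit pulled down to highmem_start, exactly as in the condition added above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustration only; mirrors the cma_declare_contiguous() clamp above. */
int main(void)
{
	uint64_t memblock_end  = 800ULL << 20;	/* end of DRAM */
	uint64_t highmem_start = 768ULL << 20;	/* __pa(high_memory) */
	uint64_t size          = 64ULL << 20;	/* requested CMA size */
	uint64_t limit         = 0;		/* 0 = no limit given */
	bool fixed             = false;

	if (((limit == 0 || limit > memblock_end) &&
	     (memblock_end - size < highmem_start &&
	      memblock_end > highmem_start)) ||
	    (!fixed && limit > highmem_start && limit - size < highmem_start))
		limit = highmem_start;	/* region must stay in lowmem */

	printf("effective limit: %#llx\n", (unsigned long long)limit);
	return 0;
}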
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b642a..edba18aed173 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone)
+{
+	struct page *start_page;
+	struct page *end_page;
+
+	/* end_pfn is one past the range we are checking */
+	end_pfn--;
+
+	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+		return NULL;
+
+	start_page = pfn_to_page(start_pfn);
+
+	if (page_zone(start_page) != zone)
+		return NULL;
+
+	end_page = pfn_to_page(end_pfn);
+
+	/* This gives a shorter code than deriving page_zone(end_page) */
+	if (page_zone_id(start_page) != page_zone_id(end_page))
+		return NULL;
+
+	return start_page;
+}
+
 #ifdef CONFIG_COMPACTION
 /* Returns true if the pageblock should be scanned for pages to isolate. */
 static inline bool isolation_suitable(struct compact_control *cc,
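
Both scanners later in this patch call the helper above once per pageblock; a condensed sketch of that calling pattern (modelled on the isolate_freepages_range() and isolate_migratepages_range() loops further down, with the actual isolation work elided):

/* Condensed illustration of the block-by-block scan shape; not a
 * verbatim copy of either scanner. */
static void scan_zone_blocks(struct compact_control *cc,
			     unsigned long pfn, unsigned long end_pfn)
{
	unsigned long block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {
		block_end_pfn = min(block_end_pfn, end_pfn);

		/* skip blocks with holes or pages from another zone/node */
		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			continue;

		/* ... isolate pages within [pfn, block_end_pfn) ... */
	}
}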
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool set_unsuitable, bool migrate_scanner)
+			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
 	unsigned long pfn;
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (nr_isolated)
 		return;
 
-	/*
-	 * Only skip pageblocks when all forms of compaction will be known to
-	 * fail in the near future.
-	 */
-	if (set_unsuitable)
-		set_pageblock_skip(page);
+	set_pageblock_skip(page);
 
 	pfn = page_to_pfn(page);
 
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc,
180 218
181static void update_pageblock_skip(struct compact_control *cc, 219static void update_pageblock_skip(struct compact_control *cc,
182 struct page *page, unsigned long nr_isolated, 220 struct page *page, unsigned long nr_isolated,
183 bool set_unsuitable, bool migrate_scanner) 221 bool migrate_scanner)
184{ 222{
185} 223}
186#endif /* CONFIG_COMPACTION */ 224#endif /* CONFIG_COMPACTION */
187 225
188static inline bool should_release_lock(spinlock_t *lock) 226/*
227 * Compaction requires the taking of some coarse locks that are potentially
228 * very heavily contended. For async compaction, back out if the lock cannot
229 * be taken immediately. For sync compaction, spin on the lock if needed.
230 *
231 * Returns true if the lock is held
232 * Returns false if the lock is not held and compaction should abort
233 */
234static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
235 struct compact_control *cc)
189{ 236{
190 return need_resched() || spin_is_contended(lock); 237 if (cc->mode == MIGRATE_ASYNC) {
238 if (!spin_trylock_irqsave(lock, *flags)) {
239 cc->contended = COMPACT_CONTENDED_LOCK;
240 return false;
241 }
242 } else {
243 spin_lock_irqsave(lock, *flags);
244 }
245
246 return true;
191} 247}
192 248
193/* 249/*
194 * Compaction requires the taking of some coarse locks that are potentially 250 * Compaction requires the taking of some coarse locks that are potentially
195 * very heavily contended. Check if the process needs to be scheduled or 251 * very heavily contended. The lock should be periodically unlocked to avoid
196 * if the lock is contended. For async compaction, back out in the event 252 * having disabled IRQs for a long time, even when there is nobody waiting on
197 * if contention is severe. For sync compaction, schedule. 253 * the lock. It might also be that allowing the IRQs will result in
254 * need_resched() becoming true. If scheduling is needed, async compaction
255 * aborts. Sync compaction schedules.
256 * Either compaction type will also abort if a fatal signal is pending.
257 * In either case if the lock was locked, it is dropped and not regained.
198 * 258 *
199 * Returns true if the lock is held. 259 * Returns true if compaction should abort due to fatal signal pending, or
200 * Returns false if the lock is released and compaction should abort 260 * async compaction due to need_resched()
261 * Returns false when compaction can continue (sync compaction might have
262 * scheduled)
201 */ 263 */
202static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, 264static bool compact_unlock_should_abort(spinlock_t *lock,
203 bool locked, struct compact_control *cc) 265 unsigned long flags, bool *locked, struct compact_control *cc)
204{ 266{
205 if (should_release_lock(lock)) { 267 if (*locked) {
206 if (locked) { 268 spin_unlock_irqrestore(lock, flags);
207 spin_unlock_irqrestore(lock, *flags); 269 *locked = false;
208 locked = false; 270 }
209 } 271
272 if (fatal_signal_pending(current)) {
273 cc->contended = COMPACT_CONTENDED_SCHED;
274 return true;
275 }
210 276
211 /* async aborts if taking too long or contended */ 277 if (need_resched()) {
212 if (cc->mode == MIGRATE_ASYNC) { 278 if (cc->mode == MIGRATE_ASYNC) {
213 cc->contended = true; 279 cc->contended = COMPACT_CONTENDED_SCHED;
214 return false; 280 return true;
215 } 281 }
216
217 cond_resched(); 282 cond_resched();
218 } 283 }
219 284
220 if (!locked) 285 return false;
221 spin_lock_irqsave(lock, *flags);
222 return true;
223} 286}
224 287
225/* 288/*
226 * Aside from avoiding lock contention, compaction also periodically checks 289 * Aside from avoiding lock contention, compaction also periodically checks
227 * need_resched() and either schedules in sync compaction or aborts async 290 * need_resched() and either schedules in sync compaction or aborts async
228 * compaction. This is similar to what compact_checklock_irqsave() does, but 291 * compaction. This is similar to what compact_unlock_should_abort() does, but
229 * is used where no lock is concerned. 292 * is used where no lock is concerned.
230 * 293 *
231 * Returns false when no scheduling was needed, or sync compaction scheduled. 294 * Returns false when no scheduling was needed, or sync compaction scheduled.
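
The hunk above replaces the old should_release_lock()/compact_checklock_irqsave() pair with two primitives: compact_trylock_irqsave(), which takes a lock lazily (async compaction backs off instead of spinning), and compact_unlock_should_abort(), which periodically drops a held lock and reports whether the scan should stop. A hedged sketch of how a scanner loop combines them (modelled on the isolate_freepages_block() and isolate_migratepages_block() changes later in this diff; the per-page checks and isolation work are elided):

/* Illustration of the locking pattern; not a verbatim copy of either
 * scanner. */
static unsigned long scan_with_periodic_unlock(struct compact_control *cc,
				unsigned long pfn, unsigned long end_pfn,
				spinlock_t *lock)
{
	unsigned long flags = 0;
	bool locked = false;

	for (; pfn < end_pfn; pfn++) {
		/* every SWAP_CLUSTER_MAX pfns: drop the lock if held and
		 * abort on fatal signal or (async) need_resched() */
		if (!(pfn % SWAP_CLUSTER_MAX) &&
		    compact_unlock_should_abort(lock, flags, &locked, cc))
			break;

		/* cheap lockless checks would go here ... */

		/* take the lock only once a candidate page is found */
		if (!locked) {
			locked = compact_trylock_irqsave(lock, &flags, cc);
			if (!locked)
				break;
			/* ... recheck the candidate under the lock ... */
		}

		/* ... isolate the page ... */
	}

	if (locked)
		spin_unlock_irqrestore(lock, flags);

	return pfn;
}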
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
236 /* async compaction aborts if contended */ 299 /* async compaction aborts if contended */
237 if (need_resched()) { 300 if (need_resched()) {
238 if (cc->mode == MIGRATE_ASYNC) { 301 if (cc->mode == MIGRATE_ASYNC) {
239 cc->contended = true; 302 cc->contended = COMPACT_CONTENDED_SCHED;
240 return true; 303 return true;
241 } 304 }
242 305
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc)
250static bool suitable_migration_target(struct page *page) 313static bool suitable_migration_target(struct page *page)
251{ 314{
252 /* If the page is a large free page, then disallow migration */ 315 /* If the page is a large free page, then disallow migration */
253 if (PageBuddy(page) && page_order(page) >= pageblock_order) 316 if (PageBuddy(page)) {
254 return false; 317 /*
318 * We are checking page_order without zone->lock taken. But
319 * the only small danger is that we skip a potentially suitable
320 * pageblock, so it's not worth to check order for valid range.
321 */
322 if (page_order_unsafe(page) >= pageblock_order)
323 return false;
324 }
255 325
256 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 326 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
257 if (migrate_async_suitable(get_pageblock_migratetype(page))) 327 if (migrate_async_suitable(get_pageblock_migratetype(page)))
@@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page)
267 * (even though it may still end up isolating some pages). 337 * (even though it may still end up isolating some pages).
268 */ 338 */
269static unsigned long isolate_freepages_block(struct compact_control *cc, 339static unsigned long isolate_freepages_block(struct compact_control *cc,
270 unsigned long blockpfn, 340 unsigned long *start_pfn,
271 unsigned long end_pfn, 341 unsigned long end_pfn,
272 struct list_head *freelist, 342 struct list_head *freelist,
273 bool strict) 343 bool strict)
274{ 344{
275 int nr_scanned = 0, total_isolated = 0; 345 int nr_scanned = 0, total_isolated = 0;
276 struct page *cursor, *valid_page = NULL; 346 struct page *cursor, *valid_page = NULL;
277 unsigned long flags; 347 unsigned long flags = 0;
278 bool locked = false; 348 bool locked = false;
279 bool checked_pageblock = false; 349 unsigned long blockpfn = *start_pfn;
280 350
281 cursor = pfn_to_page(blockpfn); 351 cursor = pfn_to_page(blockpfn);
282 352
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
285 int isolated, i; 355 int isolated, i;
286 struct page *page = cursor; 356 struct page *page = cursor;
287 357
358 /*
359 * Periodically drop the lock (if held) regardless of its
360 * contention, to give chance to IRQs. Abort if fatal signal
361 * pending or async compaction detects need_resched()
362 */
363 if (!(blockpfn % SWAP_CLUSTER_MAX)
364 && compact_unlock_should_abort(&cc->zone->lock, flags,
365 &locked, cc))
366 break;
367
288 nr_scanned++; 368 nr_scanned++;
289 if (!pfn_valid_within(blockpfn)) 369 if (!pfn_valid_within(blockpfn))
290 goto isolate_fail; 370 goto isolate_fail;
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
295 goto isolate_fail; 375 goto isolate_fail;
296 376
297 /* 377 /*
298 * The zone lock must be held to isolate freepages. 378 * If we already hold the lock, we can skip some rechecking.
299 * Unfortunately this is a very coarse lock and can be 379 * Note that if we hold the lock now, checked_pageblock was
300 * heavily contended if there are parallel allocations 380 * already set in some previous iteration (or strict is true),
301 * or parallel compactions. For async compaction do not 381 * so it is correct to skip the suitable migration target
302 * spin on the lock and we acquire the lock as late as 382 * recheck as well.
303 * possible.
304 */ 383 */
305 locked = compact_checklock_irqsave(&cc->zone->lock, &flags, 384 if (!locked) {
306 locked, cc);
307 if (!locked)
308 break;
309
310 /* Recheck this is a suitable migration target under lock */
311 if (!strict && !checked_pageblock) {
312 /* 385 /*
313 * We need to check suitability of pageblock only once 386 * The zone lock must be held to isolate freepages.
314 * and this isolate_freepages_block() is called with 387 * Unfortunately this is a very coarse lock and can be
315 * pageblock range, so just check once is sufficient. 388 * heavily contended if there are parallel allocations
389 * or parallel compactions. For async compaction do not
390 * spin on the lock and we acquire the lock as late as
391 * possible.
316 */ 392 */
317 checked_pageblock = true; 393 locked = compact_trylock_irqsave(&cc->zone->lock,
318 if (!suitable_migration_target(page)) 394 &flags, cc);
395 if (!locked)
319 break; 396 break;
320 }
321 397
322 /* Recheck this is a buddy page under lock */ 398 /* Recheck this is a buddy page under lock */
323 if (!PageBuddy(page)) 399 if (!PageBuddy(page))
324 goto isolate_fail; 400 goto isolate_fail;
401 }
325 402
326 /* Found a free page, break it into order-0 pages */ 403 /* Found a free page, break it into order-0 pages */
327 isolated = split_free_page(page); 404 isolated = split_free_page(page);
@@ -346,6 +423,9 @@ isolate_fail:
346 423
347 } 424 }
348 425
426 /* Record how far we have got within the block */
427 *start_pfn = blockpfn;
428
349 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 429 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
350 430
351 /* 431 /*
@@ -361,8 +441,7 @@ isolate_fail:
361 441
362 /* Update the pageblock-skip if the whole pageblock was scanned */ 442 /* Update the pageblock-skip if the whole pageblock was scanned */
363 if (blockpfn == end_pfn) 443 if (blockpfn == end_pfn)
364 update_pageblock_skip(cc, valid_page, total_isolated, true, 444 update_pageblock_skip(cc, valid_page, total_isolated, false);
365 false);
366 445
367 count_compact_events(COMPACTFREE_SCANNED, nr_scanned); 446 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
368 if (total_isolated) 447 if (total_isolated)
@@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc,
390 unsigned long isolated, pfn, block_end_pfn; 469 unsigned long isolated, pfn, block_end_pfn;
391 LIST_HEAD(freelist); 470 LIST_HEAD(freelist);
392 471
393 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 472 pfn = start_pfn;
394 if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) 473 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
395 break; 474
475 for (; pfn < end_pfn; pfn += isolated,
476 block_end_pfn += pageblock_nr_pages) {
477 /* Protect pfn from changing by isolate_freepages_block */
478 unsigned long isolate_start_pfn = pfn;
396 479
397 /*
398 * On subsequent iterations ALIGN() is actually not needed,
399 * but we keep it that we not to complicate the code.
400 */
401 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
402 block_end_pfn = min(block_end_pfn, end_pfn); 480 block_end_pfn = min(block_end_pfn, end_pfn);
403 481
404 isolated = isolate_freepages_block(cc, pfn, block_end_pfn, 482 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
405 &freelist, true); 483 break;
484
485 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
486 block_end_pfn, &freelist, true);
406 487
407 /* 488 /*
408 * In strict mode, isolate_freepages_block() returns 0 if 489 * In strict mode, isolate_freepages_block() returns 0 if
@@ -433,22 +514,19 @@ isolate_freepages_range(struct compact_control *cc,
433} 514}
434 515
435/* Update the number of anon and file isolated pages in the zone */ 516/* Update the number of anon and file isolated pages in the zone */
436static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) 517static void acct_isolated(struct zone *zone, struct compact_control *cc)
437{ 518{
438 struct page *page; 519 struct page *page;
439 unsigned int count[2] = { 0, }; 520 unsigned int count[2] = { 0, };
440 521
522 if (list_empty(&cc->migratepages))
523 return;
524
441 list_for_each_entry(page, &cc->migratepages, lru) 525 list_for_each_entry(page, &cc->migratepages, lru)
442 count[!!page_is_file_cache(page)]++; 526 count[!!page_is_file_cache(page)]++;
443 527
444 /* If locked we can use the interrupt unsafe versions */ 528 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
445 if (locked) { 529 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
446 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
447 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
448 } else {
449 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
450 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
451 }
452} 530}
453 531
454/* Similar to reclaim, but different enough that they don't share logic */ 532/* Similar to reclaim, but different enough that they don't share logic */
@@ -467,40 +545,34 @@ static bool too_many_isolated(struct zone *zone)
467} 545}
468 546
469/** 547/**
470 * isolate_migratepages_range() - isolate all migrate-able pages in range. 548 * isolate_migratepages_block() - isolate all migrate-able pages within
471 * @zone: Zone pages are in. 549 * a single pageblock
472 * @cc: Compaction control structure. 550 * @cc: Compaction control structure.
473 * @low_pfn: The first PFN of the range. 551 * @low_pfn: The first PFN to isolate
474 * @end_pfn: The one-past-the-last PFN of the range. 552 * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
475 * @unevictable: true if it allows to isolate unevictable pages 553 * @isolate_mode: Isolation mode to be used.
476 * 554 *
477 * Isolate all pages that can be migrated from the range specified by 555 * Isolate all pages that can be migrated from the range specified by
478 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 556 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
479 * pending), otherwise PFN of the first page that was not scanned 557 * Returns zero if there is a fatal signal pending, otherwise PFN of the
480 * (which may be both less, equal to or more then end_pfn). 558 * first page that was not scanned (which may be both less, equal to or more
559 * than end_pfn).
481 * 560 *
482 * Assumes that cc->migratepages is empty and cc->nr_migratepages is 561 * The pages are isolated on cc->migratepages list (not required to be empty),
483 * zero. 562 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
484 * 563 * is neither read nor updated.
485 * Apart from cc->migratepages and cc->nr_migratetypes this function
486 * does not modify any cc's fields, in particular it does not modify
487 * (or read for that matter) cc->migrate_pfn.
488 */ 564 */
489unsigned long 565static unsigned long
490isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 566isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
491 unsigned long low_pfn, unsigned long end_pfn, bool unevictable) 567 unsigned long end_pfn, isolate_mode_t isolate_mode)
492{ 568{
493 unsigned long last_pageblock_nr = 0, pageblock_nr; 569 struct zone *zone = cc->zone;
494 unsigned long nr_scanned = 0, nr_isolated = 0; 570 unsigned long nr_scanned = 0, nr_isolated = 0;
495 struct list_head *migratelist = &cc->migratepages; 571 struct list_head *migratelist = &cc->migratepages;
496 struct lruvec *lruvec; 572 struct lruvec *lruvec;
497 unsigned long flags; 573 unsigned long flags = 0;
498 bool locked = false; 574 bool locked = false;
499 struct page *page = NULL, *valid_page = NULL; 575 struct page *page = NULL, *valid_page = NULL;
500 bool set_unsuitable = true;
501 const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
502 ISOLATE_ASYNC_MIGRATE : 0) |
503 (unevictable ? ISOLATE_UNEVICTABLE : 0);
504 576
505 /* 577 /*
506 * Ensure that there are not too many pages isolated from the LRU 578 * Ensure that there are not too many pages isolated from the LRU
@@ -523,72 +595,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
523 595
524 /* Time to isolate some pages for migration */ 596 /* Time to isolate some pages for migration */
525 for (; low_pfn < end_pfn; low_pfn++) { 597 for (; low_pfn < end_pfn; low_pfn++) {
526 /* give a chance to irqs before checking need_resched() */
527 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
528 if (should_release_lock(&zone->lru_lock)) {
529 spin_unlock_irqrestore(&zone->lru_lock, flags);
530 locked = false;
531 }
532 }
533
534 /* 598 /*
535 * migrate_pfn does not necessarily start aligned to a 599 * Periodically drop the lock (if held) regardless of its
536 * pageblock. Ensure that pfn_valid is called when moving 600 * contention, to give chance to IRQs. Abort async compaction
537 * into a new MAX_ORDER_NR_PAGES range in case of large 601 * if contended.
538 * memory holes within the zone
539 */ 602 */
540 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { 603 if (!(low_pfn % SWAP_CLUSTER_MAX)
541 if (!pfn_valid(low_pfn)) { 604 && compact_unlock_should_abort(&zone->lru_lock, flags,
542 low_pfn += MAX_ORDER_NR_PAGES - 1; 605 &locked, cc))
543 continue; 606 break;
544 }
545 }
546 607
547 if (!pfn_valid_within(low_pfn)) 608 if (!pfn_valid_within(low_pfn))
548 continue; 609 continue;
549 nr_scanned++; 610 nr_scanned++;
550 611
551 /*
552 * Get the page and ensure the page is within the same zone.
553 * See the comment in isolate_freepages about overlapping
554 * nodes. It is deliberate that the new zone lock is not taken
555 * as memory compaction should not move pages between nodes.
556 */
557 page = pfn_to_page(low_pfn); 612 page = pfn_to_page(low_pfn);
558 if (page_zone(page) != zone)
559 continue;
560 613
561 if (!valid_page) 614 if (!valid_page)
562 valid_page = page; 615 valid_page = page;
563 616
564 /* If isolation recently failed, do not retry */ 617 /*
565 pageblock_nr = low_pfn >> pageblock_order; 618 * Skip if free. We read page order here without zone lock
566 if (last_pageblock_nr != pageblock_nr) { 619 * which is generally unsafe, but the race window is small and
567 int mt; 620 * the worst thing that can happen is that we skip some
568 621 * potential isolation targets.
569 last_pageblock_nr = pageblock_nr; 622 */
570 if (!isolation_suitable(cc, page)) 623 if (PageBuddy(page)) {
571 goto next_pageblock; 624 unsigned long freepage_order = page_order_unsafe(page);
572 625
573 /* 626 /*
574 * For async migration, also only scan in MOVABLE 627 * Without lock, we cannot be sure that what we got is
575 * blocks. Async migration is optimistic to see if 628 * a valid page order. Consider only values in the
576 * the minimum amount of work satisfies the allocation 629 * valid order range to prevent low_pfn overflow.
577 */ 630 */
578 mt = get_pageblock_migratetype(page); 631 if (freepage_order > 0 && freepage_order < MAX_ORDER)
579 if (cc->mode == MIGRATE_ASYNC && 632 low_pfn += (1UL << freepage_order) - 1;
580 !migrate_async_suitable(mt)) {
581 set_unsuitable = false;
582 goto next_pageblock;
583 }
584 }
585
586 /*
587 * Skip if free. page_order cannot be used without zone->lock
588 * as nothing prevents parallel allocations or buddy merging.
589 */
590 if (PageBuddy(page))
591 continue; 633 continue;
634 }
592 635
593 /* 636 /*
594 * Check may be lockless but that's ok as we recheck later. 637 * Check may be lockless but that's ok as we recheck later.
@@ -597,7 +640,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
597 */ 640 */
598 if (!PageLRU(page)) { 641 if (!PageLRU(page)) {
599 if (unlikely(balloon_page_movable(page))) { 642 if (unlikely(balloon_page_movable(page))) {
600 if (locked && balloon_page_isolate(page)) { 643 if (balloon_page_isolate(page)) {
601 /* Successfully isolated */ 644 /* Successfully isolated */
602 goto isolate_success; 645 goto isolate_success;
603 } 646 }
@@ -617,8 +660,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
617 */ 660 */
618 if (PageTransHuge(page)) { 661 if (PageTransHuge(page)) {
619 if (!locked) 662 if (!locked)
620 goto next_pageblock; 663 low_pfn = ALIGN(low_pfn + 1,
621 low_pfn += (1 << compound_order(page)) - 1; 664 pageblock_nr_pages) - 1;
665 else
666 low_pfn += (1 << compound_order(page)) - 1;
667
622 continue; 668 continue;
623 } 669 }
624 670
@@ -631,24 +677,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
631 page_count(page) > page_mapcount(page)) 677 page_count(page) > page_mapcount(page))
632 continue; 678 continue;
633 679
634 /* Check if it is ok to still hold the lock */ 680 /* If we already hold the lock, we can skip some rechecking */
635 locked = compact_checklock_irqsave(&zone->lru_lock, &flags, 681 if (!locked) {
636 locked, cc); 682 locked = compact_trylock_irqsave(&zone->lru_lock,
637 if (!locked || fatal_signal_pending(current)) 683 &flags, cc);
638 break; 684 if (!locked)
685 break;
639 686
640 /* Recheck PageLRU and PageTransHuge under lock */ 687 /* Recheck PageLRU and PageTransHuge under lock */
641 if (!PageLRU(page)) 688 if (!PageLRU(page))
642 continue; 689 continue;
643 if (PageTransHuge(page)) { 690 if (PageTransHuge(page)) {
644 low_pfn += (1 << compound_order(page)) - 1; 691 low_pfn += (1 << compound_order(page)) - 1;
645 continue; 692 continue;
693 }
646 } 694 }
647 695
648 lruvec = mem_cgroup_page_lruvec(page, zone); 696 lruvec = mem_cgroup_page_lruvec(page, zone);
649 697
650 /* Try isolate the page */ 698 /* Try isolate the page */
651 if (__isolate_lru_page(page, mode) != 0) 699 if (__isolate_lru_page(page, isolate_mode) != 0)
652 continue; 700 continue;
653 701
654 VM_BUG_ON_PAGE(PageTransCompound(page), page); 702 VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -667,14 +715,14 @@ isolate_success:
667 ++low_pfn; 715 ++low_pfn;
668 break; 716 break;
669 } 717 }
670
671 continue;
672
673next_pageblock:
674 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
675 } 718 }
676 719
677 acct_isolated(zone, locked, cc); 720 /*
721 * The PageBuddy() check could have potentially brought us outside
722 * the range to be scanned.
723 */
724 if (unlikely(low_pfn > end_pfn))
725 low_pfn = end_pfn;
678 726
679 if (locked) 727 if (locked)
680 spin_unlock_irqrestore(&zone->lru_lock, flags); 728 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -684,8 +732,7 @@ next_pageblock:
684 * if the whole pageblock was scanned without isolating any page. 732 * if the whole pageblock was scanned without isolating any page.
685 */ 733 */
686 if (low_pfn == end_pfn) 734 if (low_pfn == end_pfn)
687 update_pageblock_skip(cc, valid_page, nr_isolated, 735 update_pageblock_skip(cc, valid_page, nr_isolated, true);
688 set_unsuitable, true);
689 736
690 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 737 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
691 738
@@ -696,17 +743,65 @@ next_pageblock:
696 return low_pfn; 743 return low_pfn;
697} 744}
698 745
746/**
747 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
748 * @cc: Compaction control structure.
749 * @start_pfn: The first PFN to start isolating.
750 * @end_pfn: The one-past-last PFN.
751 *
752 * Returns zero if isolation fails fatally due to e.g. pending signal.
753 * Otherwise, function returns one-past-the-last PFN of isolated page
754 * (which may be greater than end_pfn if end fell in a middle of a THP page).
755 */
756unsigned long
757isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
758 unsigned long end_pfn)
759{
760 unsigned long pfn, block_end_pfn;
761
762 /* Scan block by block. First and last block may be incomplete */
763 pfn = start_pfn;
764 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
765
766 for (; pfn < end_pfn; pfn = block_end_pfn,
767 block_end_pfn += pageblock_nr_pages) {
768
769 block_end_pfn = min(block_end_pfn, end_pfn);
770
771 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
772 continue;
773
774 pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
775 ISOLATE_UNEVICTABLE);
776
777 /*
778 * In case of fatal failure, release everything that might
779 * have been isolated in the previous iteration, and signal
780 * the failure back to caller.
781 */
782 if (!pfn) {
783 putback_movable_pages(&cc->migratepages);
784 cc->nr_migratepages = 0;
785 break;
786 }
787 }
788 acct_isolated(cc->zone, cc);
789
790 return pfn;
791}
792
699#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 793#endif /* CONFIG_COMPACTION || CONFIG_CMA */
700#ifdef CONFIG_COMPACTION 794#ifdef CONFIG_COMPACTION
701/* 795/*
702 * Based on information in the current compact_control, find blocks 796 * Based on information in the current compact_control, find blocks
703 * suitable for isolating free pages from and then isolate them. 797 * suitable for isolating free pages from and then isolate them.
704 */ 798 */
705static void isolate_freepages(struct zone *zone, 799static void isolate_freepages(struct compact_control *cc)
706 struct compact_control *cc)
707{ 800{
801 struct zone *zone = cc->zone;
708 struct page *page; 802 struct page *page;
709 unsigned long block_start_pfn; /* start of current pageblock */ 803 unsigned long block_start_pfn; /* start of current pageblock */
804 unsigned long isolate_start_pfn; /* exact pfn we start at */
710 unsigned long block_end_pfn; /* end of current pageblock */ 805 unsigned long block_end_pfn; /* end of current pageblock */
711 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 806 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
712 int nr_freepages = cc->nr_freepages; 807 int nr_freepages = cc->nr_freepages;
@@ -715,14 +810,15 @@ static void isolate_freepages(struct zone *zone,
715 /* 810 /*
716 * Initialise the free scanner. The starting point is where we last 811 * Initialise the free scanner. The starting point is where we last
717 * successfully isolated from, zone-cached value, or the end of the 812 * successfully isolated from, zone-cached value, or the end of the
718 * zone when isolating for the first time. We need this aligned to 813 * zone when isolating for the first time. For looping we also need
719 * the pageblock boundary, because we do 814 * this pfn aligned down to the pageblock boundary, because we do
720 * block_start_pfn -= pageblock_nr_pages in the for loop. 815 * block_start_pfn -= pageblock_nr_pages in the for loop.
721 * For ending point, take care when isolating in last pageblock of a 816 * For ending point, take care when isolating in last pageblock of a
722 * a zone which ends in the middle of a pageblock. 817 * a zone which ends in the middle of a pageblock.
723 * The low boundary is the end of the pageblock the migration scanner 818 * The low boundary is the end of the pageblock the migration scanner
724 * is using. 819 * is using.
725 */ 820 */
821 isolate_start_pfn = cc->free_pfn;
726 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); 822 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
727 block_end_pfn = min(block_start_pfn + pageblock_nr_pages, 823 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
728 zone_end_pfn(zone)); 824 zone_end_pfn(zone));
@@ -735,7 +831,8 @@ static void isolate_freepages(struct zone *zone,
735 */ 831 */
736 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 832 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
737 block_end_pfn = block_start_pfn, 833 block_end_pfn = block_start_pfn,
738 block_start_pfn -= pageblock_nr_pages) { 834 block_start_pfn -= pageblock_nr_pages,
835 isolate_start_pfn = block_start_pfn) {
739 unsigned long isolated; 836 unsigned long isolated;
740 837
741 /* 838 /*
@@ -747,18 +844,9 @@ static void isolate_freepages(struct zone *zone,
747 && compact_should_abort(cc)) 844 && compact_should_abort(cc))
748 break; 845 break;
749 846
750 if (!pfn_valid(block_start_pfn)) 847 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
751 continue; 848 zone);
752 849 if (!page)
753 /*
754 * Check for overlapping nodes/zones. It's possible on some
755 * configurations to have a setup like
756 * node0 node1 node0
757 * i.e. it's possible that all pages within a zones range of
758 * pages do not belong to a single zone.
759 */
760 page = pfn_to_page(block_start_pfn);
761 if (page_zone(page) != zone)
762 continue; 850 continue;
763 851
764 /* Check the block is suitable for migration */ 852 /* Check the block is suitable for migration */
@@ -769,13 +857,25 @@ static void isolate_freepages(struct zone *zone,
769 if (!isolation_suitable(cc, page)) 857 if (!isolation_suitable(cc, page))
770 continue; 858 continue;
771 859
772 /* Found a block suitable for isolating free pages from */ 860 /* Found a block suitable for isolating free pages from. */
773 cc->free_pfn = block_start_pfn; 861 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
774 isolated = isolate_freepages_block(cc, block_start_pfn,
775 block_end_pfn, freelist, false); 862 block_end_pfn, freelist, false);
776 nr_freepages += isolated; 863 nr_freepages += isolated;
777 864
778 /* 865 /*
866 * Remember where the free scanner should restart next time,
867 * which is where isolate_freepages_block() left off.
868 * But if it scanned the whole pageblock, isolate_start_pfn
869 * now points at block_end_pfn, which is the start of the next
870 * pageblock.
871 * In that case we will however want to restart at the start
872 * of the previous pageblock.
873 */
874 cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
875 isolate_start_pfn :
876 block_start_pfn - pageblock_nr_pages;
877
878 /*
779 * Set a flag that we successfully isolated in this pageblock. 879 * Set a flag that we successfully isolated in this pageblock.
780 * In the next loop iteration, zone->compact_cached_free_pfn 880 * In the next loop iteration, zone->compact_cached_free_pfn
781 * will not be updated and thus it will effectively contain the 881 * will not be updated and thus it will effectively contain the
@@ -822,7 +922,7 @@ static struct page *compaction_alloc(struct page *migratepage,
822 */ 922 */
823 if (list_empty(&cc->freepages)) { 923 if (list_empty(&cc->freepages)) {
824 if (!cc->contended) 924 if (!cc->contended)
825 isolate_freepages(cc->zone, cc); 925 isolate_freepages(cc);
826 926
827 if (list_empty(&cc->freepages)) 927 if (list_empty(&cc->freepages))
828 return NULL; 928 return NULL;
@@ -856,38 +956,84 @@ typedef enum {
856} isolate_migrate_t; 956} isolate_migrate_t;
857 957
858/* 958/*
859 * Isolate all pages that can be migrated from the block pointed to by 959 * Isolate all pages that can be migrated from the first suitable block,
860 * the migrate scanner within compact_control. 960 * starting at the block pointed to by the migrate scanner pfn within
961 * compact_control.
861 */ 962 */
862static isolate_migrate_t isolate_migratepages(struct zone *zone, 963static isolate_migrate_t isolate_migratepages(struct zone *zone,
863 struct compact_control *cc) 964 struct compact_control *cc)
864{ 965{
865 unsigned long low_pfn, end_pfn; 966 unsigned long low_pfn, end_pfn;
967 struct page *page;
968 const isolate_mode_t isolate_mode =
969 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
866 970
867 /* Do not scan outside zone boundaries */ 971 /*
868 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 972 * Start at where we last stopped, or beginning of the zone as
973 * initialized by compact_zone()
974 */
975 low_pfn = cc->migrate_pfn;
869 976
870 /* Only scan within a pageblock boundary */ 977 /* Only scan within a pageblock boundary */
871 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); 978 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
872 979
873 /* Do not cross the free scanner or scan within a memory hole */ 980 /*
874 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 981 * Iterate over whole pageblocks until we find the first suitable.
875 cc->migrate_pfn = end_pfn; 982 * Do not cross the free scanner.
876 return ISOLATE_NONE; 983 */
877 } 984 for (; end_pfn <= cc->free_pfn;
985 low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
878 986
879 /* Perform the isolation */ 987 /*
880 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); 988 * This can potentially iterate a massively long zone with
881 if (!low_pfn || cc->contended) 989 * many pageblocks unsuitable, so periodically check if we
882 return ISOLATE_ABORT; 990 * need to schedule, or even abort async compaction.
991 */
992 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
993 && compact_should_abort(cc))
994 break;
995
996 page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
997 if (!page)
998 continue;
999
1000 /* If isolation recently failed, do not retry */
1001 if (!isolation_suitable(cc, page))
1002 continue;
1003
1004 /*
1005 * For async compaction, also only scan in MOVABLE blocks.
1006 * Async compaction is optimistic to see if the minimum amount
1007 * of work satisfies the allocation.
1008 */
1009 if (cc->mode == MIGRATE_ASYNC &&
1010 !migrate_async_suitable(get_pageblock_migratetype(page)))
1011 continue;
1012
1013 /* Perform the isolation */
1014 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1015 isolate_mode);
883 1016
1017 if (!low_pfn || cc->contended)
1018 return ISOLATE_ABORT;
1019
1020 /*
1021 * Either we isolated something and proceed with migration. Or
1022 * we failed and compact_zone should decide if we should
1023 * continue or not.
1024 */
1025 break;
1026 }
1027
1028 acct_isolated(zone, cc);
1029 /* Record where migration scanner will be restarted */
884 cc->migrate_pfn = low_pfn; 1030 cc->migrate_pfn = low_pfn;
885 1031
886 return ISOLATE_SUCCESS; 1032 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
887} 1033}
888 1034
889static int compact_finished(struct zone *zone, 1035static int compact_finished(struct zone *zone, struct compact_control *cc,
890 struct compact_control *cc) 1036 const int migratetype)
891{ 1037{
892 unsigned int order; 1038 unsigned int order;
893 unsigned long watermark; 1039 unsigned long watermark;
@@ -933,7 +1079,7 @@ static int compact_finished(struct zone *zone,
933 struct free_area *area = &zone->free_area[order]; 1079 struct free_area *area = &zone->free_area[order];
934 1080
935 /* Job done if page is free of the right migratetype */ 1081 /* Job done if page is free of the right migratetype */
936 if (!list_empty(&area->free_list[cc->migratetype])) 1082 if (!list_empty(&area->free_list[migratetype]))
937 return COMPACT_PARTIAL; 1083 return COMPACT_PARTIAL;
938 1084
939 /* Job done if allocation would set block type */ 1085 /* Job done if allocation would set block type */
@@ -999,6 +1145,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
999 int ret; 1145 int ret;
1000 unsigned long start_pfn = zone->zone_start_pfn; 1146 unsigned long start_pfn = zone->zone_start_pfn;
1001 unsigned long end_pfn = zone_end_pfn(zone); 1147 unsigned long end_pfn = zone_end_pfn(zone);
1148 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1002 const bool sync = cc->mode != MIGRATE_ASYNC; 1149 const bool sync = cc->mode != MIGRATE_ASYNC;
1003 1150
1004 ret = compaction_suitable(zone, cc->order); 1151 ret = compaction_suitable(zone, cc->order);
@@ -1041,7 +1188,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1041 1188
1042 migrate_prep_local(); 1189 migrate_prep_local();
1043 1190
1044 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 1191 while ((ret = compact_finished(zone, cc, migratetype)) ==
1192 COMPACT_CONTINUE) {
1045 int err; 1193 int err;
1046 1194
1047 switch (isolate_migratepages(zone, cc)) { 1195 switch (isolate_migratepages(zone, cc)) {
@@ -1056,9 +1204,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1056 ; 1204 ;
1057 } 1205 }
1058 1206
1059 if (!cc->nr_migratepages)
1060 continue;
1061
1062 err = migrate_pages(&cc->migratepages, compaction_alloc, 1207 err = migrate_pages(&cc->migratepages, compaction_alloc,
1063 compaction_free, (unsigned long)cc, cc->mode, 1208 compaction_free, (unsigned long)cc, cc->mode,
1064 MR_COMPACTION); 1209 MR_COMPACTION);
@@ -1092,14 +1237,14 @@ out:
1092} 1237}
1093 1238
1094static unsigned long compact_zone_order(struct zone *zone, int order, 1239static unsigned long compact_zone_order(struct zone *zone, int order,
1095 gfp_t gfp_mask, enum migrate_mode mode, bool *contended) 1240 gfp_t gfp_mask, enum migrate_mode mode, int *contended)
1096{ 1241{
1097 unsigned long ret; 1242 unsigned long ret;
1098 struct compact_control cc = { 1243 struct compact_control cc = {
1099 .nr_freepages = 0, 1244 .nr_freepages = 0,
1100 .nr_migratepages = 0, 1245 .nr_migratepages = 0,
1101 .order = order, 1246 .order = order,
1102 .migratetype = allocflags_to_migratetype(gfp_mask), 1247 .gfp_mask = gfp_mask,
1103 .zone = zone, 1248 .zone = zone,
1104 .mode = mode, 1249 .mode = mode,
1105 }; 1250 };
@@ -1124,48 +1269,117 @@ int sysctl_extfrag_threshold = 500;
1124 * @gfp_mask: The GFP mask of the current allocation 1269 * @gfp_mask: The GFP mask of the current allocation
1125 * @nodemask: The allowed nodes to allocate from 1270 * @nodemask: The allowed nodes to allocate from
1126 * @mode: The migration mode for async, sync light, or sync migration 1271 * @mode: The migration mode for async, sync light, or sync migration
1127 * @contended: Return value that is true if compaction was aborted due to lock contention 1272 * @contended: Return value that determines if compaction was aborted due to
1128 * @page: Optionally capture a free page of the requested order during compaction 1273 * need_resched() or lock contention
1274 * @candidate_zone: Return the zone where we think allocation should succeed
1129 * 1275 *
1130 * This is the main entry point for direct page compaction. 1276 * This is the main entry point for direct page compaction.
1131 */ 1277 */
1132unsigned long try_to_compact_pages(struct zonelist *zonelist, 1278unsigned long try_to_compact_pages(struct zonelist *zonelist,
1133 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1279 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1134 enum migrate_mode mode, bool *contended) 1280 enum migrate_mode mode, int *contended,
1281 struct zone **candidate_zone)
1135{ 1282{
1136 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1283 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1137 int may_enter_fs = gfp_mask & __GFP_FS; 1284 int may_enter_fs = gfp_mask & __GFP_FS;
1138 int may_perform_io = gfp_mask & __GFP_IO; 1285 int may_perform_io = gfp_mask & __GFP_IO;
1139 struct zoneref *z; 1286 struct zoneref *z;
1140 struct zone *zone; 1287 struct zone *zone;
1141 int rc = COMPACT_SKIPPED; 1288 int rc = COMPACT_DEFERRED;
1142 int alloc_flags = 0; 1289 int alloc_flags = 0;
1290 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1291
1292 *contended = COMPACT_CONTENDED_NONE;
1143 1293
1144 /* Check if the GFP flags allow compaction */ 1294 /* Check if the GFP flags allow compaction */
1145 if (!order || !may_enter_fs || !may_perform_io) 1295 if (!order || !may_enter_fs || !may_perform_io)
1146 return rc; 1296 return COMPACT_SKIPPED;
1147
1148 count_compact_event(COMPACTSTALL);
1149 1297
1150#ifdef CONFIG_CMA 1298#ifdef CONFIG_CMA
1151 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 1299 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1152 alloc_flags |= ALLOC_CMA; 1300 alloc_flags |= ALLOC_CMA;
1153#endif 1301#endif
1154 /* Compact each zone in the list */ 1302 /* Compact each zone in the list */
1155 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1303 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1156 nodemask) { 1304 nodemask) {
1157 int status; 1305 int status;
1306 int zone_contended;
1307
1308 if (compaction_deferred(zone, order))
1309 continue;
1158 1310
1159 status = compact_zone_order(zone, order, gfp_mask, mode, 1311 status = compact_zone_order(zone, order, gfp_mask, mode,
1160 contended); 1312 &zone_contended);
1161 rc = max(status, rc); 1313 rc = max(status, rc);
1314 /*
1315 * It takes at least one zone that wasn't lock contended
1316 * to clear all_zones_contended.
1317 */
1318 all_zones_contended &= zone_contended;
1162 1319
1163 /* If a normal allocation would succeed, stop compacting */ 1320 /* If a normal allocation would succeed, stop compacting */
1164 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 1321 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1165 alloc_flags)) 1322 alloc_flags)) {
1166 break; 1323 *candidate_zone = zone;
1324 /*
1325 * We think the allocation will succeed in this zone,
1326 * but it is not certain, hence the false. The caller
1327 * will repeat this with true if allocation indeed
1328 * succeeds in this zone.
1329 */
1330 compaction_defer_reset(zone, order, false);
1331 /*
1332 * It is possible that async compaction aborted due to
1333 * need_resched() and the watermarks were ok thanks to
1334 * somebody else freeing memory. The allocation can
1335 * however still fail so we better signal the
1336 * need_resched() contention anyway (this will not
1337 * prevent the allocation attempt).
1338 */
1339 if (zone_contended == COMPACT_CONTENDED_SCHED)
1340 *contended = COMPACT_CONTENDED_SCHED;
1341
1342 goto break_loop;
1343 }
1344
1345 if (mode != MIGRATE_ASYNC) {
1346 /*
1347 * We think that allocation won't succeed in this zone
1348 * so we defer compaction there. If it ends up
1349 * succeeding after all, it will be reset.
1350 */
1351 defer_compaction(zone, order);
1352 }
1353
1354 /*
1355 * We might have stopped compacting due to need_resched() in
1356 * async compaction, or due to a fatal signal detected. In that
1357 * case do not try further zones and signal need_resched()
1358 * contention.
1359 */
1360 if ((zone_contended == COMPACT_CONTENDED_SCHED)
1361 || fatal_signal_pending(current)) {
1362 *contended = COMPACT_CONTENDED_SCHED;
1363 goto break_loop;
1364 }
1365
1366 continue;
1367break_loop:
1368 /*
1369 * We might not have tried all the zones, so be conservative
1370 * and assume they are not all lock contended.
1371 */
1372 all_zones_contended = 0;
1373 break;
1167 } 1374 }
1168 1375
1376 /*
1377 * If at least one zone wasn't deferred or skipped, we report if all
1378 * zones that were tried were lock contended.
1379 */
1380 if (rc > COMPACT_SKIPPED && all_zones_contended)
1381 *contended = COMPACT_CONTENDED_LOCK;
1382
1169 return rc; 1383 return rc;
1170} 1384}
1171 1385
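The per-zone contention bookkeeping above only works because the COMPACT_CONTENDED_* values behave like flag bits under a bitwise AND: the accumulator starts as COMPACT_CONTENDED_LOCK and keeps that value only if every zone that was tried reported lock contention. A minimal standalone sketch of the accumulation, assuming the values NONE=0, SCHED=1 and LOCK=2 (the real enum is introduced elsewhere in this series; the values here are illustrative only):

#include <stdio.h>

/* Assumed, illustrative values; the real enum compact_contended is not shown
 * in this hunk. The scheme only needs LOCK to have a bit that SCHED and NONE
 * do not share. */
enum {
        COMPACT_CONTENDED_NONE  = 0,
        COMPACT_CONTENDED_SCHED = 1,
        COMPACT_CONTENDED_LOCK  = 2,
};

/* Returns 1 only if every tried zone reported lock contention. */
static int all_zones_lock_contended(const int *zone_contended, int nr_zones)
{
        int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
        int i;

        for (i = 0; i < nr_zones; i++)
                all_zones_contended &= zone_contended[i];

        return all_zones_contended == COMPACT_CONTENDED_LOCK;
}

int main(void)
{
        int all_lock[] = { COMPACT_CONTENDED_LOCK, COMPACT_CONTENDED_LOCK };
        int mixed[]    = { COMPACT_CONTENDED_LOCK, COMPACT_CONTENDED_SCHED };

        /* prints "1 0": any zone that was not lock contended clears the accumulator */
        printf("%d %d\n", all_zones_lock_contended(all_lock, 2),
                          all_zones_lock_contended(mixed, 2));
        return 0;
}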
diff --git a/mm/debug.c b/mm/debug.c
new file mode 100644
index 000000000000..5ce45c9a29b5
--- /dev/null
+++ b/mm/debug.c
@@ -0,0 +1,237 @@
1/*
2 * mm/debug.c
3 *
4 * mm/ specific debug routines.
5 *
6 */
7
8#include <linux/kernel.h>
9#include <linux/mm.h>
10#include <linux/ftrace_event.h>
11#include <linux/memcontrol.h>
12
13static const struct trace_print_flags pageflag_names[] = {
14 {1UL << PG_locked, "locked" },
15 {1UL << PG_error, "error" },
16 {1UL << PG_referenced, "referenced" },
17 {1UL << PG_uptodate, "uptodate" },
18 {1UL << PG_dirty, "dirty" },
19 {1UL << PG_lru, "lru" },
20 {1UL << PG_active, "active" },
21 {1UL << PG_slab, "slab" },
22 {1UL << PG_owner_priv_1, "owner_priv_1" },
23 {1UL << PG_arch_1, "arch_1" },
24 {1UL << PG_reserved, "reserved" },
25 {1UL << PG_private, "private" },
26 {1UL << PG_private_2, "private_2" },
27 {1UL << PG_writeback, "writeback" },
28#ifdef CONFIG_PAGEFLAGS_EXTENDED
29 {1UL << PG_head, "head" },
30 {1UL << PG_tail, "tail" },
31#else
32 {1UL << PG_compound, "compound" },
33#endif
34 {1UL << PG_swapcache, "swapcache" },
35 {1UL << PG_mappedtodisk, "mappedtodisk" },
36 {1UL << PG_reclaim, "reclaim" },
37 {1UL << PG_swapbacked, "swapbacked" },
38 {1UL << PG_unevictable, "unevictable" },
39#ifdef CONFIG_MMU
40 {1UL << PG_mlocked, "mlocked" },
41#endif
42#ifdef CONFIG_ARCH_USES_PG_UNCACHED
43 {1UL << PG_uncached, "uncached" },
44#endif
45#ifdef CONFIG_MEMORY_FAILURE
46 {1UL << PG_hwpoison, "hwpoison" },
47#endif
48#ifdef CONFIG_TRANSPARENT_HUGEPAGE
49 {1UL << PG_compound_lock, "compound_lock" },
50#endif
51};
52
53static void dump_flags(unsigned long flags,
54 const struct trace_print_flags *names, int count)
55{
56 const char *delim = "";
57 unsigned long mask;
58 int i;
59
60 pr_emerg("flags: %#lx(", flags);
61
62 /* remove zone id */
63 flags &= (1UL << NR_PAGEFLAGS) - 1;
64
65 for (i = 0; i < count && flags; i++) {
66
67 mask = names[i].mask;
68 if ((flags & mask) != mask)
69 continue;
70
71 flags &= ~mask;
72 pr_cont("%s%s", delim, names[i].name);
73 delim = "|";
74 }
75
76 /* check for left over flags */
77 if (flags)
78 pr_cont("%s%#lx", delim, flags);
79
80 pr_cont(")\n");
81}
82
83void dump_page_badflags(struct page *page, const char *reason,
84 unsigned long badflags)
85{
86 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
87 page, atomic_read(&page->_count), page_mapcount(page),
88 page->mapping, page->index);
89 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
90 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
91 if (reason)
92 pr_alert("page dumped because: %s\n", reason);
93 if (page->flags & badflags) {
94 pr_alert("bad because of flags:\n");
95 dump_flags(page->flags & badflags,
96 pageflag_names, ARRAY_SIZE(pageflag_names));
97 }
98 mem_cgroup_print_bad_page(page);
99}
100
101void dump_page(struct page *page, const char *reason)
102{
103 dump_page_badflags(page, reason, 0);
104}
105EXPORT_SYMBOL(dump_page);
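dump_page() and dump_page_badflags() are plain diagnostic helpers: pass the page, a reason string, and optionally a mask of flags considered bad. A hedged usage sketch, not taken from this patch (the wrapper name and the check are illustrative only):

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Illustrative: report a page that is unexpectedly locked, highlighting
 * PG_locked as the offending flag in the dump. */
static void my_report_unexpectedly_locked(struct page *page)
{
        if (unlikely(PageLocked(page)))
                dump_page_badflags(page, "expected an unlocked page",
                                   1UL << PG_locked);
}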
106
107#ifdef CONFIG_DEBUG_VM
108
109static const struct trace_print_flags vmaflags_names[] = {
110 {VM_READ, "read" },
111 {VM_WRITE, "write" },
112 {VM_EXEC, "exec" },
113 {VM_SHARED, "shared" },
114 {VM_MAYREAD, "mayread" },
115 {VM_MAYWRITE, "maywrite" },
116 {VM_MAYEXEC, "mayexec" },
117 {VM_MAYSHARE, "mayshare" },
118 {VM_GROWSDOWN, "growsdown" },
119 {VM_PFNMAP, "pfnmap" },
120 {VM_DENYWRITE, "denywrite" },
121 {VM_LOCKED, "locked" },
122 {VM_IO, "io" },
123 {VM_SEQ_READ, "seqread" },
124 {VM_RAND_READ, "randread" },
125 {VM_DONTCOPY, "dontcopy" },
126 {VM_DONTEXPAND, "dontexpand" },
127 {VM_ACCOUNT, "account" },
128 {VM_NORESERVE, "noreserve" },
129 {VM_HUGETLB, "hugetlb" },
130 {VM_NONLINEAR, "nonlinear" },
131#if defined(CONFIG_X86)
132 {VM_PAT, "pat" },
133#elif defined(CONFIG_PPC)
134 {VM_SAO, "sao" },
135#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
136 {VM_GROWSUP, "growsup" },
137#elif !defined(CONFIG_MMU)
138 {VM_MAPPED_COPY, "mappedcopy" },
139#else
140 {VM_ARCH_1, "arch_1" },
141#endif
142 {VM_DONTDUMP, "dontdump" },
143#ifdef CONFIG_MEM_SOFT_DIRTY
144 {VM_SOFTDIRTY, "softdirty" },
145#endif
146 {VM_MIXEDMAP, "mixedmap" },
147 {VM_HUGEPAGE, "hugepage" },
148 {VM_NOHUGEPAGE, "nohugepage" },
149 {VM_MERGEABLE, "mergeable" },
150};
151
152void dump_vma(const struct vm_area_struct *vma)
153{
154 pr_emerg("vma %p start %p end %p\n"
155 "next %p prev %p mm %p\n"
156 "prot %lx anon_vma %p vm_ops %p\n"
157 "pgoff %lx file %p private_data %p\n",
158 vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
159 vma->vm_prev, vma->vm_mm,
160 (unsigned long)pgprot_val(vma->vm_page_prot),
161 vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
162 vma->vm_file, vma->vm_private_data);
163 dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
164}
165EXPORT_SYMBOL(dump_vma);
166
167void dump_mm(const struct mm_struct *mm)
168{
169 pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
170#ifdef CONFIG_MMU
171 "get_unmapped_area %p\n"
172#endif
173 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
174 "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
175 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
176 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
177 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
178 "start_brk %lx brk %lx start_stack %lx\n"
179 "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
180 "binfmt %p flags %lx core_state %p\n"
181#ifdef CONFIG_AIO
182 "ioctx_table %p\n"
183#endif
184#ifdef CONFIG_MEMCG
185 "owner %p "
186#endif
187 "exe_file %p\n"
188#ifdef CONFIG_MMU_NOTIFIER
189 "mmu_notifier_mm %p\n"
190#endif
191#ifdef CONFIG_NUMA_BALANCING
192 "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
193#endif
194#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
195 "tlb_flush_pending %d\n"
196#endif
197 "%s", /* This is here to hold the comma */
198
199 mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
200#ifdef CONFIG_MMU
201 mm->get_unmapped_area,
202#endif
203 mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
204 mm->pgd, atomic_read(&mm->mm_users),
205 atomic_read(&mm->mm_count),
206 atomic_long_read((atomic_long_t *)&mm->nr_ptes),
207 mm->map_count,
208 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
209 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
210 mm->start_code, mm->end_code, mm->start_data, mm->end_data,
211 mm->start_brk, mm->brk, mm->start_stack,
212 mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
213 mm->binfmt, mm->flags, mm->core_state,
214#ifdef CONFIG_AIO
215 mm->ioctx_table,
216#endif
217#ifdef CONFIG_MEMCG
218 mm->owner,
219#endif
220 mm->exe_file,
221#ifdef CONFIG_MMU_NOTIFIER
222 mm->mmu_notifier_mm,
223#endif
224#ifdef CONFIG_NUMA_BALANCING
225 mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
226#endif
227#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
228 mm->tlb_flush_pending,
229#endif
230 "" /* This is here to not have a comma! */
231 );
232
233 dump_flags(mm->def_flags, vmaflags_names,
234 ARRAY_SIZE(vmaflags_names));
235}
236
237#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 306baa594f95..fd5fe4342e93 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
62}; 62};
63 63
64static DEFINE_MUTEX(pools_lock); 64static DEFINE_MUTEX(pools_lock);
65static DEFINE_MUTEX(pools_reg_lock);
65 66
66static ssize_t 67static ssize_t
67show_pools(struct device *dev, struct device_attribute *attr, char *buf) 68show_pools(struct device *dev, struct device_attribute *attr, char *buf)
@@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
132{ 133{
133 struct dma_pool *retval; 134 struct dma_pool *retval;
134 size_t allocation; 135 size_t allocation;
136 bool empty = false;
135 137
136 if (align == 0) { 138 if (align == 0)
137 align = 1; 139 align = 1;
138 } else if (align & (align - 1)) { 140 else if (align & (align - 1))
139 return NULL; 141 return NULL;
140 }
141 142
142 if (size == 0) { 143 if (size == 0)
143 return NULL; 144 return NULL;
144 } else if (size < 4) { 145 else if (size < 4)
145 size = 4; 146 size = 4;
146 }
147 147
148 if ((size % align) != 0) 148 if ((size % align) != 0)
149 size = ALIGN(size, align); 149 size = ALIGN(size, align);
150 150
151 allocation = max_t(size_t, size, PAGE_SIZE); 151 allocation = max_t(size_t, size, PAGE_SIZE);
152 152
153 if (!boundary) { 153 if (!boundary)
154 boundary = allocation; 154 boundary = allocation;
155 } else if ((boundary < size) || (boundary & (boundary - 1))) { 155 else if ((boundary < size) || (boundary & (boundary - 1)))
156 return NULL; 156 return NULL;
157 }
158 157
159 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); 158 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
160 if (!retval) 159 if (!retval)
@@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
172 171
173 INIT_LIST_HEAD(&retval->pools); 172 INIT_LIST_HEAD(&retval->pools);
174 173
174 /*
175 * pools_lock ensures that the ->dma_pools list does not get corrupted.
176 * pools_reg_lock ensures that there is not a race between
177 * dma_pool_create() and dma_pool_destroy() or within dma_pool_create()
178 * when the first invocation of dma_pool_create() failed on
179 * device_create_file() and the second assumes that it has been done (I
180 * know it is a short window).
181 */
182 mutex_lock(&pools_reg_lock);
175 mutex_lock(&pools_lock); 183 mutex_lock(&pools_lock);
176 if (list_empty(&dev->dma_pools) && 184 if (list_empty(&dev->dma_pools))
177 device_create_file(dev, &dev_attr_pools)) { 185 empty = true;
178 kfree(retval); 186 list_add(&retval->pools, &dev->dma_pools);
179 return NULL;
180 } else
181 list_add(&retval->pools, &dev->dma_pools);
182 mutex_unlock(&pools_lock); 187 mutex_unlock(&pools_lock);
183 188 if (empty) {
189 int err;
190
191 err = device_create_file(dev, &dev_attr_pools);
192 if (err) {
193 mutex_lock(&pools_lock);
194 list_del(&retval->pools);
195 mutex_unlock(&pools_lock);
196 mutex_unlock(&pools_reg_lock);
197 kfree(retval);
198 return NULL;
199 }
200 }
201 mutex_unlock(&pools_reg_lock);
184 return retval; 202 return retval;
185} 203}
186EXPORT_SYMBOL(dma_pool_create); 204EXPORT_SYMBOL(dma_pool_create);
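For reference, the create/destroy pair that the new locking protects is used by drivers roughly as follows; a hedged sketch with illustrative names and sizes, not code from this patch:

#include <linux/device.h>
#include <linux/dmapool.h>

/* Illustrative: a pool of 64-byte, 64-byte-aligned DMA descriptors with no
 * boundary-crossing restriction (boundary == 0). */
static struct dma_pool *my_desc_pool_create(struct device *dev)
{
        return dma_pool_create("my_descs", dev, 64, 64, 0);
}

static void my_desc_pool_destroy(struct dma_pool *pool)
{
        if (pool)
                dma_pool_destroy(pool);
}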
@@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
251 */ 269 */
252void dma_pool_destroy(struct dma_pool *pool) 270void dma_pool_destroy(struct dma_pool *pool)
253{ 271{
272 bool empty = false;
273
274 mutex_lock(&pools_reg_lock);
254 mutex_lock(&pools_lock); 275 mutex_lock(&pools_lock);
255 list_del(&pool->pools); 276 list_del(&pool->pools);
256 if (pool->dev && list_empty(&pool->dev->dma_pools)) 277 if (pool->dev && list_empty(&pool->dev->dma_pools))
257 device_remove_file(pool->dev, &dev_attr_pools); 278 empty = true;
258 mutex_unlock(&pools_lock); 279 mutex_unlock(&pools_lock);
280 if (empty)
281 device_remove_file(pool->dev, &dev_attr_pools);
282 mutex_unlock(&pools_reg_lock);
259 283
260 while (!list_empty(&pool->page_list)) { 284 while (!list_empty(&pool->page_list)) {
261 struct dma_page *page; 285 struct dma_page *page;
diff --git a/mm/filemap.c b/mm/filemap.c
index 90effcdf948d..14b4642279f1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc);
670 * at a cost of "thundering herd" phenomena during rare hash 670 * at a cost of "thundering herd" phenomena during rare hash
671 * collisions. 671 * collisions.
672 */ 672 */
673static wait_queue_head_t *page_waitqueue(struct page *page) 673wait_queue_head_t *page_waitqueue(struct page *page)
674{ 674{
675 const struct zone *zone = page_zone(page); 675 const struct zone *zone = page_zone(page);
676 676
677 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; 677 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
678} 678}
679 679EXPORT_SYMBOL(page_waitqueue);
680static inline void wake_up_page(struct page *page, int bit)
681{
682 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
683}
684 680
685void wait_on_page_bit(struct page *page, int bit_nr) 681void wait_on_page_bit(struct page *page, int bit_nr)
686{ 682{
@@ -703,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
703 bit_wait_io, TASK_KILLABLE); 699 bit_wait_io, TASK_KILLABLE);
704} 700}
705 701
702int wait_on_page_bit_killable_timeout(struct page *page,
703 int bit_nr, unsigned long timeout)
704{
705 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
706
707 wait.key.timeout = jiffies + timeout;
708 if (!test_bit(bit_nr, &page->flags))
709 return 0;
710 return __wait_on_bit(page_waitqueue(page), &wait,
711 bit_wait_io_timeout, TASK_KILLABLE);
712}
713EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
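Callers use the new timeout variant like wait_on_page_bit_killable(), with an upper bound in jiffies; a nonzero return means the wait ended early, whether because of a fatal signal or the timeout. A hedged usage sketch (illustrative caller, not from this patch):

#include <linux/pagemap.h>
#include <linux/jiffies.h>

/* Illustrative: give writeback at most five seconds to clear before the
 * caller falls back to some other strategy. */
static int my_wait_for_writeback(struct page *page)
{
        if (!PageWriteback(page))
                return 0;
        return wait_on_page_bit_killable_timeout(page, PG_writeback, 5 * HZ);
}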
714
706/** 715/**
707 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 716 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
708 * @page: Page defining the wait queue of interest 717 * @page: Page defining the wait queue of interest
@@ -727,7 +736,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
727 * 736 *
728 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 737 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
729 * Also wakes sleepers in wait_on_page_writeback() because the wakeup 738 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
730 * mechananism between PageLocked pages and PageWriteback pages is shared. 739 * mechanism between PageLocked pages and PageWriteback pages is shared.
731 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 740 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
732 * 741 *
733 * The mb is necessary to enforce ordering between the clear_bit and the read 742 * The mb is necessary to enforce ordering between the clear_bit and the read
@@ -1744,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
1744static int page_cache_read(struct file *file, pgoff_t offset) 1753static int page_cache_read(struct file *file, pgoff_t offset)
1745{ 1754{
1746 struct address_space *mapping = file->f_mapping; 1755 struct address_space *mapping = file->f_mapping;
1747 struct page *page; 1756 struct page *page;
1748 int ret; 1757 int ret;
1749 1758
1750 do { 1759 do {
@@ -1761,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1761 page_cache_release(page); 1770 page_cache_release(page);
1762 1771
1763 } while (ret == AOP_TRUNCATED_PAGE); 1772 } while (ret == AOP_TRUNCATED_PAGE);
1764 1773
1765 return ret; 1774 return ret;
1766} 1775}
1767 1776
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b1600d..cd62c8c90d4a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,10 @@
10#include <linux/swap.h> 10#include <linux/swap.h>
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12 12
13#include <linux/sched.h>
14#include <linux/rwsem.h>
15#include <asm/pgtable.h>
16
13#include "internal.h" 17#include "internal.h"
14 18
15static struct page *no_page_table(struct vm_area_struct *vma, 19static struct page *no_page_table(struct vm_area_struct *vma,
@@ -281,6 +285,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
281 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 285 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
282 if (*flags & FOLL_NOWAIT) 286 if (*flags & FOLL_NOWAIT)
283 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; 287 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
288 if (*flags & FOLL_TRIED) {
289 VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
290 fault_flags |= FAULT_FLAG_TRIED;
291 }
284 292
285 ret = handle_mm_fault(mm, vma, address, fault_flags); 293 ret = handle_mm_fault(mm, vma, address, fault_flags);
286 if (ret & VM_FAULT_ERROR) { 294 if (ret & VM_FAULT_ERROR) {
@@ -672,3 +680,353 @@ struct page *get_dump_page(unsigned long addr)
672 return page; 680 return page;
673} 681}
674#endif /* CONFIG_ELF_CORE */ 682#endif /* CONFIG_ELF_CORE */
683
684/*
685 * Generic RCU Fast GUP
686 *
687 * get_user_pages_fast attempts to pin user pages by walking the page
688 * tables directly and avoids taking locks. Thus the walker needs to be
689 * protected from page table pages being freed from under it, and should
690 * block any THP splits.
691 *
692 * One way to achieve this is to have the walker disable interrupts, and
693 * rely on IPIs from the TLB flushing code blocking before the page table
694 * pages are freed. This is unsuitable for architectures that do not need
695 * to broadcast an IPI when invalidating TLBs.
696 *
697 * Another way to achieve this is to batch up page table containing pages
698 * belonging to more than one mm_user, then rcu_sched a callback to free those
699 * pages. Disabling interrupts will allow the fast_gup walker to both block
700 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
701 * (which is a relatively rare event). The code below adopts this strategy.
702 *
703 * Before activating this code, please be aware that the following assumptions
704 * are currently made:
705 *
706 * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
707 * pages containing page tables.
708 *
709 * *) THP splits will broadcast an IPI, this can be achieved by overriding
710 * pmdp_splitting_flush.
711 *
712 * *) ptes can be read atomically by the architecture.
713 *
714 * *) access_ok is sufficient to validate userspace address ranges.
715 *
716 * The last two assumptions can be relaxed by the addition of helper functions.
717 *
718 * This code is based heavily on the PowerPC implementation by Nick Piggin.
719 */
720#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
721
722#ifdef __HAVE_ARCH_PTE_SPECIAL
723static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
724 int write, struct page **pages, int *nr)
725{
726 pte_t *ptep, *ptem;
727 int ret = 0;
728
729 ptem = ptep = pte_offset_map(&pmd, addr);
730 do {
731 /*
732 * In the line below we are assuming that the pte can be read
733 * atomically. If this is not the case for your architecture,
734 * please wrap this in a helper function!
735 *
736 * for an example see gup_get_pte in arch/x86/mm/gup.c
737 */
738 pte_t pte = ACCESS_ONCE(*ptep);
739 struct page *page;
740
741 /*
742 * Similar to the PMD case below, NUMA hinting must take slow
743 * path
744 */
745 if (!pte_present(pte) || pte_special(pte) ||
746 pte_numa(pte) || (write && !pte_write(pte)))
747 goto pte_unmap;
748
749 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
750 page = pte_page(pte);
751
752 if (!page_cache_get_speculative(page))
753 goto pte_unmap;
754
755 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
756 put_page(page);
757 goto pte_unmap;
758 }
759
760 pages[*nr] = page;
761 (*nr)++;
762
763 } while (ptep++, addr += PAGE_SIZE, addr != end);
764
765 ret = 1;
766
767pte_unmap:
768 pte_unmap(ptem);
769 return ret;
770}
771#else
772
773/*
774 * If we can't determine whether or not a pte is special, then fail immediately
775 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
776 * to be special.
777 *
778 * For a futex to be placed on a THP tail page, get_futex_key requires a
779 * __get_user_pages_fast implementation that can pin pages. Thus it's still
780 * useful to have gup_huge_pmd even if we can't operate on ptes.
781 */
782static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
783 int write, struct page **pages, int *nr)
784{
785 return 0;
786}
787#endif /* __HAVE_ARCH_PTE_SPECIAL */
788
789static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
790 unsigned long end, int write, struct page **pages, int *nr)
791{
792 struct page *head, *page, *tail;
793 int refs;
794
795 if (write && !pmd_write(orig))
796 return 0;
797
798 refs = 0;
799 head = pmd_page(orig);
800 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
801 tail = page;
802 do {
803 VM_BUG_ON_PAGE(compound_head(page) != head, page);
804 pages[*nr] = page;
805 (*nr)++;
806 page++;
807 refs++;
808 } while (addr += PAGE_SIZE, addr != end);
809
810 if (!page_cache_add_speculative(head, refs)) {
811 *nr -= refs;
812 return 0;
813 }
814
815 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
816 *nr -= refs;
817 while (refs--)
818 put_page(head);
819 return 0;
820 }
821
822 /*
823 * Any tail pages need their mapcount reference taken before we
824 * return. (This allows the THP code to bump their ref count when
825 * they are split into base pages).
826 */
827 while (refs--) {
828 if (PageTail(tail))
829 get_huge_page_tail(tail);
830 tail++;
831 }
832
833 return 1;
834}
835
836static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
837 unsigned long end, int write, struct page **pages, int *nr)
838{
839 struct page *head, *page, *tail;
840 int refs;
841
842 if (write && !pud_write(orig))
843 return 0;
844
845 refs = 0;
846 head = pud_page(orig);
847 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
848 tail = page;
849 do {
850 VM_BUG_ON_PAGE(compound_head(page) != head, page);
851 pages[*nr] = page;
852 (*nr)++;
853 page++;
854 refs++;
855 } while (addr += PAGE_SIZE, addr != end);
856
857 if (!page_cache_add_speculative(head, refs)) {
858 *nr -= refs;
859 return 0;
860 }
861
862 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
863 *nr -= refs;
864 while (refs--)
865 put_page(head);
866 return 0;
867 }
868
869 while (refs--) {
870 if (PageTail(tail))
871 get_huge_page_tail(tail);
872 tail++;
873 }
874
875 return 1;
876}
877
878static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
879 int write, struct page **pages, int *nr)
880{
881 unsigned long next;
882 pmd_t *pmdp;
883
884 pmdp = pmd_offset(&pud, addr);
885 do {
886 pmd_t pmd = ACCESS_ONCE(*pmdp);
887
888 next = pmd_addr_end(addr, end);
889 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
890 return 0;
891
892 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
893 /*
894 * NUMA hinting faults need to be handled in the GUP
895 * slowpath for accounting purposes and so that they
896 * can be serialised against THP migration.
897 */
898 if (pmd_numa(pmd))
899 return 0;
900
901 if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
902 pages, nr))
903 return 0;
904
905 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
906 return 0;
907 } while (pmdp++, addr = next, addr != end);
908
909 return 1;
910}
911
912static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
913 int write, struct page **pages, int *nr)
914{
915 unsigned long next;
916 pud_t *pudp;
917
918 pudp = pud_offset(pgdp, addr);
919 do {
920 pud_t pud = ACCESS_ONCE(*pudp);
921
922 next = pud_addr_end(addr, end);
923 if (pud_none(pud))
924 return 0;
925 if (pud_huge(pud)) {
926 if (!gup_huge_pud(pud, pudp, addr, next, write,
927 pages, nr))
928 return 0;
929 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
930 return 0;
931 } while (pudp++, addr = next, addr != end);
932
933 return 1;
934}
935
936/*
937 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
938 * the regular GUP. It will only return non-negative values.
939 */
940int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
941 struct page **pages)
942{
943 struct mm_struct *mm = current->mm;
944 unsigned long addr, len, end;
945 unsigned long next, flags;
946 pgd_t *pgdp;
947 int nr = 0;
948
949 start &= PAGE_MASK;
950 addr = start;
951 len = (unsigned long) nr_pages << PAGE_SHIFT;
952 end = start + len;
953
954 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
955 start, len)))
956 return 0;
957
958 /*
959 * Disable interrupts. We use the nested form as we can already have
960 * interrupts disabled by get_futex_key.
961 *
962 * With interrupts disabled, we block page table pages from being
963 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
964 * for more details.
965 *
966 * We do not adopt an rcu_read_lock(.) here as we also want to
967 * block IPIs that come from THPs splitting.
968 */
969
970 local_irq_save(flags);
971 pgdp = pgd_offset(mm, addr);
972 do {
973 next = pgd_addr_end(addr, end);
974 if (pgd_none(*pgdp))
975 break;
976 else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
977 break;
978 } while (pgdp++, addr = next, addr != end);
979 local_irq_restore(flags);
980
981 return nr;
982}
983
984/**
985 * get_user_pages_fast() - pin user pages in memory
986 * @start: starting user address
987 * @nr_pages: number of pages from start to pin
988 * @write: whether pages will be written to
989 * @pages: array that receives pointers to the pages pinned.
990 * Should be at least nr_pages long.
991 *
992 * Attempt to pin user pages in memory without taking mm->mmap_sem.
993 * If not successful, it will fall back to taking the lock and
994 * calling get_user_pages().
995 *
996 * Returns number of pages pinned. This may be fewer than the number
997 * requested. If nr_pages is 0 or negative, returns 0. If no pages
998 * were pinned, returns -errno.
999 */
1000int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1001 struct page **pages)
1002{
1003 struct mm_struct *mm = current->mm;
1004 int nr, ret;
1005
1006 start &= PAGE_MASK;
1007 nr = __get_user_pages_fast(start, nr_pages, write, pages);
1008 ret = nr;
1009
1010 if (nr < nr_pages) {
1011 /* Try to get the remaining pages with get_user_pages */
1012 start += nr << PAGE_SHIFT;
1013 pages += nr;
1014
1015 down_read(&mm->mmap_sem);
1016 ret = get_user_pages(current, mm, start,
1017 nr_pages - nr, write, 0, pages, NULL);
1018 up_read(&mm->mmap_sem);
1019
1020 /* Have to be a bit careful with return values */
1021 if (nr > 0) {
1022 if (ret < 0)
1023 ret = nr;
1024 else
1025 ret += nr;
1026 }
1027 }
1028
1029 return ret;
1030}
1031
1032#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
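The generic implementation keeps the established calling convention: get_user_pages_fast() may pin fewer pages than requested and returns -errno only when nothing was pinned, falling back to the slow path internally. A hedged usage sketch (illustrative driver-style helper, not part of this patch):

#include <linux/mm.h>
#include <linux/errno.h>

/* Illustrative: pin nr_pages of a user buffer for writing; on partial
 * success, release what was pinned and report failure to keep the caller
 * simple. */
static int my_pin_user_buffer(unsigned long uaddr, int nr_pages,
                              struct page **pages)
{
        int pinned = get_user_pages_fast(uaddr, nr_pages, 1, pages);
        int i;

        if (pinned == nr_pages)
                return 0;

        for (i = 0; i < pinned; i++)
                put_page(pages[i]);

        return pinned < 0 ? pinned : -EFAULT;
}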
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06b862..74c78aa8bc2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1096 unsigned long mmun_end; /* For mmu_notifiers */ 1096 unsigned long mmun_end; /* For mmu_notifiers */
1097 1097
1098 ptl = pmd_lockptr(mm, pmd); 1098 ptl = pmd_lockptr(mm, pmd);
1099 VM_BUG_ON(!vma->anon_vma); 1099 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1100 haddr = address & HPAGE_PMD_MASK; 1100 haddr = address & HPAGE_PMD_MASK;
1101 if (is_huge_zero_pmd(orig_pmd)) 1101 if (is_huge_zero_pmd(orig_pmd))
1102 goto alloc; 1102 goto alloc;
@@ -1795,14 +1795,17 @@ static int __split_huge_page_map(struct page *page,
1795 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1795 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1796 pte_t *pte, entry; 1796 pte_t *pte, entry;
1797 BUG_ON(PageCompound(page+i)); 1797 BUG_ON(PageCompound(page+i));
1798 /*
1799 * Note that pmd_numa is not transferred deliberately
1800 * to avoid any possibility that pte_numa leaks to
1801 * a PROT_NONE VMA by accident.
1802 */
1798 entry = mk_pte(page + i, vma->vm_page_prot); 1803 entry = mk_pte(page + i, vma->vm_page_prot);
1799 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1804 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1800 if (!pmd_write(*pmd)) 1805 if (!pmd_write(*pmd))
1801 entry = pte_wrprotect(entry); 1806 entry = pte_wrprotect(entry);
1802 if (!pmd_young(*pmd)) 1807 if (!pmd_young(*pmd))
1803 entry = pte_mkold(entry); 1808 entry = pte_mkold(entry);
1804 if (pmd_numa(*pmd))
1805 entry = pte_mknuma(entry);
1806 pte = pte_offset_map(&_pmd, haddr); 1809 pte = pte_offset_map(&_pmd, haddr);
1807 BUG_ON(!pte_none(*pte)); 1810 BUG_ON(!pte_none(*pte));
1808 set_pte_at(mm, haddr, pte, entry); 1811 set_pte_at(mm, haddr, pte, entry);
@@ -2045,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm)
2045 return -ENOMEM; 2048 return -ENOMEM;
2046 2049
2047 /* __khugepaged_exit() must not run from under us */ 2050 /* __khugepaged_exit() must not run from under us */
2048 VM_BUG_ON(khugepaged_test_exit(mm)); 2051 VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
2049 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 2052 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
2050 free_mm_slot(mm_slot); 2053 free_mm_slot(mm_slot);
2051 return 0; 2054 return 0;
@@ -2080,7 +2083,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
2080 if (vma->vm_ops) 2083 if (vma->vm_ops)
2081 /* khugepaged not yet working on file or special mappings */ 2084 /* khugepaged not yet working on file or special mappings */
2082 return 0; 2085 return 0;
2083 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2086 VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
2084 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2087 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2085 hend = vma->vm_end & HPAGE_PMD_MASK; 2088 hend = vma->vm_end & HPAGE_PMD_MASK;
2086 if (hstart < hend) 2089 if (hstart < hend)
@@ -2319,23 +2322,17 @@ static struct page
2319 int node) 2322 int node)
2320{ 2323{
2321 VM_BUG_ON_PAGE(*hpage, *hpage); 2324 VM_BUG_ON_PAGE(*hpage, *hpage);
2325
2322 /* 2326 /*
2323 * Allocate the page while the vma is still valid and under 2327 * Before allocating the hugepage, release the mmap_sem read lock.
2324 * the mmap_sem read mode so there is no memory allocation 2328 * The allocation can take potentially a long time if it involves
2325 * later when we take the mmap_sem in write mode. This is more 2329 * sync compaction, and we do not need to hold the mmap_sem during
2326 * friendly behavior (OTOH it may actually hide bugs) to 2330 * that. We will recheck the vma after taking it again in write mode.
2327 * filesystems in userland with daemons allocating memory in
2328 * the userland I/O paths. Allocating memory with the
2329 * mmap_sem in read mode is good idea also to allow greater
2330 * scalability.
2331 */ 2331 */
2332 up_read(&mm->mmap_sem);
2333
2332 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( 2334 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
2333 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); 2335 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2334 /*
2335 * After allocating the hugepage, release the mmap_sem read lock in
2336 * preparation for taking it in write mode.
2337 */
2338 up_read(&mm->mmap_sem);
2339 if (unlikely(!*hpage)) { 2336 if (unlikely(!*hpage)) {
2340 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2337 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2341 *hpage = ERR_PTR(-ENOMEM); 2338 *hpage = ERR_PTR(-ENOMEM);
@@ -2409,7 +2406,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
2409 return false; 2406 return false;
2410 if (is_vma_temporary_stack(vma)) 2407 if (is_vma_temporary_stack(vma))
2411 return false; 2408 return false;
2412 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2409 VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
2413 return true; 2410 return true;
2414} 2411}
2415 2412
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eeceeeb09019..9fd722769927 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode)
434 434
435static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 435static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
436{ 436{
437 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 437 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
438 if (vma->vm_flags & VM_MAYSHARE) { 438 if (vma->vm_flags & VM_MAYSHARE) {
439 struct address_space *mapping = vma->vm_file->f_mapping; 439 struct address_space *mapping = vma->vm_file->f_mapping;
440 struct inode *inode = mapping->host; 440 struct inode *inode = mapping->host;
@@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
449 449
450static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 450static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
451{ 451{
452 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 452 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
453 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 453 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
454 454
455 set_vma_private_data(vma, (get_vma_private_data(vma) & 455 set_vma_private_data(vma, (get_vma_private_data(vma) &
456 HPAGE_RESV_MASK) | (unsigned long)map); 456 HPAGE_RESV_MASK) | (unsigned long)map);
@@ -458,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
458 458
459static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 459static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
460{ 460{
461 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 461 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
462 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 462 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
463 463
464 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 464 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
465} 465}
466 466
467static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 467static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
468{ 468{
469 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 469 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
470 470
471 return (get_vma_private_data(vma) & flag) != 0; 471 return (get_vma_private_data(vma) & flag) != 0;
472} 472}
@@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
474/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 474/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
475void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 475void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
476{ 476{
477 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 477 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
478 if (!(vma->vm_flags & VM_MAYSHARE)) 478 if (!(vma->vm_flags & VM_MAYSHARE))
479 vma->vm_private_data = (void *)0; 479 vma->vm_private_data = (void *)0;
480} 480}
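The VM_BUG_ON_VMA()/VM_BUG_ON_MM() conversions above are what the new mm/debug.c helpers are for: when such an assertion fires on a CONFIG_DEBUG_VM kernel, the whole vma or mm is dumped rather than just the failed condition. A hedged usage sketch (illustrative check, not from this patch):

#include <linux/mmdebug.h>
#include <linux/hugetlb.h>

/* Illustrative: assert that a helper really got a hugetlb vma; on failure
 * a debug kernel now prints the full vma via dump_vma() before BUG(). */
static void my_check_hugetlb_vma(struct vm_area_struct *vma)
{
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
}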
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9eebfadeeee1..a67c26e0f360 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -217,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
217 217
218 if (hugetlb_cgroup_disabled()) 218 if (hugetlb_cgroup_disabled())
219 return; 219 return;
220 VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); 220 lockdep_assert_held(&hugetlb_lock);
221 h_cg = hugetlb_cgroup_from_page(page); 221 h_cg = hugetlb_cgroup_from_page(page);
222 if (unlikely(!h_cg)) 222 if (unlikely(!h_cg))
223 return; 223 return;
diff --git a/mm/internal.h b/mm/internal.h
index a1b651b11c5f..829304090b90 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -142,10 +142,10 @@ struct compact_control {
142 bool finished_update_migrate; 142 bool finished_update_migrate;
143 143
144 int order; /* order a direct compactor needs */ 144 int order; /* order a direct compactor needs */
145 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 145 const gfp_t gfp_mask; /* gfp mask of a direct compactor */
146 struct zone *zone; 146 struct zone *zone;
 147 bool contended; /* True if a lock was contended, or 147 int contended; /* Signal need_resched() or lock
 147 bool contended; /* True if a lock was contended, or 147 int contended; /* Signal need_resched() or lock
148 * need_resched() true during async 148 * contention detected during
149 * compaction 149 * compaction
150 */ 150 */
151}; 151};
@@ -154,8 +154,8 @@ unsigned long
154isolate_freepages_range(struct compact_control *cc, 154isolate_freepages_range(struct compact_control *cc,
155 unsigned long start_pfn, unsigned long end_pfn); 155 unsigned long start_pfn, unsigned long end_pfn);
156unsigned long 156unsigned long
157isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 157isolate_migratepages_range(struct compact_control *cc,
158 unsigned long low_pfn, unsigned long end_pfn, bool unevictable); 158 unsigned long low_pfn, unsigned long end_pfn);
159 159
160#endif 160#endif
161 161
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
164 * general, page_zone(page)->lock must be held by the caller to prevent the 164 * general, page_zone(page)->lock must be held by the caller to prevent the
165 * page from being allocated in parallel and returning garbage as the order. 165 * page from being allocated in parallel and returning garbage as the order.
166 * If a caller does not hold page_zone(page)->lock, it must guarantee that the 166 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
167 * page cannot be allocated or merged in parallel. 167 * page cannot be allocated or merged in parallel. Alternatively, it must
168 * handle invalid values gracefully, and use page_order_unsafe() below.
168 */ 169 */
169static inline unsigned long page_order(struct page *page) 170static inline unsigned long page_order(struct page *page)
170{ 171{
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page)
172 return page_private(page); 173 return page_private(page);
173} 174}
174 175
176/*
177 * Like page_order(), but for callers who cannot afford to hold the zone lock.
178 * PageBuddy() should be checked first by the caller to minimize race window,
179 * and invalid values must be handled gracefully.
180 *
181 * ACCESS_ONCE is used so that if the caller assigns the result into a local
182 * variable and e.g. tests it for valid range before using, the compiler cannot
183 * decide to remove the variable and inline the page_private(page) multiple
184 * times, potentially observing different values in the tests and the actual
185 * use of the result.
186 */
187#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
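The intended calling pattern is: check PageBuddy() first, read the order once into a local variable, range-check it, and only then act on it; a racy value is tolerated rather than trusted. A hedged sketch of such a caller (the helper name is illustrative and assumes it lives in an mm/ file that includes internal.h):

#include <linux/mm.h>
#include "internal.h"

/* Illustrative: while scanning pfns without the zone lock, return how many
 * extra pfns can be skipped because this page is a free buddy page. A stale
 * or implausible order is simply ignored. */
static unsigned long my_free_buddy_skip(struct page *page)
{
        if (PageBuddy(page)) {
                unsigned long freepage_order = page_order_unsafe(page);

                if (freepage_order > 0 && freepage_order < MAX_ORDER)
                        return (1UL << freepage_order) - 1;
        }
        return 0;
}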
188
175static inline bool is_cow_mapping(vm_flags_t flags) 189static inline bool is_cow_mapping(vm_flags_t flags)
176{ 190{
177 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 191 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 4a5822a586e6..8da581fa9060 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
34 struct vm_area_struct *parent; 34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node); 35 unsigned long last = vma_last_pgoff(node);
36 36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); 37 VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
38 38
39 if (!prev->shared.linear.rb.rb_right) { 39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev; 40 parent = prev;
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index ab88dc0ea1d3..9a09f2034fcc 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
310EXPORT_SYMBOL(iov_iter_init); 310EXPORT_SYMBOL(iov_iter_init);
311 311
312static ssize_t get_pages_iovec(struct iov_iter *i, 312static ssize_t get_pages_iovec(struct iov_iter *i,
313 struct page **pages, unsigned maxpages, 313 struct page **pages, size_t maxsize, unsigned maxpages,
314 size_t *start) 314 size_t *start)
315{ 315{
316 size_t offset = i->iov_offset; 316 size_t offset = i->iov_offset;
@@ -323,6 +323,8 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
323 len = iov->iov_len - offset; 323 len = iov->iov_len - offset;
324 if (len > i->count) 324 if (len > i->count)
325 len = i->count; 325 len = i->count;
326 if (len > maxsize)
327 len = maxsize;
326 addr = (unsigned long)iov->iov_base + offset; 328 addr = (unsigned long)iov->iov_base + offset;
327 len += *start = addr & (PAGE_SIZE - 1); 329 len += *start = addr & (PAGE_SIZE - 1);
328 if (len > maxpages * PAGE_SIZE) 330 if (len > maxpages * PAGE_SIZE)
@@ -588,13 +590,15 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
588} 590}
589 591
590static ssize_t get_pages_bvec(struct iov_iter *i, 592static ssize_t get_pages_bvec(struct iov_iter *i,
591 struct page **pages, unsigned maxpages, 593 struct page **pages, size_t maxsize, unsigned maxpages,
592 size_t *start) 594 size_t *start)
593{ 595{
594 const struct bio_vec *bvec = i->bvec; 596 const struct bio_vec *bvec = i->bvec;
595 size_t len = bvec->bv_len - i->iov_offset; 597 size_t len = bvec->bv_len - i->iov_offset;
596 if (len > i->count) 598 if (len > i->count)
597 len = i->count; 599 len = i->count;
600 if (len > maxsize)
601 len = maxsize;
598 /* can't be more than PAGE_SIZE */ 602 /* can't be more than PAGE_SIZE */
599 *start = bvec->bv_offset + i->iov_offset; 603 *start = bvec->bv_offset + i->iov_offset;
600 604
@@ -711,13 +715,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
711EXPORT_SYMBOL(iov_iter_alignment); 715EXPORT_SYMBOL(iov_iter_alignment);
712 716
713ssize_t iov_iter_get_pages(struct iov_iter *i, 717ssize_t iov_iter_get_pages(struct iov_iter *i,
714 struct page **pages, unsigned maxpages, 718 struct page **pages, size_t maxsize, unsigned maxpages,
715 size_t *start) 719 size_t *start)
716{ 720{
717 if (i->type & ITER_BVEC) 721 if (i->type & ITER_BVEC)
718 return get_pages_bvec(i, pages, maxpages, start); 722 return get_pages_bvec(i, pages, maxsize, maxpages, start);
719 else 723 else
720 return get_pages_iovec(i, pages, maxpages, start); 724 return get_pages_iovec(i, pages, maxsize, maxpages, start);
721} 725}
722EXPORT_SYMBOL(iov_iter_get_pages); 726EXPORT_SYMBOL(iov_iter_get_pages);
723 727
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index fd814fd61319..cab58bb592d8 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -2,6 +2,7 @@
2#include <linux/mm_types.h> 2#include <linux/mm_types.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include "slab.h"
5#include <linux/kmemcheck.h> 6#include <linux/kmemcheck.h>
6 7
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) 8void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
diff --git a/mm/ksm.c b/mm/ksm.c
index fb7590222706..6b2e337bc03c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2310,7 +2310,7 @@ static int __init ksm_init(void)
2310 2310
2311 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 2311 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
2312 if (IS_ERR(ksm_thread)) { 2312 if (IS_ERR(ksm_thread)) {
2313 printk(KERN_ERR "ksm: creating kthread failed\n"); 2313 pr_err("ksm: creating kthread failed\n");
2314 err = PTR_ERR(ksm_thread); 2314 err = PTR_ERR(ksm_thread);
2315 goto out_free; 2315 goto out_free;
2316 } 2316 }
@@ -2318,7 +2318,7 @@ static int __init ksm_init(void)
2318#ifdef CONFIG_SYSFS 2318#ifdef CONFIG_SYSFS
2319 err = sysfs_create_group(mm_kobj, &ksm_attr_group); 2319 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
2320 if (err) { 2320 if (err) {
2321 printk(KERN_ERR "ksm: register sysfs failed\n"); 2321 pr_err("ksm: register sysfs failed\n");
2322 kthread_stop(ksm_thread); 2322 kthread_stop(ksm_thread);
2323 goto out_free; 2323 goto out_free;
2324 } 2324 }
diff --git a/mm/memblock.c b/mm/memblock.c
index 6d2f219a48b0..6ecb0d937fb5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -192,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
192 phys_addr_t align, phys_addr_t start, 192 phys_addr_t align, phys_addr_t start,
193 phys_addr_t end, int nid) 193 phys_addr_t end, int nid)
194{ 194{
195 int ret; 195 phys_addr_t kernel_end, ret;
196 phys_addr_t kernel_end;
197 196
198 /* pump up @end */ 197 /* pump up @end */
199 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 198 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -817,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
817 if (nid != NUMA_NO_NODE && nid != m_nid) 816 if (nid != NUMA_NO_NODE && nid != m_nid)
818 continue; 817 continue;
819 818
819 /* skip hotpluggable memory regions if needed */
820 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
821 continue;
822
820 if (!type_b) { 823 if (!type_b) {
821 if (out_start) 824 if (out_start)
822 *out_start = m_start; 825 *out_start = m_start;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4dcf1b9562..23976fd885fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,6 +292,9 @@ struct mem_cgroup {
292 /* vmpressure notifications */ 292 /* vmpressure notifications */
293 struct vmpressure vmpressure; 293 struct vmpressure vmpressure;
294 294
295 /* css_online() has been completed */
296 int initialized;
297
295 /* 298 /*
296 * the counter to account for mem+swap usage. 299 * the counter to account for mem+swap usage.
297 */ 300 */
@@ -315,9 +318,6 @@ struct mem_cgroup {
315 /* OOM-Killer disable */ 318 /* OOM-Killer disable */
316 int oom_kill_disable; 319 int oom_kill_disable;
317 320
318 /* set when res.limit == memsw.limit */
319 bool memsw_is_minimum;
320
321 /* protect arrays of thresholds */ 321 /* protect arrays of thresholds */
322 struct mutex thresholds_lock; 322 struct mutex thresholds_lock;
323 323
@@ -481,14 +481,6 @@ enum res_type {
481#define OOM_CONTROL (0) 481#define OOM_CONTROL (0)
482 482
483/* 483/*
484 * Reclaim flags for mem_cgroup_hierarchical_reclaim
485 */
486#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
487#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
488#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
489#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
490
491/*
492 * The memcg_create_mutex will be held whenever a new cgroup is created. 484 * The memcg_create_mutex will be held whenever a new cgroup is created.
493 * As a consequence, any change that needs to protect against new child cgroups 485 * As a consequence, any change that needs to protect against new child cgroups
494 * appearing has to hold it as well. 486 * appearing has to hold it as well.
@@ -646,11 +638,13 @@ int memcg_limited_groups_array_size;
646struct static_key memcg_kmem_enabled_key; 638struct static_key memcg_kmem_enabled_key;
647EXPORT_SYMBOL(memcg_kmem_enabled_key); 639EXPORT_SYMBOL(memcg_kmem_enabled_key);
648 640
641static void memcg_free_cache_id(int id);
642
649static void disarm_kmem_keys(struct mem_cgroup *memcg) 643static void disarm_kmem_keys(struct mem_cgroup *memcg)
650{ 644{
651 if (memcg_kmem_is_active(memcg)) { 645 if (memcg_kmem_is_active(memcg)) {
652 static_key_slow_dec(&memcg_kmem_enabled_key); 646 static_key_slow_dec(&memcg_kmem_enabled_key);
653 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); 647 memcg_free_cache_id(memcg->kmemcg_id);
654 } 648 }
655 /* 649 /*
656 * This check can't live in kmem destruction function, 650 * This check can't live in kmem destruction function,
@@ -1099,10 +1093,21 @@ skip_node:
1099 * skipping css reference should be safe. 1093 * skipping css reference should be safe.
1100 */ 1094 */
1101 if (next_css) { 1095 if (next_css) {
1102 if ((next_css == &root->css) || 1096 struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
1103 ((next_css->flags & CSS_ONLINE) && 1097
1104 css_tryget_online(next_css))) 1098 if (next_css == &root->css)
1105 return mem_cgroup_from_css(next_css); 1099 return memcg;
1100
1101 if (css_tryget_online(next_css)) {
1102 /*
1103 * Make sure the memcg is initialized:
 1104 * mem_cgroup_css_online() orders the
1105 * initialization against setting the flag.
1106 */
1107 if (smp_load_acquire(&memcg->initialized))
1108 return memcg;
1109 css_put(next_css);
1110 }
1106 1111
1107 prev_css = next_css; 1112 prev_css = next_css;
1108 goto skip_node; 1113 goto skip_node;
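The smp_load_acquire() above only pays off because the online path publishes memcg->initialized with release semantics after all of its initialization stores; the iterator must not touch the memcg until it has observed the flag. A hedged sketch of that pairing as it would look inside mm/memcontrol.c (the helper names are illustrative, and the actual release-side store lives in mem_cgroup_css_online(), which is not shown in this hunk):

/* Illustrative writer (css_online) side: make every initialization store
 * visible before the flag itself becomes visible. */
static void my_mark_memcg_initialized(struct mem_cgroup *memcg)
{
        /* ... fully initialize the memcg here ... */
        smp_store_release(&memcg->initialized, 1);
}

/* Illustrative reader (iterator) side: use the memcg only after the acquire
 * load has seen the flag, so the initialization stores are visible too. */
static bool my_memcg_is_usable(struct mem_cgroup *memcg)
{
        return smp_load_acquire(&memcg->initialized) != 0;
}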
@@ -1792,42 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1792 NULL, "Memory cgroup out of memory"); 1797 NULL, "Memory cgroup out of memory");
1793} 1798}
1794 1799
1795static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1796 gfp_t gfp_mask,
1797 unsigned long flags)
1798{
1799 unsigned long total = 0;
1800 bool noswap = false;
1801 int loop;
1802
1803 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1804 noswap = true;
1805 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1806 noswap = true;
1807
1808 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1809 if (loop)
1810 drain_all_stock_async(memcg);
1811 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1812 /*
1813 * Allow limit shrinkers, which are triggered directly
1814 * by userspace, to catch signals and stop reclaim
1815 * after minimal progress, regardless of the margin.
1816 */
1817 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1818 break;
1819 if (mem_cgroup_margin(memcg))
1820 break;
1821 /*
1822 * If nothing was reclaimed after two attempts, there
1823 * may be no reclaimable pages in this hierarchy.
1824 */
1825 if (loop && !total)
1826 break;
1827 }
1828 return total;
1829}
1830
1831/** 1800/**
1832 * test_mem_cgroup_node_reclaimable 1801 * test_mem_cgroup_node_reclaimable
1833 * @memcg: the target memcg 1802 * @memcg: the target memcg
@@ -2530,25 +2499,29 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2530 struct mem_cgroup *mem_over_limit; 2499 struct mem_cgroup *mem_over_limit;
2531 struct res_counter *fail_res; 2500 struct res_counter *fail_res;
2532 unsigned long nr_reclaimed; 2501 unsigned long nr_reclaimed;
2533 unsigned long flags = 0;
2534 unsigned long long size; 2502 unsigned long long size;
2503 bool may_swap = true;
2504 bool drained = false;
2535 int ret = 0; 2505 int ret = 0;
2536 2506
2507 if (mem_cgroup_is_root(memcg))
2508 goto done;
2537retry: 2509retry:
2538 if (consume_stock(memcg, nr_pages)) 2510 if (consume_stock(memcg, nr_pages))
2539 goto done; 2511 goto done;
2540 2512
2541 size = batch * PAGE_SIZE; 2513 size = batch * PAGE_SIZE;
2542 if (!res_counter_charge(&memcg->res, size, &fail_res)) { 2514 if (!do_swap_account ||
2543 if (!do_swap_account) 2515 !res_counter_charge(&memcg->memsw, size, &fail_res)) {
2544 goto done_restock; 2516 if (!res_counter_charge(&memcg->res, size, &fail_res))
2545 if (!res_counter_charge(&memcg->memsw, size, &fail_res))
2546 goto done_restock; 2517 goto done_restock;
2547 res_counter_uncharge(&memcg->res, size); 2518 if (do_swap_account)
2548 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2519 res_counter_uncharge(&memcg->memsw, size);
2549 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2550 } else
2551 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2520 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2521 } else {
2522 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2523 may_swap = false;
2524 }
2552 2525
2553 if (batch > nr_pages) { 2526 if (batch > nr_pages) {
2554 batch = nr_pages; 2527 batch = nr_pages;
@@ -2572,11 +2545,18 @@ retry:
2572 if (!(gfp_mask & __GFP_WAIT)) 2545 if (!(gfp_mask & __GFP_WAIT))
2573 goto nomem; 2546 goto nomem;
2574 2547
2575 nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2548 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2549 gfp_mask, may_swap);
2576 2550
2577 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2551 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2578 goto retry; 2552 goto retry;
2579 2553
2554 if (!drained) {
2555 drain_all_stock_async(mem_over_limit);
2556 drained = true;
2557 goto retry;
2558 }
2559
2580 if (gfp_mask & __GFP_NORETRY) 2560 if (gfp_mask & __GFP_NORETRY)
2581 goto nomem; 2561 goto nomem;
2582 /* 2562 /*
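
The reworked try_charge() above replaces the MEM_CGROUP_RECLAIM_* flag machinery with a simpler loop: charge, call try_to_free_mem_cgroup_pages() on failure, and drain the per-cpu charge stock at most once before falling through to the no-retry/OOM paths. Below is a minimal userspace sketch of that control flow only; charge_counter(), reclaim_pages() and drain_stock() are made-up stand-ins for the res_counter, vmscan and stock-draining calls, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

/*
 * Made-up stand-ins for res_counter_charge(), try_to_free_mem_cgroup_pages()
 * and drain_all_stock_async(); they only model the retry structure.
 */
static bool charge_counter(long *usage, long limit, long nr_pages)
{
	if (*usage + nr_pages > limit)
		return false;
	*usage += nr_pages;
	return true;
}

static void reclaim_pages(long *usage, long nr_pages)
{
	*usage -= nr_pages / 2;		/* pretend reclaim freed something */
}

static void drain_stock(long *usage)
{
	*usage -= 4;			/* return cached per-cpu charges */
}

static int try_charge_sketch(long *usage, long limit, long nr_pages, int retries)
{
	bool drained = false;

retry:
	if (charge_counter(usage, limit, nr_pages))
		return 0;			/* charged */

	reclaim_pages(usage, nr_pages);
	if (*usage + nr_pages <= limit)		/* margin is back, try again */
		goto retry;

	if (!drained) {				/* drain the stock once, then retry */
		drain_stock(usage);
		drained = true;
		goto retry;
	}

	if (retries-- > 0)
		goto retry;
	return -1;				/* -ENOMEM in the real code */
}

int main(void)
{
	long usage = 90, limit = 100;

	printf("charge: %d, usage now %ld\n",
	       try_charge_sketch(&usage, limit, 16, 3), usage);
	return 0;
}
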
@@ -2611,9 +2591,7 @@ nomem:
2611 if (!(gfp_mask & __GFP_NOFAIL)) 2591 if (!(gfp_mask & __GFP_NOFAIL))
2612 return -ENOMEM; 2592 return -ENOMEM;
2613bypass: 2593bypass:
2614 memcg = root_mem_cgroup; 2594 return -EINTR;
2615 ret = -EINTR;
2616 goto retry;
2617 2595
2618done_restock: 2596done_restock:
2619 if (batch > nr_pages) 2597 if (batch > nr_pages)
@@ -2626,6 +2604,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2626{ 2604{
2627 unsigned long bytes = nr_pages * PAGE_SIZE; 2605 unsigned long bytes = nr_pages * PAGE_SIZE;
2628 2606
2607 if (mem_cgroup_is_root(memcg))
2608 return;
2609
2629 res_counter_uncharge(&memcg->res, bytes); 2610 res_counter_uncharge(&memcg->res, bytes);
2630 if (do_swap_account) 2611 if (do_swap_account)
2631 res_counter_uncharge(&memcg->memsw, bytes); 2612 res_counter_uncharge(&memcg->memsw, bytes);
@@ -2640,6 +2621,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2640{ 2621{
2641 unsigned long bytes = nr_pages * PAGE_SIZE; 2622 unsigned long bytes = nr_pages * PAGE_SIZE;
2642 2623
2624 if (mem_cgroup_is_root(memcg))
2625 return;
2626
2643 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2627 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2644 if (do_swap_account) 2628 if (do_swap_account)
2645 res_counter_uncharge_until(&memcg->memsw, 2629 res_counter_uncharge_until(&memcg->memsw,
@@ -2778,12 +2762,6 @@ static DEFINE_MUTEX(memcg_slab_mutex);
2778 2762
2779static DEFINE_MUTEX(activate_kmem_mutex); 2763static DEFINE_MUTEX(activate_kmem_mutex);
2780 2764
2781static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2782{
2783 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2784 memcg_kmem_is_active(memcg);
2785}
2786
2787/* 2765/*
2788 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2766 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2789 * in the memcg_cache_params struct. 2767 * in the memcg_cache_params struct.
@@ -2803,7 +2781,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2803 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2781 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2804 struct memcg_cache_params *params; 2782 struct memcg_cache_params *params;
2805 2783
2806 if (!memcg_can_account_kmem(memcg)) 2784 if (!memcg_kmem_is_active(memcg))
2807 return -EIO; 2785 return -EIO;
2808 2786
2809 print_slabinfo_header(m); 2787 print_slabinfo_header(m);
@@ -2886,19 +2864,44 @@ int memcg_cache_id(struct mem_cgroup *memcg)
2886 return memcg ? memcg->kmemcg_id : -1; 2864 return memcg ? memcg->kmemcg_id : -1;
2887} 2865}
2888 2866
2889static size_t memcg_caches_array_size(int num_groups) 2867static int memcg_alloc_cache_id(void)
2890{ 2868{
2891 ssize_t size; 2869 int id, size;
2892 if (num_groups <= 0) 2870 int err;
2893 return 0; 2871
2872 id = ida_simple_get(&kmem_limited_groups,
2873 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2874 if (id < 0)
2875 return id;
2894 2876
2895 size = 2 * num_groups; 2877 if (id < memcg_limited_groups_array_size)
2878 return id;
2879
2880 /*
2881 * There's no space for the new id in memcg_caches arrays,
2882 * so we have to grow them.
2883 */
2884
2885 size = 2 * (id + 1);
2896 if (size < MEMCG_CACHES_MIN_SIZE) 2886 if (size < MEMCG_CACHES_MIN_SIZE)
2897 size = MEMCG_CACHES_MIN_SIZE; 2887 size = MEMCG_CACHES_MIN_SIZE;
2898 else if (size > MEMCG_CACHES_MAX_SIZE) 2888 else if (size > MEMCG_CACHES_MAX_SIZE)
2899 size = MEMCG_CACHES_MAX_SIZE; 2889 size = MEMCG_CACHES_MAX_SIZE;
2900 2890
2901 return size; 2891 mutex_lock(&memcg_slab_mutex);
2892 err = memcg_update_all_caches(size);
2893 mutex_unlock(&memcg_slab_mutex);
2894
2895 if (err) {
2896 ida_simple_remove(&kmem_limited_groups, id);
2897 return err;
2898 }
2899 return id;
2900}
2901
2902static void memcg_free_cache_id(int id)
2903{
2904 ida_simple_remove(&kmem_limited_groups, id);
2902} 2905}
2903 2906
2904/* 2907/*
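
memcg_alloc_cache_id() folds the id allocation and the memcg_caches array growth into one helper: take an id from the IDA, and only when it falls outside the current array size, grow the arrays (clamped between the MIN/MAX constants) under memcg_slab_mutex, releasing the id again if that fails. Below is a rough userspace model of the same allocate-then-grow-on-demand pattern; the bitmap and realloc'd array are illustrative stand-ins for the kernel IDA and memcg_caches, not the real data structures.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define MAX_IDS   64
#define MIN_SIZE  4

static unsigned long long id_bitmap;	/* stands in for the kernel IDA */
static void **cache_array;		/* stands in for memcg_caches[] */
static int array_size;

static int grow_array(int new_size)
{
	void **bigger = calloc(new_size, sizeof(*bigger));

	if (!bigger)
		return -1;
	if (cache_array) {
		memcpy(bigger, cache_array, array_size * sizeof(*bigger));
		free(cache_array);
	}
	cache_array = bigger;
	array_size = new_size;
	return 0;
}

static int alloc_cache_id(void)
{
	int id, size;

	for (id = 0; id < MAX_IDS; id++)	/* ida_simple_get() equivalent */
		if (!(id_bitmap & (1ULL << id)))
			break;
	if (id == MAX_IDS)
		return -1;
	id_bitmap |= 1ULL << id;

	if (id < array_size)
		return id;			/* array already big enough */

	size = 2 * (id + 1);			/* grow, clamped like the kernel code */
	if (size < MIN_SIZE)
		size = MIN_SIZE;
	if (size > MAX_IDS)
		size = MAX_IDS;

	if (grow_array(size)) {
		id_bitmap &= ~(1ULL << id);	/* undo the id on failure */
		return -1;
	}
	return id;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("id %d (array size %d)\n", alloc_cache_id(), array_size);
	return 0;
}
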
@@ -2908,97 +2911,7 @@ static size_t memcg_caches_array_size(int num_groups)
2908 */ 2911 */
2909void memcg_update_array_size(int num) 2912void memcg_update_array_size(int num)
2910{ 2913{
2911 if (num > memcg_limited_groups_array_size) 2914 memcg_limited_groups_array_size = num;
2912 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2913}
2914
2915int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2916{
2917 struct memcg_cache_params *cur_params = s->memcg_params;
2918
2919 VM_BUG_ON(!is_root_cache(s));
2920
2921 if (num_groups > memcg_limited_groups_array_size) {
2922 int i;
2923 struct memcg_cache_params *new_params;
2924 ssize_t size = memcg_caches_array_size(num_groups);
2925
2926 size *= sizeof(void *);
2927 size += offsetof(struct memcg_cache_params, memcg_caches);
2928
2929 new_params = kzalloc(size, GFP_KERNEL);
2930 if (!new_params)
2931 return -ENOMEM;
2932
2933 new_params->is_root_cache = true;
2934
2935 /*
2936 * There is the chance it will be bigger than
2937 * memcg_limited_groups_array_size, if we failed an allocation
2938 * in a cache, in which case all caches updated before it, will
2939 * have a bigger array.
2940 *
2941 * But if that is the case, the data after
2942 * memcg_limited_groups_array_size is certainly unused
2943 */
2944 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2945 if (!cur_params->memcg_caches[i])
2946 continue;
2947 new_params->memcg_caches[i] =
2948 cur_params->memcg_caches[i];
2949 }
2950
2951 /*
2952 * Ideally, we would wait until all caches succeed, and only
2953 * then free the old one. But this is not worth the extra
2954 * pointer per-cache we'd have to have for this.
2955 *
2956 * It is not a big deal if some caches are left with a size
2957 * bigger than the others. And all updates will reset this
2958 * anyway.
2959 */
2960 rcu_assign_pointer(s->memcg_params, new_params);
2961 if (cur_params)
2962 kfree_rcu(cur_params, rcu_head);
2963 }
2964 return 0;
2965}
2966
2967int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
2968 struct kmem_cache *root_cache)
2969{
2970 size_t size;
2971
2972 if (!memcg_kmem_enabled())
2973 return 0;
2974
2975 if (!memcg) {
2976 size = offsetof(struct memcg_cache_params, memcg_caches);
2977 size += memcg_limited_groups_array_size * sizeof(void *);
2978 } else
2979 size = sizeof(struct memcg_cache_params);
2980
2981 s->memcg_params = kzalloc(size, GFP_KERNEL);
2982 if (!s->memcg_params)
2983 return -ENOMEM;
2984
2985 if (memcg) {
2986 s->memcg_params->memcg = memcg;
2987 s->memcg_params->root_cache = root_cache;
2988 css_get(&memcg->css);
2989 } else
2990 s->memcg_params->is_root_cache = true;
2991
2992 return 0;
2993}
2994
2995void memcg_free_cache_params(struct kmem_cache *s)
2996{
2997 if (!s->memcg_params)
2998 return;
2999 if (!s->memcg_params->is_root_cache)
3000 css_put(&s->memcg_params->memcg->css);
3001 kfree(s->memcg_params);
3002} 2915}
3003 2916
3004static void memcg_register_cache(struct mem_cgroup *memcg, 2917static void memcg_register_cache(struct mem_cgroup *memcg,
@@ -3031,6 +2944,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
3031 if (!cachep) 2944 if (!cachep)
3032 return; 2945 return;
3033 2946
2947 css_get(&memcg->css);
3034 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2948 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3035 2949
3036 /* 2950 /*
@@ -3064,6 +2978,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
3064 list_del(&cachep->memcg_params->list); 2978 list_del(&cachep->memcg_params->list);
3065 2979
3066 kmem_cache_destroy(cachep); 2980 kmem_cache_destroy(cachep);
2981
2982 /* drop the reference taken in memcg_register_cache */
2983 css_put(&memcg->css);
3067} 2984}
3068 2985
3069/* 2986/*
@@ -3241,7 +3158,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3241 rcu_read_lock(); 3158 rcu_read_lock();
3242 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3159 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3243 3160
3244 if (!memcg_can_account_kmem(memcg)) 3161 if (!memcg_kmem_is_active(memcg))
3245 goto out; 3162 goto out;
3246 3163
3247 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3164 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
@@ -3326,7 +3243,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3326 3243
3327 memcg = get_mem_cgroup_from_mm(current->mm); 3244 memcg = get_mem_cgroup_from_mm(current->mm);
3328 3245
3329 if (!memcg_can_account_kmem(memcg)) { 3246 if (!memcg_kmem_is_active(memcg)) {
3330 css_put(&memcg->css); 3247 css_put(&memcg->css);
3331 return true; 3248 return true;
3332 } 3249 }
@@ -3668,7 +3585,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3668 unsigned long long val) 3585 unsigned long long val)
3669{ 3586{
3670 int retry_count; 3587 int retry_count;
3671 u64 memswlimit, memlimit;
3672 int ret = 0; 3588 int ret = 0;
3673 int children = mem_cgroup_count_children(memcg); 3589 int children = mem_cgroup_count_children(memcg);
3674 u64 curusage, oldusage; 3590 u64 curusage, oldusage;
@@ -3695,31 +3611,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3695 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3611 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3696 */ 3612 */
3697 mutex_lock(&set_limit_mutex); 3613 mutex_lock(&set_limit_mutex);
3698 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3614 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3699 if (memswlimit < val) {
3700 ret = -EINVAL; 3615 ret = -EINVAL;
3701 mutex_unlock(&set_limit_mutex); 3616 mutex_unlock(&set_limit_mutex);
3702 break; 3617 break;
3703 } 3618 }
3704 3619
3705 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3620 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
3706 if (memlimit < val)
3707 enlarge = 1; 3621 enlarge = 1;
3708 3622
3709 ret = res_counter_set_limit(&memcg->res, val); 3623 ret = res_counter_set_limit(&memcg->res, val);
3710 if (!ret) {
3711 if (memswlimit == val)
3712 memcg->memsw_is_minimum = true;
3713 else
3714 memcg->memsw_is_minimum = false;
3715 }
3716 mutex_unlock(&set_limit_mutex); 3624 mutex_unlock(&set_limit_mutex);
3717 3625
3718 if (!ret) 3626 if (!ret)
3719 break; 3627 break;
3720 3628
3721 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3629 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3722 MEM_CGROUP_RECLAIM_SHRINK); 3630
3723 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3631 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3724 /* Usage is reduced ? */ 3632 /* Usage is reduced ? */
3725 if (curusage >= oldusage) 3633 if (curusage >= oldusage)
@@ -3737,7 +3645,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3737 unsigned long long val) 3645 unsigned long long val)
3738{ 3646{
3739 int retry_count; 3647 int retry_count;
3740 u64 memlimit, memswlimit, oldusage, curusage; 3648 u64 oldusage, curusage;
3741 int children = mem_cgroup_count_children(memcg); 3649 int children = mem_cgroup_count_children(memcg);
3742 int ret = -EBUSY; 3650 int ret = -EBUSY;
3743 int enlarge = 0; 3651 int enlarge = 0;
@@ -3756,30 +3664,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3756 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3664 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3757 */ 3665 */
3758 mutex_lock(&set_limit_mutex); 3666 mutex_lock(&set_limit_mutex);
3759 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3667 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3760 if (memlimit > val) {
3761 ret = -EINVAL; 3668 ret = -EINVAL;
3762 mutex_unlock(&set_limit_mutex); 3669 mutex_unlock(&set_limit_mutex);
3763 break; 3670 break;
3764 } 3671 }
3765 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3672 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
3766 if (memswlimit < val)
3767 enlarge = 1; 3673 enlarge = 1;
3768 ret = res_counter_set_limit(&memcg->memsw, val); 3674 ret = res_counter_set_limit(&memcg->memsw, val);
3769 if (!ret) {
3770 if (memlimit == val)
3771 memcg->memsw_is_minimum = true;
3772 else
3773 memcg->memsw_is_minimum = false;
3774 }
3775 mutex_unlock(&set_limit_mutex); 3675 mutex_unlock(&set_limit_mutex);
3776 3676
3777 if (!ret) 3677 if (!ret)
3778 break; 3678 break;
3779 3679
3780 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3680 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3781 MEM_CGROUP_RECLAIM_NOSWAP | 3681
3782 MEM_CGROUP_RECLAIM_SHRINK);
3783 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3682 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3784 /* Usage is reduced ? */ 3683 /* Usage is reduced ? */
3785 if (curusage >= oldusage) 3684 if (curusage >= oldusage)
@@ -4028,8 +3927,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4028 if (signal_pending(current)) 3927 if (signal_pending(current))
4029 return -EINTR; 3928 return -EINTR;
4030 3929
4031 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 3930 progress = try_to_free_mem_cgroup_pages(memcg, 1,
4032 false); 3931 GFP_KERNEL, true);
4033 if (!progress) { 3932 if (!progress) {
4034 nr_retries--; 3933 nr_retries--;
4035 /* maybe some writeback is necessary */ 3934 /* maybe some writeback is necessary */
@@ -4093,6 +3992,46 @@ out:
4093 return retval; 3992 return retval;
4094} 3993}
4095 3994
3995static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3996 enum mem_cgroup_stat_index idx)
3997{
3998 struct mem_cgroup *iter;
3999 long val = 0;
4000
4001 /* Per-cpu values can be negative, use a signed accumulator */
4002 for_each_mem_cgroup_tree(iter, memcg)
4003 val += mem_cgroup_read_stat(iter, idx);
4004
4005 if (val < 0) /* race ? */
4006 val = 0;
4007 return val;
4008}
4009
4010static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4011{
4012 u64 val;
4013
4014 if (!mem_cgroup_is_root(memcg)) {
4015 if (!swap)
4016 return res_counter_read_u64(&memcg->res, RES_USAGE);
4017 else
4018 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4019 }
4020
4021 /*
4022 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4023 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4024 */
4025 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4026 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4027
4028 if (swap)
4029 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4030
4031 return val << PAGE_SHIFT;
4032}
4033
4034
4096static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4035static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4097 struct cftype *cft) 4036 struct cftype *cft)
4098{ 4037{
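
Since the root memcg is no longer charged to its res_counters, mem_cgroup_usage() has to rebuild the root's usage from statistics: sum CACHE + RSS (plus SWAP for memsw) across the hierarchy and clamp a transiently negative per-cpu sum to zero. A small sketch of that accumulate-and-clamp step follows; the table of per-child signed counters is hypothetical and stands in for the per-cpu statistics.

#include <stdio.h>

enum stat { STAT_CACHE, STAT_RSS, STAT_SWAP, NR_STATS };

/*
 * Hypothetical per-child signed counters; per-cpu deltas in the kernel can
 * leave an individual sum temporarily negative.
 */
static long stats[3][NR_STATS] = {
	{ 10, 40, 5 },
	{ -2, 12, 0 },		/* negative: uncharges seen before charges */
	{  7,  3, 1 },
};

static unsigned long recursive_stat(enum stat idx)
{
	long val = 0;

	for (int child = 0; child < 3; child++)
		val += stats[child][idx];
	if (val < 0)		/* clamp racy negative sums, as the kernel does */
		val = 0;
	return val;
}

static unsigned long long usage_in_bytes(int swap, unsigned page_size)
{
	unsigned long pages = recursive_stat(STAT_CACHE) + recursive_stat(STAT_RSS);

	if (swap)
		pages += recursive_stat(STAT_SWAP);
	return (unsigned long long)pages * page_size;
}

int main(void)
{
	printf("mem usage:   %llu\n", usage_in_bytes(0, 4096));
	printf("memsw usage: %llu\n", usage_in_bytes(1, 4096));
	return 0;
}
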
@@ -4102,8 +4041,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4102 4041
4103 switch (type) { 4042 switch (type) {
4104 case _MEM: 4043 case _MEM:
4044 if (name == RES_USAGE)
4045 return mem_cgroup_usage(memcg, false);
4105 return res_counter_read_u64(&memcg->res, name); 4046 return res_counter_read_u64(&memcg->res, name);
4106 case _MEMSWAP: 4047 case _MEMSWAP:
4048 if (name == RES_USAGE)
4049 return mem_cgroup_usage(memcg, true);
4107 return res_counter_read_u64(&memcg->memsw, name); 4050 return res_counter_read_u64(&memcg->memsw, name);
4108 case _KMEM: 4051 case _KMEM:
4109 return res_counter_read_u64(&memcg->kmem, name); 4052 return res_counter_read_u64(&memcg->kmem, name);
@@ -4150,23 +4093,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4150 if (err) 4093 if (err)
4151 goto out; 4094 goto out;
4152 4095
4153 memcg_id = ida_simple_get(&kmem_limited_groups, 4096 memcg_id = memcg_alloc_cache_id();
4154 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
4155 if (memcg_id < 0) { 4097 if (memcg_id < 0) {
4156 err = memcg_id; 4098 err = memcg_id;
4157 goto out; 4099 goto out;
4158 } 4100 }
4159 4101
4160 /*
4161 * Make sure we have enough space for this cgroup in each root cache's
4162 * memcg_params.
4163 */
4164 mutex_lock(&memcg_slab_mutex);
4165 err = memcg_update_all_caches(memcg_id + 1);
4166 mutex_unlock(&memcg_slab_mutex);
4167 if (err)
4168 goto out_rmid;
4169
4170 memcg->kmemcg_id = memcg_id; 4102 memcg->kmemcg_id = memcg_id;
4171 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4103 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4172 4104
@@ -4187,10 +4119,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4187out: 4119out:
4188 memcg_resume_kmem_account(); 4120 memcg_resume_kmem_account();
4189 return err; 4121 return err;
4190
4191out_rmid:
4192 ida_simple_remove(&kmem_limited_groups, memcg_id);
4193 goto out;
4194} 4122}
4195 4123
4196static int memcg_activate_kmem(struct mem_cgroup *memcg, 4124static int memcg_activate_kmem(struct mem_cgroup *memcg,
@@ -4572,10 +4500,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4572 if (!t) 4500 if (!t)
4573 goto unlock; 4501 goto unlock;
4574 4502
4575 if (!swap) 4503 usage = mem_cgroup_usage(memcg, swap);
4576 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
4577 else
4578 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4579 4504
4580 /* 4505 /*
4581 * current_threshold points to threshold just below or equal to usage. 4506 * current_threshold points to threshold just below or equal to usage.
@@ -4673,10 +4598,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4673 4598
4674 if (type == _MEM) { 4599 if (type == _MEM) {
4675 thresholds = &memcg->thresholds; 4600 thresholds = &memcg->thresholds;
4676 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4601 usage = mem_cgroup_usage(memcg, false);
4677 } else if (type == _MEMSWAP) { 4602 } else if (type == _MEMSWAP) {
4678 thresholds = &memcg->memsw_thresholds; 4603 thresholds = &memcg->memsw_thresholds;
4679 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4604 usage = mem_cgroup_usage(memcg, true);
4680 } else 4605 } else
4681 BUG(); 4606 BUG();
4682 4607
@@ -4762,10 +4687,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4762 4687
4763 if (type == _MEM) { 4688 if (type == _MEM) {
4764 thresholds = &memcg->thresholds; 4689 thresholds = &memcg->thresholds;
4765 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4690 usage = mem_cgroup_usage(memcg, false);
4766 } else if (type == _MEMSWAP) { 4691 } else if (type == _MEMSWAP) {
4767 thresholds = &memcg->memsw_thresholds; 4692 thresholds = &memcg->memsw_thresholds;
4768 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4693 usage = mem_cgroup_usage(memcg, true);
4769 } else 4694 } else
4770 BUG(); 4695 BUG();
4771 4696
@@ -5502,6 +5427,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5502{ 5427{
5503 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5428 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5504 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 5429 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
5430 int ret;
5505 5431
5506 if (css->id > MEM_CGROUP_ID_MAX) 5432 if (css->id > MEM_CGROUP_ID_MAX)
5507 return -ENOSPC; 5433 return -ENOSPC;
@@ -5525,9 +5451,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5525 * core guarantees its existence. 5451 * core guarantees its existence.
5526 */ 5452 */
5527 } else { 5453 } else {
5528 res_counter_init(&memcg->res, &root_mem_cgroup->res); 5454 res_counter_init(&memcg->res, NULL);
5529 res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); 5455 res_counter_init(&memcg->memsw, NULL);
5530 res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); 5456 res_counter_init(&memcg->kmem, NULL);
5531 /* 5457 /*
5532 * Deeper hierarchy with use_hierarchy == false doesn't make 5458
5533 * much sense so let cgroup subsystem know about this 5459 * much sense so let cgroup subsystem know about this
@@ -5538,7 +5464,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5538 } 5464 }
5539 mutex_unlock(&memcg_create_mutex); 5465 mutex_unlock(&memcg_create_mutex);
5540 5466
5541 return memcg_init_kmem(memcg, &memory_cgrp_subsys); 5467 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
5468 if (ret)
5469 return ret;
5470
5471 /*
5472 * Make sure the memcg is initialized: mem_cgroup_iter()
5473 * orders reading memcg->initialized against its callers
5474 * reading the memcg members.
5475 */
5476 smp_store_release(&memcg->initialized, 1);
5477
5478 return 0;
5542} 5479}
5543 5480
5544/* 5481/*
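
The css_online change publishes memcg->initialized with smp_store_release() only after all fields are set up, so mem_cgroup_iter() can pair it with an acquire read and skip groups that are not fully initialized yet. The same publish/observe idiom is shown below in portable C11 atomics, with a hypothetical struct playing the role of the memcg.

#include <stdatomic.h>
#include <stddef.h>

struct group {
	long limit;			/* fields set up before publication */
	long usage;
	atomic_int initialized;		/* plays the role of memcg->initialized */
};

/*
 * Writer: fully initialize, then publish with release semantics so the
 * stores above cannot be reordered past the flag.
 */
static void group_online(struct group *g)
{
	g->limit = 100;
	g->usage = 0;
	atomic_store_explicit(&g->initialized, 1, memory_order_release);
}

/*
 * Reader: acquire-load the flag; zero means "pretend it isn't there yet",
 * mirroring how mem_cgroup_iter() skips uninitialized groups.
 */
static struct group *group_if_ready(struct group *g)
{
	if (!atomic_load_explicit(&g->initialized, memory_order_acquire))
		return NULL;
	return g;
}

int main(void)
{
	static struct group g;

	group_online(&g);
	return group_if_ready(&g) ? 0 : 1;
}
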
@@ -5969,8 +5906,9 @@ static void __mem_cgroup_clear_mc(void)
5969 /* we must fixup refcnts and charges */ 5906 /* we must fixup refcnts and charges */
5970 if (mc.moved_swap) { 5907 if (mc.moved_swap) {
5971 /* uncharge swap account from the old cgroup */ 5908 /* uncharge swap account from the old cgroup */
5972 res_counter_uncharge(&mc.from->memsw, 5909 if (!mem_cgroup_is_root(mc.from))
5973 PAGE_SIZE * mc.moved_swap); 5910 res_counter_uncharge(&mc.from->memsw,
5911 PAGE_SIZE * mc.moved_swap);
5974 5912
5975 for (i = 0; i < mc.moved_swap; i++) 5913 for (i = 0; i < mc.moved_swap; i++)
5976 css_put(&mc.from->css); 5914 css_put(&mc.from->css);
@@ -5979,8 +5917,9 @@ static void __mem_cgroup_clear_mc(void)
5979 * we charged both to->res and to->memsw, so we should 5917 * we charged both to->res and to->memsw, so we should
5980 * uncharge to->res. 5918 * uncharge to->res.
5981 */ 5919 */
5982 res_counter_uncharge(&mc.to->res, 5920 if (!mem_cgroup_is_root(mc.to))
5983 PAGE_SIZE * mc.moved_swap); 5921 res_counter_uncharge(&mc.to->res,
5922 PAGE_SIZE * mc.moved_swap);
5984 /* we've already done css_get(mc.to) */ 5923 /* we've already done css_get(mc.to) */
5985 mc.moved_swap = 0; 5924 mc.moved_swap = 0;
5986 } 5925 }
@@ -6345,7 +6284,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
6345 rcu_read_lock(); 6284 rcu_read_lock();
6346 memcg = mem_cgroup_lookup(id); 6285 memcg = mem_cgroup_lookup(id);
6347 if (memcg) { 6286 if (memcg) {
6348 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6287 if (!mem_cgroup_is_root(memcg))
6288 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
6349 mem_cgroup_swap_statistics(memcg, false); 6289 mem_cgroup_swap_statistics(memcg, false);
6350 css_put(&memcg->css); 6290 css_put(&memcg->css);
6351 } 6291 }
@@ -6509,12 +6449,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6509{ 6449{
6510 unsigned long flags; 6450 unsigned long flags;
6511 6451
6512 if (nr_mem) 6452 if (!mem_cgroup_is_root(memcg)) {
6513 res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); 6453 if (nr_mem)
6514 if (nr_memsw) 6454 res_counter_uncharge(&memcg->res,
6515 res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); 6455 nr_mem * PAGE_SIZE);
6516 6456 if (nr_memsw)
6517 memcg_oom_recover(memcg); 6457 res_counter_uncharge(&memcg->memsw,
6458 nr_memsw * PAGE_SIZE);
6459 memcg_oom_recover(memcg);
6460 }
6518 6461
6519 local_irq_save(flags); 6462 local_irq_save(flags);
6520 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 6463 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 44c6bd201d3a..8639f6b28746 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -148,7 +148,7 @@ static int hwpoison_filter_task(struct page *p)
148 ino = cgroup_ino(css->cgroup); 148 ino = cgroup_ino(css->cgroup);
149 css_put(css); 149 css_put(css);
150 150
151 if (!ino || ino != hwpoison_filter_memcg) 151 if (ino != hwpoison_filter_memcg)
152 return -EINVAL; 152 return -EINVAL;
153 153
154 return 0; 154 return 0;
diff --git a/mm/memory.c b/mm/memory.c
index ab3537bcfed2..e229970e4223 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,8 @@ __setup("norandmaps", disable_randmaps);
118unsigned long zero_pfn __read_mostly; 118unsigned long zero_pfn __read_mostly;
119unsigned long highest_memmap_pfn __read_mostly; 119unsigned long highest_memmap_pfn __read_mostly;
120 120
121EXPORT_SYMBOL(zero_pfn);
122
121/* 123/*
122 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 124 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
123 */ 125 */
@@ -751,7 +753,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
751 unsigned long pfn = pte_pfn(pte); 753 unsigned long pfn = pte_pfn(pte);
752 754
753 if (HAVE_PTE_SPECIAL) { 755 if (HAVE_PTE_SPECIAL) {
754 if (likely(!pte_special(pte) || pte_numa(pte))) 756 if (likely(!pte_special(pte)))
755 goto check_pfn; 757 goto check_pfn;
756 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 758 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
757 return NULL; 759 return NULL;
@@ -777,15 +779,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
777 } 779 }
778 } 780 }
779 781
782 if (is_zero_pfn(pfn))
783 return NULL;
780check_pfn: 784check_pfn:
781 if (unlikely(pfn > highest_memmap_pfn)) { 785 if (unlikely(pfn > highest_memmap_pfn)) {
782 print_bad_pte(vma, addr, pte, NULL); 786 print_bad_pte(vma, addr, pte, NULL);
783 return NULL; 787 return NULL;
784 } 788 }
785 789
786 if (is_zero_pfn(pfn))
787 return NULL;
788
789 /* 790 /*
790 * NOTE! We still have PageReserved() pages in the page tables. 791 * NOTE! We still have PageReserved() pages in the page tables.
791 * eg. VDSO mappings can cause them to exist. 792 * eg. VDSO mappings can cause them to exist.
@@ -1126,7 +1127,7 @@ again:
1126 addr) != page->index) { 1127 addr) != page->index) {
1127 pte_t ptfile = pgoff_to_pte(page->index); 1128 pte_t ptfile = pgoff_to_pte(page->index);
1128 if (pte_soft_dirty(ptent)) 1129 if (pte_soft_dirty(ptent))
1129 pte_file_mksoft_dirty(ptfile); 1130 ptfile = pte_file_mksoft_dirty(ptfile);
1130 set_pte_at(mm, addr, pte, ptfile); 1131 set_pte_at(mm, addr, pte, ptfile);
1131 } 1132 }
1132 if (PageAnon(page)) 1133 if (PageAnon(page))
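
The soft-dirty fix in the hunk just above is a classic return-value bug: pte_file_mksoft_dirty() returns a new pte value rather than modifying its argument, so calling it without assigning the result silently dropped the soft-dirty bit. A tiny illustration of the pattern follows, with a made-up flag type in place of pte_t.

#include <assert.h>

typedef unsigned long pteval_t;
#define SOFT_DIRTY (1UL << 3)

/* Pure helper: returns a modified copy, does not touch the argument. */
static pteval_t mksoft_dirty(pteval_t pte)
{
	return pte | SOFT_DIRTY;
}

int main(void)
{
	pteval_t pte = 0;

	/* Buggy form: result discarded, flag lost. */
	mksoft_dirty(pte);
	assert(!(pte & SOFT_DIRTY));

	/* Fixed form, as in the patch: assign the returned value. */
	pte = mksoft_dirty(pte);
	assert(pte & SOFT_DIRTY);
	return 0;
}
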
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2ff8c2325e96..29d8693d0c61 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1307/* 1307/*
1308 * Confirm all pages in a range [start, end) belong to the same zone. 1308
1309 */ 1309 */
1310static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 1310int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
1311{ 1311{
1312 unsigned long pfn; 1312 unsigned long pfn;
1313 struct zone *zone = NULL; 1313 struct zone *zone = NULL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8f5330d74f47..e58725aff7e9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,25 +123,23 @@ static struct mempolicy default_policy = {
123 123
124static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125 125
126static struct mempolicy *get_task_policy(struct task_struct *p) 126struct mempolicy *get_task_policy(struct task_struct *p)
127{ 127{
128 struct mempolicy *pol = p->mempolicy; 128 struct mempolicy *pol = p->mempolicy;
129 int node;
129 130
130 if (!pol) { 131 if (pol)
131 int node = numa_node_id(); 132 return pol;
132 133
133 if (node != NUMA_NO_NODE) { 134 node = numa_node_id();
134 pol = &preferred_node_policy[node]; 135 if (node != NUMA_NO_NODE) {
135 /* 136 pol = &preferred_node_policy[node];
136 * preferred_node_policy is not initialised early in 137 /* preferred_node_policy is not initialised early in boot */
137 * boot 138 if (pol->mode)
138 */ 139 return pol;
139 if (!pol->mode)
140 pol = NULL;
141 }
142 } 140 }
143 141
144 return pol; 142 return &default_policy;
145} 143}
146 144
147static const struct mempolicy_operations { 145static const struct mempolicy_operations {
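
get_task_policy() now always hands back something usable: the task's own policy if one is set, otherwise the boot-time preferred-node policy for the local node once that table is initialised, otherwise &default_policy. A compact sketch of that fallback chain follows; the policy and task structs here are hypothetical simplifications, not the kernel definitions.

#include <stdio.h>
#include <stddef.h>

#define MAX_NODES  4
#define NO_NODE   -1

struct policy { int mode; };		/* mode == 0 means "not initialised" */

static struct policy default_policy = { .mode = 1 };
static struct policy per_node_policy[MAX_NODES];	/* filled in later in boot */

struct task { struct policy *mempolicy; };

static struct policy *get_task_policy(struct task *t, int node)
{
	struct policy *pol = t->mempolicy;

	if (pol)
		return pol;			/* explicit per-task policy wins */

	if (node != NO_NODE) {
		pol = &per_node_policy[node];
		if (pol->mode)			/* table may not be set up early in boot */
			return pol;
	}
	return &default_policy;			/* never return NULL */
}

int main(void)
{
	struct task t = { .mempolicy = NULL };

	printf("%s\n", get_task_policy(&t, 2) == &default_policy ?
	       "default policy" : "per-node policy");
	per_node_policy[2].mode = 1;
	printf("%s\n", get_task_policy(&t, 2) == &per_node_policy[2] ?
	       "per-node policy" : "default policy");
	return 0;
}
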
@@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
683 } 681 }
684 682
685 if (flags & MPOL_MF_LAZY) { 683 if (flags & MPOL_MF_LAZY) {
686 change_prot_numa(vma, start, endvma); 684 /* Similar to task_numa_work, skip inaccessible VMAs */
685 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
686 change_prot_numa(vma, start, endvma);
687 goto next; 687 goto next;
688 } 688 }
689 689
@@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
804 nodemask_t *nodes) 804 nodemask_t *nodes)
805{ 805{
806 struct mempolicy *new, *old; 806 struct mempolicy *new, *old;
807 struct mm_struct *mm = current->mm;
808 NODEMASK_SCRATCH(scratch); 807 NODEMASK_SCRATCH(scratch);
809 int ret; 808 int ret;
810 809
@@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
816 ret = PTR_ERR(new); 815 ret = PTR_ERR(new);
817 goto out; 816 goto out;
818 } 817 }
819 /* 818
820 * prevent changing our mempolicy while show_numa_maps()
821 * is using it.
822 * Note: do_set_mempolicy() can be called at init time
823 * with no 'mm'.
824 */
825 if (mm)
826 down_write(&mm->mmap_sem);
827 task_lock(current); 819 task_lock(current);
828 ret = mpol_set_nodemask(new, nodes, scratch); 820 ret = mpol_set_nodemask(new, nodes, scratch);
829 if (ret) { 821 if (ret) {
830 task_unlock(current); 822 task_unlock(current);
831 if (mm)
832 up_write(&mm->mmap_sem);
833 mpol_put(new); 823 mpol_put(new);
834 goto out; 824 goto out;
835 } 825 }
@@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
839 nodes_weight(new->v.nodes)) 829 nodes_weight(new->v.nodes))
840 current->il_next = first_node(new->v.nodes); 830 current->il_next = first_node(new->v.nodes);
841 task_unlock(current); 831 task_unlock(current);
842 if (mm)
843 up_write(&mm->mmap_sem);
844
845 mpol_put(old); 832 mpol_put(old);
846 ret = 0; 833 ret = 0;
847out: 834out:
@@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1605 1592
1606#endif 1593#endif
1607 1594
1608/* 1595struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1609 * get_vma_policy(@task, @vma, @addr) 1596 unsigned long addr)
1610 * @task: task for fallback if vma policy == default
1611 * @vma: virtual memory area whose policy is sought
1612 * @addr: address in @vma for shared policy lookup
1613 *
1614 * Returns effective policy for a VMA at specified address.
1615 * Falls back to @task or system default policy, as necessary.
1616 * Current or other task's task mempolicy and non-shared vma policies must be
1617 * protected by task_lock(task) by the caller.
1618 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1619 * count--added by the get_policy() vm_op, as appropriate--to protect against
1620 * freeing by another task. It is the caller's responsibility to free the
1621 * extra reference for shared policies.
1622 */
1623struct mempolicy *get_vma_policy(struct task_struct *task,
1624 struct vm_area_struct *vma, unsigned long addr)
1625{ 1597{
1626 struct mempolicy *pol = get_task_policy(task); 1598 struct mempolicy *pol = NULL;
1627 1599
1628 if (vma) { 1600 if (vma) {
1629 if (vma->vm_ops && vma->vm_ops->get_policy) { 1601 if (vma->vm_ops && vma->vm_ops->get_policy) {
1630 struct mempolicy *vpol = vma->vm_ops->get_policy(vma, 1602 pol = vma->vm_ops->get_policy(vma, addr);
1631 addr);
1632 if (vpol)
1633 pol = vpol;
1634 } else if (vma->vm_policy) { 1603 } else if (vma->vm_policy) {
1635 pol = vma->vm_policy; 1604 pol = vma->vm_policy;
1636 1605
@@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1644 mpol_get(pol); 1613 mpol_get(pol);
1645 } 1614 }
1646 } 1615 }
1616
1617 return pol;
1618}
1619
1620/*
1621 * get_vma_policy(@vma, @addr)
1622 * @vma: virtual memory area whose policy is sought
1623 * @addr: address in @vma for shared policy lookup
1624 *
1625 * Returns effective policy for a VMA at specified address.
1626 * Falls back to current->mempolicy or system default policy, as necessary.
1627 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1628 * count--added by the get_policy() vm_op, as appropriate--to protect against
1629 * freeing by another task. It is the caller's responsibility to free the
1630 * extra reference for shared policies.
1631 */
1632static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1633 unsigned long addr)
1634{
1635 struct mempolicy *pol = __get_vma_policy(vma, addr);
1636
1647 if (!pol) 1637 if (!pol)
1648 pol = &default_policy; 1638 pol = get_task_policy(current);
1639
1649 return pol; 1640 return pol;
1650} 1641}
1651 1642
1652bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) 1643bool vma_policy_mof(struct vm_area_struct *vma)
1653{ 1644{
1654 struct mempolicy *pol = get_task_policy(task); 1645 struct mempolicy *pol;
1655 if (vma) {
1656 if (vma->vm_ops && vma->vm_ops->get_policy) {
1657 bool ret = false;
1658 1646
1659 pol = vma->vm_ops->get_policy(vma, vma->vm_start); 1647 if (vma->vm_ops && vma->vm_ops->get_policy) {
1660 if (pol && (pol->flags & MPOL_F_MOF)) 1648 bool ret = false;
1661 ret = true;
1662 mpol_cond_put(pol);
1663 1649
1664 return ret; 1650 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1665 } else if (vma->vm_policy) { 1651 if (pol && (pol->flags & MPOL_F_MOF))
1666 pol = vma->vm_policy; 1652 ret = true;
1667 } 1653 mpol_cond_put(pol);
1654
1655 return ret;
1668 } 1656 }
1669 1657
1658 pol = vma->vm_policy;
1670 if (!pol) 1659 if (!pol)
1671 return default_policy.flags & MPOL_F_MOF; 1660 pol = get_task_policy(current);
1672 1661
1673 return pol->flags & MPOL_F_MOF; 1662 return pol->flags & MPOL_F_MOF;
1674} 1663}
@@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1874{ 1863{
1875 struct zonelist *zl; 1864 struct zonelist *zl;
1876 1865
1877 *mpol = get_vma_policy(current, vma, addr); 1866 *mpol = get_vma_policy(vma, addr);
1878 *nodemask = NULL; /* assume !MPOL_BIND */ 1867 *nodemask = NULL; /* assume !MPOL_BIND */
1879 1868
1880 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1869 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
@@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2029 unsigned int cpuset_mems_cookie; 2018 unsigned int cpuset_mems_cookie;
2030 2019
2031retry_cpuset: 2020retry_cpuset:
2032 pol = get_vma_policy(current, vma, addr); 2021 pol = get_vma_policy(vma, addr);
2033 cpuset_mems_cookie = read_mems_allowed_begin(); 2022 cpuset_mems_cookie = read_mems_allowed_begin();
2034 2023
2035 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 2024 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
@@ -2046,8 +2035,7 @@ retry_cpuset:
2046 page = __alloc_pages_nodemask(gfp, order, 2035 page = __alloc_pages_nodemask(gfp, order,
2047 policy_zonelist(gfp, pol, node), 2036 policy_zonelist(gfp, pol, node),
2048 policy_nodemask(gfp, pol)); 2037 policy_nodemask(gfp, pol));
2049 if (unlikely(mpol_needs_cond_ref(pol))) 2038 mpol_cond_put(pol);
2050 __mpol_put(pol);
2051 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2039 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2052 goto retry_cpuset; 2040 goto retry_cpuset;
2053 return page; 2041 return page;
@@ -2074,12 +2062,12 @@ retry_cpuset:
2074 */ 2062 */
2075struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2063struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2076{ 2064{
2077 struct mempolicy *pol = get_task_policy(current); 2065 struct mempolicy *pol = &default_policy;
2078 struct page *page; 2066 struct page *page;
2079 unsigned int cpuset_mems_cookie; 2067 unsigned int cpuset_mems_cookie;
2080 2068
2081 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 2069 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2082 pol = &default_policy; 2070 pol = get_task_policy(current);
2083 2071
2084retry_cpuset: 2072retry_cpuset:
2085 cpuset_mems_cookie = read_mems_allowed_begin(); 2073 cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2296 2284
2297 BUG_ON(!vma); 2285 BUG_ON(!vma);
2298 2286
2299 pol = get_vma_policy(current, vma, addr); 2287 pol = get_vma_policy(vma, addr);
2300 if (!(pol->flags & MPOL_F_MOF)) 2288 if (!(pol->flags & MPOL_F_MOF))
2301 goto out; 2289 goto out;
2302 2290
diff --git a/mm/migrate.c b/mm/migrate.c
index f78ec9bd454d..01439953abf5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -146,8 +146,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
146 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 146 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
147 if (pte_swp_soft_dirty(*ptep)) 147 if (pte_swp_soft_dirty(*ptep))
148 pte = pte_mksoft_dirty(pte); 148 pte = pte_mksoft_dirty(pte);
149
150 /* Recheck VMA as permissions can change since migration started */
149 if (is_write_migration_entry(entry)) 151 if (is_write_migration_entry(entry))
150 pte = pte_mkwrite(pte); 152 pte = maybe_mkwrite(pte, vma);
153
151#ifdef CONFIG_HUGETLB_PAGE 154#ifdef CONFIG_HUGETLB_PAGE
152 if (PageHuge(new)) { 155 if (PageHuge(new)) {
153 pte = pte_mkhuge(pte); 156 pte = pte_mkhuge(pte);
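
Switching remove_migration_pte() from pte_mkwrite() to maybe_mkwrite() re-checks the VMA before restoring write access: a write migration entry only becomes a writable pte again if the mapping still allows writes, since permissions may have changed (e.g. via mprotect) while the page was in flight. A hedged model of that guard, using plain integers instead of pte_t and vm_flags_t:

#include <assert.h>

#define VM_WRITE  (1u << 1)
#define PTE_WRITE (1u << 0)

/* Model of maybe_mkwrite(): only set the write bit if the VMA still permits it. */
static unsigned maybe_mkwrite(unsigned pte, unsigned vm_flags)
{
	if (vm_flags & VM_WRITE)
		pte |= PTE_WRITE;
	return pte;
}

int main(void)
{
	/* Entry was writable when migration started ... */
	unsigned pte = 0;

	/* ... but the VMA went read-only meanwhile: the pte stays read-only. */
	assert(!(maybe_mkwrite(pte, 0) & PTE_WRITE));
	assert(maybe_mkwrite(pte, VM_WRITE) & PTE_WRITE);
	return 0;
}
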
@@ -873,7 +876,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
873 } 876 }
874 } 877 }
875 878
876 if (unlikely(balloon_page_movable(page))) { 879 if (unlikely(isolated_balloon_page(page))) {
877 /* 880 /*
878 * A ballooned page does not need any special attention from 881 * A ballooned page does not need any special attention from
879 * physical to virtual reverse mapping procedures. 882 * physical to virtual reverse mapping procedures.
@@ -952,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
952 955
953 rc = __unmap_and_move(page, newpage, force, mode); 956 rc = __unmap_and_move(page, newpage, force, mode);
954 957
955 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
956 /*
957 * A ballooned page has been migrated already.
958 * Now, it's the time to wrap-up counters,
959 * handle the page back to Buddy and return.
960 */
961 dec_zone_page_state(page, NR_ISOLATED_ANON +
962 page_is_file_cache(page));
963 balloon_page_free(page);
964 return MIGRATEPAGE_SUCCESS;
965 }
966out: 958out:
967 if (rc != -EAGAIN) { 959 if (rc != -EAGAIN) {
968 /* 960 /*
@@ -985,6 +977,9 @@ out:
985 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { 977 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
986 ClearPageSwapBacked(newpage); 978 ClearPageSwapBacked(newpage);
987 put_new_page(newpage, private); 979 put_new_page(newpage, private);
980 } else if (unlikely(__is_movable_balloon_page(newpage))) {
981 /* drop our reference, page already in the balloon */
982 put_page(newpage);
988 } else 983 } else
989 putback_lru_page(newpage); 984 putback_lru_page(newpage);
990 985
diff --git a/mm/mlock.c b/mm/mlock.c
index ce84cb0b83ef..03aa8512723b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -233,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
233 233
234 VM_BUG_ON(start & ~PAGE_MASK); 234 VM_BUG_ON(start & ~PAGE_MASK);
235 VM_BUG_ON(end & ~PAGE_MASK); 235 VM_BUG_ON(end & ~PAGE_MASK);
236 VM_BUG_ON(start < vma->vm_start); 236 VM_BUG_ON_VMA(start < vma->vm_start, vma);
237 VM_BUG_ON(end > vma->vm_end); 237 VM_BUG_ON_VMA(end > vma->vm_end, vma);
238 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 238 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
239 239
240 gup_flags = FOLL_TOUCH | FOLL_MLOCK; 240 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
241 /* 241 /*
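
VM_BUG_ON_VMA()/VM_BUG_ON_MM() differ from plain VM_BUG_ON() in that they dump the offending vma or mm before triggering the BUG, which is what makes assertions like the ones above debuggable from a crash log. Below is a userspace approximation of such a dump-then-abort assertion; the struct layout and dump_vma() helper are invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct vma {
	unsigned long start, end;
};

static void dump_vma(const struct vma *v)	/* stand-in for the kernel dump helpers */
{
	fprintf(stderr, "vma %p: start=%#lx end=%#lx\n",
		(const void *)v, v->start, v->end);
}

/* Like VM_BUG_ON_VMA(cond, vma): on failure, print the object, then abort. */
#define BUG_ON_VMA(cond, vma)						\
	do {								\
		if (cond) {						\
			fprintf(stderr, "BUG at %s:%d: %s\n",		\
				__FILE__, __LINE__, #cond);		\
			dump_vma(vma);					\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	struct vma v = { .start = 0x1000, .end = 0x3000 };

	BUG_ON_VMA(v.start > v.end, &v);	/* passes */
	return 0;
}
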
diff --git a/mm/mmap.c b/mm/mmap.c
index c1f2ea4a0b99..93d28c7e5420 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm,
70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
71 * w: (no) no w: (no) no w: (yes) yes w: (no) no 71 * w: (no) no w: (no) no w: (yes) yes w: (no) no
72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
73 * 73 *
74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
75 * w: (no) no w: (no) no w: (copy) copy w: (no) no 75 * w: (no) no w: (no) no w: (copy) copy w: (no) no
76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
@@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len);
268 268
269SYSCALL_DEFINE1(brk, unsigned long, brk) 269SYSCALL_DEFINE1(brk, unsigned long, brk)
270{ 270{
271 unsigned long rlim, retval; 271 unsigned long retval;
272 unsigned long newbrk, oldbrk; 272 unsigned long newbrk, oldbrk;
273 struct mm_struct *mm = current->mm; 273 struct mm_struct *mm = current->mm;
274 unsigned long min_brk; 274 unsigned long min_brk;
@@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
298 * segment grow beyond its set limit in the case where the limit is 298
299 * not page aligned -Ram Gupta 299 * not page aligned -Ram Gupta
300 */ 300 */
301 rlim = rlimit(RLIMIT_DATA); 301 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
302 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 302 mm->end_data, mm->start_data))
303 (mm->end_data - mm->start_data) > rlim)
304 goto out; 303 goto out;
305 304
306 newbrk = PAGE_ALIGN(brk); 305 newbrk = PAGE_ALIGN(brk);
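
The open-coded RLIMIT_DATA test in brk() is folded into check_data_rlimit(): the would-be data segment (new brk minus start_brk, plus the existing end_data - start_data span) is compared against the limit unless the limit is infinite. A self-contained sketch of the same arithmetic; the RLIM_INFINITY sentinel and return convention here are illustrative, not the kernel's exact definitions.

#include <stdio.h>

#define RLIM_INFINITY (~0UL)

/*
 * Mirrors the shape of check_data_rlimit(): return nonzero if the new brk
 * would push the data segment past the limit.
 */
static int check_data_rlimit(unsigned long rlim,
			     unsigned long new_brk, unsigned long start_brk,
			     unsigned long end_data, unsigned long start_data)
{
	if (rlim < RLIM_INFINITY &&
	    (new_brk - start_brk) + (end_data - start_data) > rlim)
		return -1;		/* failure, -ENOSPC-style */
	return 0;
}

int main(void)
{
	unsigned long start_brk = 0x400000, start_data = 0x100000, end_data = 0x180000;

	printf("%d\n", check_data_rlimit(0x100000, 0x480000, start_brk,
					 end_data, start_data));	/* 0: fits  */
	printf("%d\n", check_data_rlimit(0x080000, 0x480000, start_brk,
					 end_data, start_data));	/* -1: over */
	return 0;
}
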
@@ -369,20 +368,22 @@ static int browse_rb(struct rb_root *root)
369 struct vm_area_struct *vma; 368 struct vm_area_struct *vma;
370 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 369 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
371 if (vma->vm_start < prev) { 370 if (vma->vm_start < prev) {
372 pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); 371 pr_emerg("vm_start %lx < prev %lx\n",
372 vma->vm_start, prev);
373 bug = 1; 373 bug = 1;
374 } 374 }
375 if (vma->vm_start < pend) { 375 if (vma->vm_start < pend) {
376 pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); 376 pr_emerg("vm_start %lx < pend %lx\n",
377 vma->vm_start, pend);
377 bug = 1; 378 bug = 1;
378 } 379 }
379 if (vma->vm_start > vma->vm_end) { 380 if (vma->vm_start > vma->vm_end) {
380 pr_info("vm_end %lx < vm_start %lx\n", 381 pr_emerg("vm_start %lx > vm_end %lx\n",
381 vma->vm_end, vma->vm_start); 382 vma->vm_start, vma->vm_end);
382 bug = 1; 383 bug = 1;
383 } 384 }
384 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 385 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
385 pr_info("free gap %lx, correct %lx\n", 386 pr_emerg("free gap %lx, correct %lx\n",
386 vma->rb_subtree_gap, 387 vma->rb_subtree_gap,
387 vma_compute_subtree_gap(vma)); 388 vma_compute_subtree_gap(vma));
388 bug = 1; 389 bug = 1;
@@ -396,7 +397,7 @@ static int browse_rb(struct rb_root *root)
396 for (nd = pn; nd; nd = rb_prev(nd)) 397 for (nd = pn; nd; nd = rb_prev(nd))
397 j++; 398 j++;
398 if (i != j) { 399 if (i != j) {
399 pr_info("backwards %d, forwards %d\n", j, i); 400 pr_emerg("backwards %d, forwards %d\n", j, i);
400 bug = 1; 401 bug = 1;
401 } 402 }
402 return bug ? -1 : i; 403 return bug ? -1 : i;
@@ -409,8 +410,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
409 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 410 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
410 struct vm_area_struct *vma; 411 struct vm_area_struct *vma;
411 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 412 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
412 BUG_ON(vma != ignore && 413 VM_BUG_ON_VMA(vma != ignore &&
413 vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); 414 vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
415 vma);
414 } 416 }
415} 417}
416 418
@@ -420,8 +422,10 @@ static void validate_mm(struct mm_struct *mm)
420 int i = 0; 422 int i = 0;
421 unsigned long highest_address = 0; 423 unsigned long highest_address = 0;
422 struct vm_area_struct *vma = mm->mmap; 424 struct vm_area_struct *vma = mm->mmap;
425
423 while (vma) { 426 while (vma) {
424 struct anon_vma_chain *avc; 427 struct anon_vma_chain *avc;
428
425 vma_lock_anon_vma(vma); 429 vma_lock_anon_vma(vma);
426 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 430 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
427 anon_vma_interval_tree_verify(avc); 431 anon_vma_interval_tree_verify(avc);
@@ -431,20 +435,21 @@ static void validate_mm(struct mm_struct *mm)
431 i++; 435 i++;
432 } 436 }
433 if (i != mm->map_count) { 437 if (i != mm->map_count) {
434 pr_info("map_count %d vm_next %d\n", mm->map_count, i); 438 pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
435 bug = 1; 439 bug = 1;
436 } 440 }
437 if (highest_address != mm->highest_vm_end) { 441 if (highest_address != mm->highest_vm_end) {
438 pr_info("mm->highest_vm_end %lx, found %lx\n", 442 pr_emerg("mm->highest_vm_end %lx, found %lx\n",
439 mm->highest_vm_end, highest_address); 443 mm->highest_vm_end, highest_address);
440 bug = 1; 444 bug = 1;
441 } 445 }
442 i = browse_rb(&mm->mm_rb); 446 i = browse_rb(&mm->mm_rb);
443 if (i != mm->map_count) { 447 if (i != mm->map_count) {
444 pr_info("map_count %d rb %d\n", mm->map_count, i); 448 if (i != -1)
449 pr_emerg("map_count %d rb %d\n", mm->map_count, i);
445 bug = 1; 450 bug = 1;
446 } 451 }
447 BUG_ON(bug); 452 VM_BUG_ON_MM(bug, mm);
448} 453}
449#else 454#else
450#define validate_mm_rb(root, ignore) do { } while (0) 455#define validate_mm_rb(root, ignore) do { } while (0)
@@ -741,7 +746,7 @@ again: remove_next = 1 + (end > next->vm_end);
741 * split_vma inserting another: so it must be 746 * split_vma inserting another: so it must be
742 * mprotect case 4 shifting the boundary down. 747 * mprotect case 4 shifting the boundary down.
743 */ 748 */
744 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 749 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
745 exporter = vma; 750 exporter = vma;
746 importer = next; 751 importer = next;
747 } 752 }
@@ -787,8 +792,8 @@ again: remove_next = 1 + (end > next->vm_end);
787 if (!anon_vma && adjust_next) 792 if (!anon_vma && adjust_next)
788 anon_vma = next->anon_vma; 793 anon_vma = next->anon_vma;
789 if (anon_vma) { 794 if (anon_vma) {
790 VM_BUG_ON(adjust_next && next->anon_vma && 795 VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
791 anon_vma != next->anon_vma); 796 anon_vma != next->anon_vma, next);
792 anon_vma_lock_write(anon_vma); 797 anon_vma_lock_write(anon_vma);
793 anon_vma_interval_tree_pre_update_vma(vma); 798 anon_vma_interval_tree_pre_update_vma(vma);
794 if (adjust_next) 799 if (adjust_next)
@@ -1010,7 +1015,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1010struct vm_area_struct *vma_merge(struct mm_struct *mm, 1015struct vm_area_struct *vma_merge(struct mm_struct *mm,
1011 struct vm_area_struct *prev, unsigned long addr, 1016 struct vm_area_struct *prev, unsigned long addr,
1012 unsigned long end, unsigned long vm_flags, 1017 unsigned long end, unsigned long vm_flags,
1013 struct anon_vma *anon_vma, struct file *file, 1018 struct anon_vma *anon_vma, struct file *file,
1014 pgoff_t pgoff, struct mempolicy *policy) 1019 pgoff_t pgoff, struct mempolicy *policy)
1015{ 1020{
1016 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1021 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
@@ -1036,7 +1041,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1036 * Can it merge with the predecessor? 1041 * Can it merge with the predecessor?
1037 */ 1042 */
1038 if (prev && prev->vm_end == addr && 1043 if (prev && prev->vm_end == addr &&
1039 mpol_equal(vma_policy(prev), policy) && 1044 mpol_equal(vma_policy(prev), policy) &&
1040 can_vma_merge_after(prev, vm_flags, 1045 can_vma_merge_after(prev, vm_flags,
1041 anon_vma, file, pgoff)) { 1046 anon_vma, file, pgoff)) {
1042 /* 1047 /*
@@ -1064,7 +1069,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1064 * Can this new request be merged in front of next? 1069 * Can this new request be merged in front of next?
1065 */ 1070 */
1066 if (next && end == next->vm_start && 1071 if (next && end == next->vm_start &&
1067 mpol_equal(policy, vma_policy(next)) && 1072 mpol_equal(policy, vma_policy(next)) &&
1068 can_vma_merge_before(next, vm_flags, 1073 can_vma_merge_before(next, vm_flags,
1069 anon_vma, file, pgoff+pglen)) { 1074 anon_vma, file, pgoff+pglen)) {
1070 if (prev && addr < prev->vm_end) /* case 4 */ 1075 if (prev && addr < prev->vm_end) /* case 4 */
@@ -1235,7 +1240,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1235 unsigned long flags, unsigned long pgoff, 1240 unsigned long flags, unsigned long pgoff,
1236 unsigned long *populate) 1241 unsigned long *populate)
1237{ 1242{
1238 struct mm_struct * mm = current->mm; 1243 struct mm_struct *mm = current->mm;
1239 vm_flags_t vm_flags; 1244 vm_flags_t vm_flags;
1240 1245
1241 *populate = 0; 1246 *populate = 0;
@@ -1263,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1263 1268
1264 /* offset overflow? */ 1269 /* offset overflow? */
1265 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1270 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1266 return -EOVERFLOW; 1271 return -EOVERFLOW;
1267 1272
1268 /* Too many mappings? */ 1273 /* Too many mappings? */
1269 if (mm->map_count > sysctl_max_map_count) 1274 if (mm->map_count > sysctl_max_map_count)
@@ -1921,7 +1926,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1921 info.align_mask = 0; 1926 info.align_mask = 0;
1922 return vm_unmapped_area(&info); 1927 return vm_unmapped_area(&info);
1923} 1928}
1924#endif 1929#endif
1925 1930
1926/* 1931/*
1927 * This mmap-allocator allocates new areas top-down from below the 1932 * This mmap-allocator allocates new areas top-down from below the
@@ -2321,13 +2326,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
2321} 2326}
2322 2327
2323struct vm_area_struct * 2328struct vm_area_struct *
2324find_extend_vma(struct mm_struct * mm, unsigned long addr) 2329find_extend_vma(struct mm_struct *mm, unsigned long addr)
2325{ 2330{
2326 struct vm_area_struct * vma; 2331 struct vm_area_struct *vma;
2327 unsigned long start; 2332 unsigned long start;
2328 2333
2329 addr &= PAGE_MASK; 2334 addr &= PAGE_MASK;
2330 vma = find_vma(mm,addr); 2335 vma = find_vma(mm, addr);
2331 if (!vma) 2336 if (!vma)
2332 return NULL; 2337 return NULL;
2333 if (vma->vm_start <= addr) 2338 if (vma->vm_start <= addr)
@@ -2376,7 +2381,7 @@ static void unmap_region(struct mm_struct *mm,
2376 struct vm_area_struct *vma, struct vm_area_struct *prev, 2381 struct vm_area_struct *vma, struct vm_area_struct *prev,
2377 unsigned long start, unsigned long end) 2382 unsigned long start, unsigned long end)
2378{ 2383{
2379 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 2384 struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2380 struct mmu_gather tlb; 2385 struct mmu_gather tlb;
2381 2386
2382 lru_add_drain(); 2387 lru_add_drain();
@@ -2423,7 +2428,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2423 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2428 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
2424 * munmap path where it doesn't make sense to fail. 2429 * munmap path where it doesn't make sense to fail.
2425 */ 2430 */
2426static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2431static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2427 unsigned long addr, int new_below) 2432 unsigned long addr, int new_below)
2428{ 2433{
2429 struct vm_area_struct *new; 2434 struct vm_area_struct *new;
@@ -2512,7 +2517,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2512 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2517 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2513 return -EINVAL; 2518 return -EINVAL;
2514 2519
2515 if ((len = PAGE_ALIGN(len)) == 0) 2520 len = PAGE_ALIGN(len);
2521 if (len == 0)
2516 return -EINVAL; 2522 return -EINVAL;
2517 2523
2518 /* Find the first overlapping VMA */ 2524 /* Find the first overlapping VMA */
@@ -2558,7 +2564,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2558 if (error) 2564 if (error)
2559 return error; 2565 return error;
2560 } 2566 }
2561 vma = prev? prev->vm_next: mm->mmap; 2567 vma = prev ? prev->vm_next : mm->mmap;
2562 2568
2563 /* 2569 /*
2564 * unlock any mlock()ed ranges before detaching vmas 2570 * unlock any mlock()ed ranges before detaching vmas
@@ -2621,10 +2627,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
2621 */ 2627 */
2622static unsigned long do_brk(unsigned long addr, unsigned long len) 2628static unsigned long do_brk(unsigned long addr, unsigned long len)
2623{ 2629{
2624 struct mm_struct * mm = current->mm; 2630 struct mm_struct *mm = current->mm;
2625 struct vm_area_struct * vma, * prev; 2631 struct vm_area_struct *vma, *prev;
2626 unsigned long flags; 2632 unsigned long flags;
2627 struct rb_node ** rb_link, * rb_parent; 2633 struct rb_node **rb_link, *rb_parent;
2628 pgoff_t pgoff = addr >> PAGE_SHIFT; 2634 pgoff_t pgoff = addr >> PAGE_SHIFT;
2629 int error; 2635 int error;
2630 2636
@@ -2848,7 +2854,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2848 * safe. It is only safe to keep the vm_pgoff 2854 * safe. It is only safe to keep the vm_pgoff
2849 * linear if there are no pages mapped yet. 2855 * linear if there are no pages mapped yet.
2850 */ 2856 */
2851 VM_BUG_ON(faulted_in_anon_vma); 2857 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
2852 *vmap = vma = new_vma; 2858 *vmap = vma = new_vma;
2853 } 2859 }
2854 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2860 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
@@ -3196,7 +3202,7 @@ void __init mmap_init(void)
3196{ 3202{
3197 int ret; 3203 int ret;
3198 3204
3199 ret = percpu_counter_init(&vm_committed_as, 0); 3205 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3200 VM_BUG_ON(ret); 3206 VM_BUG_ON(ret);
3201} 3207}
3202 3208
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 950813b1eb36..2c8da9825fe3 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -107,7 +107,8 @@ void __mmu_notifier_release(struct mm_struct *mm)
107 * existed or not. 107 * existed or not.
108 */ 108 */
109int __mmu_notifier_clear_flush_young(struct mm_struct *mm, 109int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
110 unsigned long address) 110 unsigned long start,
111 unsigned long end)
111{ 112{
112 struct mmu_notifier *mn; 113 struct mmu_notifier *mn;
113 int young = 0, id; 114 int young = 0, id;
@@ -115,7 +116,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
115 id = srcu_read_lock(&srcu); 116 id = srcu_read_lock(&srcu);
116 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
117 if (mn->ops->clear_flush_young) 118 if (mn->ops->clear_flush_young)
118 young |= mn->ops->clear_flush_young(mn, mm, address); 119 young |= mn->ops->clear_flush_young(mn, mm, start, end);
119 } 120 }
120 srcu_read_unlock(&srcu, id); 121 srcu_read_unlock(&srcu, id);
121 122
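
clear_flush_young() now takes a [start, end) range instead of a single address, and the wrapper still ORs together the "young" result from every registered notifier. A reduced model of that fan-out follows, with a fixed callback table standing in for the SRCU-protected notifier list; the device callbacks are hypothetical.

#include <stdio.h>

struct notifier {
	/* Callback reports whether any page in [start, end) was referenced. */
	int (*clear_flush_young)(unsigned long start, unsigned long end);
};

static int dev_a_young(unsigned long s, unsigned long e)
{
	return s <= 0x2000 && 0x2000 < e;	/* pretends 0x2000 was referenced */
}

static int dev_b_young(unsigned long s, unsigned long e)
{
	(void)s; (void)e;
	return 0;
}

static struct notifier notifiers[] = {
	{ .clear_flush_young = dev_a_young },
	{ .clear_flush_young = dev_b_young },
};

/*
 * Like __mmu_notifier_clear_flush_young(): accumulate "young" across all
 * registered notifiers for the whole range.
 */
static int clear_flush_young_range(unsigned long start, unsigned long end)
{
	int young = 0;

	for (unsigned i = 0; i < sizeof(notifiers) / sizeof(notifiers[0]); i++)
		if (notifiers[i].clear_flush_young)
			young |= notifiers[i].clear_flush_young(start, end);
	return young;
}

int main(void)
{
	printf("young: %d\n", clear_flush_young_range(0x1000, 0x3000));	/* 1 */
	printf("young: %d\n", clear_flush_young_range(0x4000, 0x5000));	/* 0 */
	return 0;
}
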
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180e9f21..b147f66f4c40 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -21,8 +21,8 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/mmu_notifier.h> 22#include <linux/mmu_notifier.h>
23#include <linux/sched/sysctl.h> 23#include <linux/sched/sysctl.h>
24#include <linux/uaccess.h>
24 25
25#include <asm/uaccess.h>
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
@@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
195 if (pmd_trans_huge(*old_pmd)) { 195 if (pmd_trans_huge(*old_pmd)) {
196 int err = 0; 196 int err = 0;
197 if (extent == HPAGE_PMD_SIZE) { 197 if (extent == HPAGE_PMD_SIZE) {
198 VM_BUG_ON(vma->vm_file || !vma->anon_vma); 198 VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
199 vma);
199 /* See comment in move_ptes() */ 200 /* See comment in move_ptes() */
200 if (need_rmap_locks) 201 if (need_rmap_locks)
201 anon_vma_lock_write(vma->anon_vma); 202 anon_vma_lock_write(vma->anon_vma);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7ed58602e71b..7c7ab32ee503 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void)
119 phys_addr_t start, end; 119 phys_addr_t start, end;
120 u64 i; 120 u64 i;
121 121
122 memblock_clear_hotplug(0, -1);
123
122 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) 124 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
123 count += __free_memory_core(start, end); 125 count += __free_memory_core(start, end);
124 126
diff --git a/mm/nommu.c b/mm/nommu.c
index a881d9673c6b..bd1808e194a7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -539,7 +539,7 @@ void __init mmap_init(void)
539{ 539{
540 int ret; 540 int ret;
541 541
542 ret = percpu_counter_init(&vm_committed_as, 0); 542 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
543 VM_BUG_ON(ret); 543 VM_BUG_ON(ret);
544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
545} 545}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e11df8fa7ec..bbf405a3a18f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
565 565
566 spin_lock(&zone_scan_lock); 566 spin_lock(&zone_scan_lock);
567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
568 if (zone_is_oom_locked(zone)) { 568 if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
569 ret = false; 569 ret = false;
570 goto out; 570 goto out;
571 } 571 }
@@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. 575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
576 */ 576 */
577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
578 zone_set_flag(zone, ZONE_OOM_LOCKED); 578 set_bit(ZONE_OOM_LOCKED, &zone->flags);
579 579
580out: 580out:
581 spin_unlock(&zone_scan_lock); 581 spin_unlock(&zone_scan_lock);
@@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
594 594
595 spin_lock(&zone_scan_lock); 595 spin_lock(&zone_scan_lock);
596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
597 zone_clear_flag(zone, ZONE_OOM_LOCKED); 597 clear_bit(ZONE_OOM_LOCKED, &zone->flags);
598 spin_unlock(&zone_scan_lock); 598 spin_unlock(&zone_scan_lock);
599} 599}
600 600
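These OOM hunks are part of a wider cleanup in this series: zone->flags is a plain unsigned long bitmap, so the zone_is_oom_locked()/zone_set_flag()/zone_clear_flag() wrappers go away in favour of the generic atomic bitops. A sketch of the resulting pattern; the helper names below are hypothetical, the diff simply open-codes the bitops at each call site:

#include <linux/bitops.h>
#include <linux/mmzone.h>

static bool zone_oom_locked(struct zone *zone)
{
	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
}

static void zone_oom_lock(struct zone *zone)
{
	set_bit(ZONE_OOM_LOCKED, &zone->flags);		/* atomic RMW */
}

static void zone_oom_unlock(struct zone *zone)
{
	clear_bit(ZONE_OOM_LOCKED, &zone->flags);
}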
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 91d73ef1744d..ff24c9d83112 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
1075 } 1075 }
1076 1076
1077 if (dirty < setpoint) { 1077 if (dirty < setpoint) {
1078 x = min(bdi->balanced_dirty_ratelimit, 1078 x = min3(bdi->balanced_dirty_ratelimit,
1079 min(balanced_dirty_ratelimit, task_ratelimit)); 1079 balanced_dirty_ratelimit, task_ratelimit);
1080 if (dirty_ratelimit < x) 1080 if (dirty_ratelimit < x)
1081 step = x - dirty_ratelimit; 1081 step = x - dirty_ratelimit;
1082 } else { 1082 } else {
1083 x = max(bdi->balanced_dirty_ratelimit, 1083 x = max3(bdi->balanced_dirty_ratelimit,
1084 max(balanced_dirty_ratelimit, task_ratelimit)); 1084 balanced_dirty_ratelimit, task_ratelimit);
1085 if (dirty_ratelimit > x) 1085 if (dirty_ratelimit > x)
1086 step = dirty_ratelimit - x; 1086 step = dirty_ratelimit - x;
1087 } 1087 }
@@ -1777,7 +1777,7 @@ void __init page_writeback_init(void)
1777 writeback_set_ratelimit(); 1777 writeback_set_ratelimit();
1778 register_cpu_notifier(&ratelimit_nb); 1778 register_cpu_notifier(&ratelimit_nb);
1779 1779
1780 fprop_global_init(&writeout_completions); 1780 fprop_global_init(&writeout_completions, GFP_KERNEL);
1781} 1781}
1782 1782
1783/** 1783/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18cee0d4c8a2..c9710c9bbee2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,8 +53,6 @@
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/compaction.h> 54#include <linux/compaction.h>
55#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 56#include <linux/prefetch.h>
59#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
60#include <linux/migrate.h> 58#include <linux/migrate.h>
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
85 */ 83 */
86DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 84DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87EXPORT_PER_CPU_SYMBOL(_numa_mem_); 85EXPORT_PER_CPU_SYMBOL(_numa_mem_);
86int _node_numa_mem_[MAX_NUMNODES];
88#endif 87#endif
89 88
90/* 89/*
@@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone,
1014 * Remove at a later date when no bug reports exist related to 1013 * Remove at a later date when no bug reports exist related to
1015 * grouping pages by mobility 1014 * grouping pages by mobility
1016 */ 1015 */
1017 BUG_ON(page_zone(start_page) != page_zone(end_page)); 1016 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1018#endif 1017#endif
1019 1018
1020 for (page = start_page; page <= end_page;) { 1019 for (page = start_page; page <= end_page;) {
@@ -1612,9 +1611,9 @@ again:
1612 } 1611 }
1613 1612
1614 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1613 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1615 if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && 1614 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
1616 !zone_is_fair_depleted(zone)) 1615 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
1617 zone_set_flag(zone, ZONE_FAIR_DEPLETED); 1616 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1618 1617
1619 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1618 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1620 zone_statistics(preferred_zone, zone, gfp_flags); 1619 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
1934 mod_zone_page_state(zone, NR_ALLOC_BATCH, 1933 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1935 high_wmark_pages(zone) - low_wmark_pages(zone) - 1934 high_wmark_pages(zone) - low_wmark_pages(zone) -
1936 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 1935 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1937 zone_clear_flag(zone, ZONE_FAIR_DEPLETED); 1936 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1938 } while (zone++ != preferred_zone); 1937 } while (zone++ != preferred_zone);
1939} 1938}
1940 1939
@@ -1985,7 +1984,7 @@ zonelist_scan:
1985 if (alloc_flags & ALLOC_FAIR) { 1984 if (alloc_flags & ALLOC_FAIR) {
1986 if (!zone_local(preferred_zone, zone)) 1985 if (!zone_local(preferred_zone, zone))
1987 break; 1986 break;
1988 if (zone_is_fair_depleted(zone)) { 1987 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
1989 nr_fair_skipped++; 1988 nr_fair_skipped++;
1990 continue; 1989 continue;
1991 } 1990 }
@@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 struct zonelist *zonelist, enum zone_type high_zoneidx, 2295 struct zonelist *zonelist, enum zone_type high_zoneidx,
2297 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2296 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2298 int classzone_idx, int migratetype, enum migrate_mode mode, 2297 int classzone_idx, int migratetype, enum migrate_mode mode,
2299 bool *contended_compaction, bool *deferred_compaction, 2298 int *contended_compaction, bool *deferred_compaction)
2300 unsigned long *did_some_progress)
2301{ 2299{
2302 if (!order) 2300 struct zone *last_compact_zone = NULL;
2303 return NULL; 2301 unsigned long compact_result;
2302 struct page *page;
2304 2303
2305 if (compaction_deferred(preferred_zone, order)) { 2304 if (!order)
2306 *deferred_compaction = true;
2307 return NULL; 2305 return NULL;
2308 }
2309 2306
2310 current->flags |= PF_MEMALLOC; 2307 current->flags |= PF_MEMALLOC;
2311 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2308 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2312 nodemask, mode, 2309 nodemask, mode,
2313 contended_compaction); 2310 contended_compaction,
2311 &last_compact_zone);
2314 current->flags &= ~PF_MEMALLOC; 2312 current->flags &= ~PF_MEMALLOC;
2315 2313
2316 if (*did_some_progress != COMPACT_SKIPPED) { 2314 switch (compact_result) {
2317 struct page *page; 2315 case COMPACT_DEFERRED:
2316 *deferred_compaction = true;
2317 /* fall-through */
2318 case COMPACT_SKIPPED:
2319 return NULL;
2320 default:
2321 break;
2322 }
2318 2323
2319 /* Page migration frees to the PCP lists but we want merging */ 2324 /*
2320 drain_pages(get_cpu()); 2325 * At least in one zone compaction wasn't deferred or skipped, so let's
2321 put_cpu(); 2326 * count a compaction stall
2327 */
2328 count_vm_event(COMPACTSTALL);
2322 2329
2323 page = get_page_from_freelist(gfp_mask, nodemask, 2330 /* Page migration frees to the PCP lists but we want merging */
2324 order, zonelist, high_zoneidx, 2331 drain_pages(get_cpu());
2325 alloc_flags & ~ALLOC_NO_WATERMARKS, 2332 put_cpu();
2326 preferred_zone, classzone_idx, migratetype);
2327 if (page) {
2328 preferred_zone->compact_blockskip_flush = false;
2329 compaction_defer_reset(preferred_zone, order, true);
2330 count_vm_event(COMPACTSUCCESS);
2331 return page;
2332 }
2333 2333
2334 /* 2334 page = get_page_from_freelist(gfp_mask, nodemask,
2335 * It's bad if compaction run occurs and fails. 2335 order, zonelist, high_zoneidx,
2336 * The most likely reason is that pages exist, 2336 alloc_flags & ~ALLOC_NO_WATERMARKS,
2337 * but not enough to satisfy watermarks. 2337 preferred_zone, classzone_idx, migratetype);
2338 */
2339 count_vm_event(COMPACTFAIL);
2340 2338
2341 /* 2339 if (page) {
2342 * As async compaction considers a subset of pageblocks, only 2340 struct zone *zone = page_zone(page);
2343 * defer if the failure was a sync compaction failure.
2344 */
2345 if (mode != MIGRATE_ASYNC)
2346 defer_compaction(preferred_zone, order);
2347 2341
2348 cond_resched(); 2342 zone->compact_blockskip_flush = false;
2343 compaction_defer_reset(zone, order, true);
2344 count_vm_event(COMPACTSUCCESS);
2345 return page;
2349 } 2346 }
2350 2347
2348 /*
2349 * last_compact_zone is where try_to_compact_pages thought allocation
2350 * should succeed, so it did not defer compaction. But here we know
2351 * that it didn't succeed, so we do the defer.
2352 */
2353 if (last_compact_zone && mode != MIGRATE_ASYNC)
2354 defer_compaction(last_compact_zone, order);
2355
2356 /*
2357 * It's bad if compaction run occurs and fails. The most likely reason
2358 * is that pages exist, but not enough to satisfy watermarks.
2359 */
2360 count_vm_event(COMPACTFAIL);
2361
2362 cond_resched();
2363
2351 return NULL; 2364 return NULL;
2352} 2365}
2353#else 2366#else
@@ -2355,9 +2368,8 @@ static inline struct page *
2355__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2368__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2356 struct zonelist *zonelist, enum zone_type high_zoneidx, 2369 struct zonelist *zonelist, enum zone_type high_zoneidx,
2357 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2370 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2358 int classzone_idx, int migratetype, 2371 int classzone_idx, int migratetype, enum migrate_mode mode,
2359 enum migrate_mode mode, bool *contended_compaction, 2372 int *contended_compaction, bool *deferred_compaction)
2360 bool *deferred_compaction, unsigned long *did_some_progress)
2361{ 2373{
2362 return NULL; 2374 return NULL;
2363} 2375}
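Condensed outline of the reworked direct-compaction path above: try_to_compact_pages() now reports a compact_result plus the last zone it worked on, a COMPACTSTALL is counted as soon as any zone was actually attempted, and deferral is recorded against that last zone rather than preferred_zone. The PCP drain and the allocation plumbing are elided, and try_freelists() is a stand-in for the get_page_from_freelist() retry, so treat this as a sketch, not the upstream function:

static struct page *direct_compact_outline(gfp_t gfp_mask, unsigned int order,
					   struct zonelist *zonelist,
					   nodemask_t *nodemask,
					   enum migrate_mode mode,
					   int *contended,
					   bool *deferred_compaction)
{
	struct zone *last_compact_zone = NULL;
	unsigned long result;
	struct page *page;

	current->flags |= PF_MEMALLOC;
	result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
				      mode, contended, &last_compact_zone);
	current->flags &= ~PF_MEMALLOC;

	if (result == COMPACT_DEFERRED)
		*deferred_compaction = true;
	if (result == COMPACT_DEFERRED || result == COMPACT_SKIPPED)
		return NULL;			/* nothing was even tried */

	count_vm_event(COMPACTSTALL);		/* at least one zone compacted */

	page = try_freelists(gfp_mask, order);	/* stand-in for the retry */
	if (page) {
		/* credit the zone that actually produced the page */
		compaction_defer_reset(page_zone(page), order, true);
		count_vm_event(COMPACTSUCCESS);
		return page;
	}

	/* compaction believed in last_compact_zone but it did not deliver */
	if (last_compact_zone && mode != MIGRATE_ASYNC)
		defer_compaction(last_compact_zone, order);

	count_vm_event(COMPACTFAIL);
	cond_resched();
	return NULL;
}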
@@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2457static void wake_all_kswapds(unsigned int order, 2469static void wake_all_kswapds(unsigned int order,
2458 struct zonelist *zonelist, 2470 struct zonelist *zonelist,
2459 enum zone_type high_zoneidx, 2471 enum zone_type high_zoneidx,
2460 struct zone *preferred_zone) 2472 struct zone *preferred_zone,
2473 nodemask_t *nodemask)
2461{ 2474{
2462 struct zoneref *z; 2475 struct zoneref *z;
2463 struct zone *zone; 2476 struct zone *zone;
2464 2477
2465 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2478 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2479 high_zoneidx, nodemask)
2466 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2480 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2467} 2481}
2468 2482
@@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2509 alloc_flags |= ALLOC_NO_WATERMARKS; 2523 alloc_flags |= ALLOC_NO_WATERMARKS;
2510 } 2524 }
2511#ifdef CONFIG_CMA 2525#ifdef CONFIG_CMA
2512 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2526 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2513 alloc_flags |= ALLOC_CMA; 2527 alloc_flags |= ALLOC_CMA;
2514#endif 2528#endif
2515 return alloc_flags; 2529 return alloc_flags;
@@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2533 unsigned long did_some_progress; 2547 unsigned long did_some_progress;
2534 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2548 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2535 bool deferred_compaction = false; 2549 bool deferred_compaction = false;
2536 bool contended_compaction = false; 2550 int contended_compaction = COMPACT_CONTENDED_NONE;
2537 2551
2538 /* 2552 /*
2539 * In the slowpath, we sanity check order to avoid ever trying to 2553 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2560 2574
2561restart: 2575restart:
2562 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2576 if (!(gfp_mask & __GFP_NO_KSWAPD))
2563 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2577 wake_all_kswapds(order, zonelist, high_zoneidx,
2578 preferred_zone, nodemask);
2564 2579
2565 /* 2580 /*
2566 * OK, we're below the kswapd watermark and have kicked background 2581 * OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2648,40 @@ rebalance:
2633 preferred_zone, 2648 preferred_zone,
2634 classzone_idx, migratetype, 2649 classzone_idx, migratetype,
2635 migration_mode, &contended_compaction, 2650 migration_mode, &contended_compaction,
2636 &deferred_compaction, 2651 &deferred_compaction);
2637 &did_some_progress);
2638 if (page) 2652 if (page)
2639 goto got_pg; 2653 goto got_pg;
2640 2654
2641 /* 2655 /* Checks for THP-specific high-order allocations */
2642 * If compaction is deferred for high-order allocations, it is because 2656 if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
2643 * sync compaction recently failed. In this is the case and the caller 2657 /*
2644 * requested a movable allocation that does not heavily disrupt the 2658 * If compaction is deferred for high-order allocations, it is
2645 * system then fail the allocation instead of entering direct reclaim. 2659 * because sync compaction recently failed. If this is the case
2646 */ 2660 * and the caller requested a THP allocation, we do not want
2647 if ((deferred_compaction || contended_compaction) && 2661 * to heavily disrupt the system, so we fail the allocation
2648 (gfp_mask & __GFP_NO_KSWAPD)) 2662 * instead of entering direct reclaim.
2649 goto nopage; 2663 */
2664 if (deferred_compaction)
2665 goto nopage;
2666
2667 /*
2668 * In all zones where compaction was attempted (and not
2669 * deferred or skipped), lock contention has been detected.
2670 * For THP allocation we do not want to disrupt the others
2671 * so we fallback to base pages instead.
2672 */
2673 if (contended_compaction == COMPACT_CONTENDED_LOCK)
2674 goto nopage;
2675
2676 /*
2677 * If compaction was aborted due to need_resched(), we do not
2678 * want to further increase allocation latency, unless it is
2679 * khugepaged trying to collapse.
2680 */
2681 if (contended_compaction == COMPACT_CONTENDED_SCHED
2682 && !(current->flags & PF_KTHREAD))
2683 goto nopage;
2684 }
2650 2685
2651 /* 2686 /*
2652 * It can become very expensive to allocate transparent hugepages at 2687 * It can become very expensive to allocate transparent hugepages at
@@ -2726,8 +2761,7 @@ rebalance:
2726 preferred_zone, 2761 preferred_zone,
2727 classzone_idx, migratetype, 2762 classzone_idx, migratetype,
2728 migration_mode, &contended_compaction, 2763 migration_mode, &contended_compaction,
2729 &deferred_compaction, 2764 &deferred_compaction);
2730 &did_some_progress);
2731 if (page) 2765 if (page)
2732 goto got_pg; 2766 goto got_pg;
2733 } 2767 }
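For reference, the THP-specific bail-out logic added above can be read as a single predicate. should_fail_thp_compaction() is an invented helper; the conditions and the COMPACT_CONTENDED_* values are the ones introduced by this series:

static bool should_fail_thp_compaction(gfp_t gfp_mask,
				       bool deferred_compaction,
				       int contended_compaction)
{
	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE)
		return false;			/* not a THP allocation */

	/* sync compaction failed recently: don't disrupt the system */
	if (deferred_compaction)
		return true;

	/* every zone that was tried hit lock contention: use base pages */
	if (contended_compaction == COMPACT_CONTENDED_LOCK)
		return true;

	/* aborted because of need_resched(): only khugepaged keeps going */
	if (contended_compaction == COMPACT_CONTENDED_SCHED &&
	    !(current->flags & PF_KTHREAD))
		return true;

	return false;
}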
@@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2753 struct zone *preferred_zone; 2787 struct zone *preferred_zone;
2754 struct zoneref *preferred_zoneref; 2788 struct zoneref *preferred_zoneref;
2755 struct page *page = NULL; 2789 struct page *page = NULL;
2756 int migratetype = allocflags_to_migratetype(gfp_mask); 2790 int migratetype = gfpflags_to_migratetype(gfp_mask);
2757 unsigned int cpuset_mems_cookie; 2791 unsigned int cpuset_mems_cookie;
2758 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2792 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2759 int classzone_idx; 2793 int classzone_idx;
@@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2775 if (unlikely(!zonelist->_zonerefs->zone)) 2809 if (unlikely(!zonelist->_zonerefs->zone))
2776 return NULL; 2810 return NULL;
2777 2811
2812 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
2813 alloc_flags |= ALLOC_CMA;
2814
2778retry_cpuset: 2815retry_cpuset:
2779 cpuset_mems_cookie = read_mems_allowed_begin(); 2816 cpuset_mems_cookie = read_mems_allowed_begin();
2780 2817
@@ -2786,10 +2823,6 @@ retry_cpuset:
2786 goto out; 2823 goto out;
2787 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2824 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2788 2825
2789#ifdef CONFIG_CMA
2790 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2791 alloc_flags |= ALLOC_CMA;
2792#endif
2793 /* First allocation attempt */ 2826 /* First allocation attempt */
2794 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2827 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2795 zonelist, high_zoneidx, alloc_flags, 2828 zonelist, high_zoneidx, alloc_flags,
@@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3579 zonelist->_zonerefs[pos].zone_idx = 0; 3612 zonelist->_zonerefs[pos].zone_idx = 0;
3580} 3613}
3581 3614
3615#if defined(CONFIG_64BIT)
3616/*
3617 * Devices that require DMA32/DMA are relatively rare and do not justify a
3618 * penalty to every machine for the sake of that specialised case. Default
3619 * to Node-ordering on 64-bit NUMA machines.
3620 */
3621static int default_zonelist_order(void)
3622{
3623 return ZONELIST_ORDER_NODE;
3624}
3625#else
3626/*
3627 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
3628 * by the kernel. If processes running on node 0 deplete the low memory zone
3629 * then reclaim will occur more frequently, increasing stalls and potentially
3630 * making it easier to OOM if a large percentage of the zone is under writeback or
3631 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
3632 * Hence, default to zone ordering on 32-bit.
3633 */
3582static int default_zonelist_order(void) 3634static int default_zonelist_order(void)
3583{ 3635{
3584 int nid, zone_type;
3585 unsigned long low_kmem_size, total_size;
3586 struct zone *z;
3587 int average_size;
3588 /*
3589 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3590 * If they are really small and used heavily, the system can fall
3591 * into OOM very easily.
3592 * This function detect ZONE_DMA/DMA32 size and configures zone order.
3593 */
3594 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3595 low_kmem_size = 0;
3596 total_size = 0;
3597 for_each_online_node(nid) {
3598 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3599 z = &NODE_DATA(nid)->node_zones[zone_type];
3600 if (populated_zone(z)) {
3601 if (zone_type < ZONE_NORMAL)
3602 low_kmem_size += z->managed_pages;
3603 total_size += z->managed_pages;
3604 } else if (zone_type == ZONE_NORMAL) {
3605 /*
3606 * If any node has only lowmem, then node order
3607 * is preferred to allow kernel allocations
3608 * locally; otherwise, they can easily infringe
3609 * on other nodes when there is an abundance of
3610 * lowmem available to allocate from.
3611 */
3612 return ZONELIST_ORDER_NODE;
3613 }
3614 }
3615 }
3616 if (!low_kmem_size || /* there are no DMA area. */
3617 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3618 return ZONELIST_ORDER_NODE;
3619 /*
3620 * look into each node's config.
3621 * If there is a node whose DMA/DMA32 memory is very big area on
3622 * local memory, NODE_ORDER may be suitable.
3623 */
3624 average_size = total_size /
3625 (nodes_weight(node_states[N_MEMORY]) + 1);
3626 for_each_online_node(nid) {
3627 low_kmem_size = 0;
3628 total_size = 0;
3629 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3630 z = &NODE_DATA(nid)->node_zones[zone_type];
3631 if (populated_zone(z)) {
3632 if (zone_type < ZONE_NORMAL)
3633 low_kmem_size += z->present_pages;
3634 total_size += z->present_pages;
3635 }
3636 }
3637 if (low_kmem_size &&
3638 total_size > average_size && /* ignore small node */
3639 low_kmem_size > total_size * 70/100)
3640 return ZONELIST_ORDER_NODE;
3641 }
3642 return ZONELIST_ORDER_ZONE; 3636 return ZONELIST_ORDER_ZONE;
3643} 3637}
3638#endif /* CONFIG_64BIT */
3644 3639
3645static void set_zonelist_order(void) 3640static void set_zonelist_order(void)
3646{ 3641{
@@ -5701,9 +5696,8 @@ static void __setup_per_zone_wmarks(void)
5701 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5696 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5702 5697
5703 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5698 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5704 high_wmark_pages(zone) - 5699 high_wmark_pages(zone) - low_wmark_pages(zone) -
5705 low_wmark_pages(zone) - 5700 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
5706 zone_page_state(zone, NR_ALLOC_BATCH));
5707 5701
5708 setup_zone_migrate_reserve(zone); 5702 setup_zone_migrate_reserve(zone);
5709 spin_unlock_irqrestore(&zone->lock, flags); 5703 spin_unlock_irqrestore(&zone->lock, flags);
@@ -6278,8 +6272,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6278 6272
6279 if (list_empty(&cc->migratepages)) { 6273 if (list_empty(&cc->migratepages)) {
6280 cc->nr_migratepages = 0; 6274 cc->nr_migratepages = 0;
6281 pfn = isolate_migratepages_range(cc->zone, cc, 6275 pfn = isolate_migratepages_range(cc, pfn, end);
6282 pfn, end, true);
6283 if (!pfn) { 6276 if (!pfn) {
6284 ret = -EINTR; 6277 ret = -EINTR;
6285 break; 6278 break;
@@ -6555,97 +6548,3 @@ bool is_free_buddy_page(struct page *page)
6555 return order < MAX_ORDER; 6548 return order < MAX_ORDER;
6556} 6549}
6557#endif 6550#endif
6558
6559static const struct trace_print_flags pageflag_names[] = {
6560 {1UL << PG_locked, "locked" },
6561 {1UL << PG_error, "error" },
6562 {1UL << PG_referenced, "referenced" },
6563 {1UL << PG_uptodate, "uptodate" },
6564 {1UL << PG_dirty, "dirty" },
6565 {1UL << PG_lru, "lru" },
6566 {1UL << PG_active, "active" },
6567 {1UL << PG_slab, "slab" },
6568 {1UL << PG_owner_priv_1, "owner_priv_1" },
6569 {1UL << PG_arch_1, "arch_1" },
6570 {1UL << PG_reserved, "reserved" },
6571 {1UL << PG_private, "private" },
6572 {1UL << PG_private_2, "private_2" },
6573 {1UL << PG_writeback, "writeback" },
6574#ifdef CONFIG_PAGEFLAGS_EXTENDED
6575 {1UL << PG_head, "head" },
6576 {1UL << PG_tail, "tail" },
6577#else
6578 {1UL << PG_compound, "compound" },
6579#endif
6580 {1UL << PG_swapcache, "swapcache" },
6581 {1UL << PG_mappedtodisk, "mappedtodisk" },
6582 {1UL << PG_reclaim, "reclaim" },
6583 {1UL << PG_swapbacked, "swapbacked" },
6584 {1UL << PG_unevictable, "unevictable" },
6585#ifdef CONFIG_MMU
6586 {1UL << PG_mlocked, "mlocked" },
6587#endif
6588#ifdef CONFIG_ARCH_USES_PG_UNCACHED
6589 {1UL << PG_uncached, "uncached" },
6590#endif
6591#ifdef CONFIG_MEMORY_FAILURE
6592 {1UL << PG_hwpoison, "hwpoison" },
6593#endif
6594#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6595 {1UL << PG_compound_lock, "compound_lock" },
6596#endif
6597};
6598
6599static void dump_page_flags(unsigned long flags)
6600{
6601 const char *delim = "";
6602 unsigned long mask;
6603 int i;
6604
6605 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6606
6607 printk(KERN_ALERT "page flags: %#lx(", flags);
6608
6609 /* remove zone id */
6610 flags &= (1UL << NR_PAGEFLAGS) - 1;
6611
6612 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6613
6614 mask = pageflag_names[i].mask;
6615 if ((flags & mask) != mask)
6616 continue;
6617
6618 flags &= ~mask;
6619 printk("%s%s", delim, pageflag_names[i].name);
6620 delim = "|";
6621 }
6622
6623 /* check for left over flags */
6624 if (flags)
6625 printk("%s%#lx", delim, flags);
6626
6627 printk(")\n");
6628}
6629
6630void dump_page_badflags(struct page *page, const char *reason,
6631 unsigned long badflags)
6632{
6633 printk(KERN_ALERT
6634 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6635 page, atomic_read(&page->_count), page_mapcount(page),
6636 page->mapping, page->index);
6637 dump_page_flags(page->flags);
6638 if (reason)
6639 pr_alert("page dumped because: %s\n", reason);
6640 if (page->flags & badflags) {
6641 pr_alert("bad because of flags:\n");
6642 dump_page_flags(page->flags & badflags);
6643 }
6644 mem_cgroup_print_bad_page(page);
6645}
6646
6647void dump_page(struct page *page, const char *reason)
6648{
6649 dump_page_badflags(page, reason, 0);
6650}
6651EXPORT_SYMBOL(dump_page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2beeabf502c5..ad83195521f2 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
177 if (!walk->mm) 177 if (!walk->mm)
178 return -EINVAL; 178 return -EINVAL;
179 179
180 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); 180 VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
181 181
182 pgd = pgd_offset(walk->mm, addr); 182 pgd = pgd_offset(walk->mm, addr);
183 do { 183 do {
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 89633fefc6a2..10e3d0b8a86d 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -33,17 +33,14 @@
33 33
34#include <linux/log2.h> 34#include <linux/log2.h>
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
37 int page_start, int page_end)
37{ 38{
38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
43 return 0; 39 return 0;
44} 40}
45 41
46static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 42static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
43 int page_start, int page_end)
47{ 44{
48 /* nada */ 45 /* nada */
49} 46}
@@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
70 67
71 chunk->data = pages; 68 chunk->data = pages;
72 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; 69 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
70
71 spin_lock_irq(&pcpu_lock);
72 pcpu_chunk_populated(chunk, 0, nr_pages);
73 spin_unlock_irq(&pcpu_lock);
74
73 return chunk; 75 return chunk;
74} 76}
75 77
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 3707c71ae4cd..538998a137d2 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
20} 20}
21 21
22/** 22/**
23 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap 23 * pcpu_get_pages - get temp pages array
24 * @chunk: chunk of interest 24 * @chunk: chunk of interest
25 * @bitmapp: output parameter for bitmap
26 * @may_alloc: may allocate the array
27 * 25 *
28 * Returns pointer to array of pointers to struct page and bitmap, 26 * Returns pointer to array of pointers to struct page which can be indexed
29 * both of which can be indexed with pcpu_page_idx(). The returned 27 * with pcpu_page_idx(). Note that there is only one array and accesses
30 * array is cleared to zero and *@bitmapp is copied from 28 * should be serialized by pcpu_alloc_mutex.
31 * @chunk->populated. Note that there is only one array and bitmap
32 * and access exclusion is the caller's responsibility.
33 *
34 * CONTEXT:
35 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
36 * Otherwise, don't care.
37 * 29 *
38 * RETURNS: 30 * RETURNS:
39 * Pointer to temp pages array on success, NULL on failure. 31 * Pointer to temp pages array on success.
40 */ 32 */
41static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, 33static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc)
42 unsigned long **bitmapp,
43 bool may_alloc)
44{ 34{
45 static struct page **pages; 35 static struct page **pages;
46 static unsigned long *bitmap;
47 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); 36 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
48 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
49 sizeof(unsigned long);
50
51 if (!pages || !bitmap) {
52 if (may_alloc && !pages)
53 pages = pcpu_mem_zalloc(pages_size);
54 if (may_alloc && !bitmap)
55 bitmap = pcpu_mem_zalloc(bitmap_size);
56 if (!pages || !bitmap)
57 return NULL;
58 }
59 37
60 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); 38 lockdep_assert_held(&pcpu_alloc_mutex);
61 39
62 *bitmapp = bitmap; 40 if (!pages)
41 pages = pcpu_mem_zalloc(pages_size);
63 return pages; 42 return pages;
64} 43}
65 44
@@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
67 * pcpu_free_pages - free pages which were allocated for @chunk 46 * pcpu_free_pages - free pages which were allocated for @chunk
68 * @chunk: chunk pages were allocated for 47 * @chunk: chunk pages were allocated for
69 * @pages: array of pages to be freed, indexed by pcpu_page_idx() 48 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
70 * @populated: populated bitmap
71 * @page_start: page index of the first page to be freed 49 * @page_start: page index of the first page to be freed
72 * @page_end: page index of the last page to be freed + 1 50 * @page_end: page index of the last page to be freed + 1
73 * 51 *
@@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
75 * The pages were allocated for @chunk. 53 * The pages were allocated for @chunk.
76 */ 54 */
77static void pcpu_free_pages(struct pcpu_chunk *chunk, 55static void pcpu_free_pages(struct pcpu_chunk *chunk,
78 struct page **pages, unsigned long *populated, 56 struct page **pages, int page_start, int page_end)
79 int page_start, int page_end)
80{ 57{
81 unsigned int cpu; 58 unsigned int cpu;
82 int i; 59 int i;
@@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
95 * pcpu_alloc_pages - allocates pages for @chunk 72 * pcpu_alloc_pages - allocates pages for @chunk
96 * @chunk: target chunk 73 * @chunk: target chunk
97 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() 74 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
98 * @populated: populated bitmap
99 * @page_start: page index of the first page to be allocated 75 * @page_start: page index of the first page to be allocated
100 * @page_end: page index of the last page to be allocated + 1 76 * @page_end: page index of the last page to be allocated + 1
101 * 77 *
@@ -104,11 +80,10 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
104 * content of @pages and will pass it verbatim to pcpu_map_pages(). 80 * content of @pages and will pass it verbatim to pcpu_map_pages().
105 */ 81 */
106static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 82static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
107 struct page **pages, unsigned long *populated, 83 struct page **pages, int page_start, int page_end)
108 int page_start, int page_end)
109{ 84{
110 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 85 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
111 unsigned int cpu; 86 unsigned int cpu, tcpu;
112 int i; 87 int i;
113 88
114 for_each_possible_cpu(cpu) { 89 for_each_possible_cpu(cpu) {
@@ -116,14 +91,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
116 struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; 91 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
117 92
118 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); 93 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
119 if (!*pagep) { 94 if (!*pagep)
120 pcpu_free_pages(chunk, pages, populated, 95 goto err;
121 page_start, page_end);
122 return -ENOMEM;
123 }
124 } 96 }
125 } 97 }
126 return 0; 98 return 0;
99
100err:
101 while (--i >= page_start)
102 __free_page(pages[pcpu_page_idx(cpu, i)]);
103
104 for_each_possible_cpu(tcpu) {
105 if (tcpu == cpu)
106 break;
107 for (i = page_start; i < page_end; i++)
108 __free_page(pages[pcpu_page_idx(tcpu, i)]);
109 }
110 return -ENOMEM;
127} 111}
128 112
129/** 113/**
@@ -155,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
155 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 139 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
156 * @chunk: chunk of interest 140 * @chunk: chunk of interest
157 * @pages: pages array which can be used to pass information to free 141 * @pages: pages array which can be used to pass information to free
158 * @populated: populated bitmap
159 * @page_start: page index of the first page to unmap 142 * @page_start: page index of the first page to unmap
160 * @page_end: page index of the last page to unmap + 1 143 * @page_end: page index of the last page to unmap + 1
161 * 144 *
@@ -166,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
166 * proper pre/post flush functions. 149 * proper pre/post flush functions.
167 */ 150 */
168static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 151static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
169 struct page **pages, unsigned long *populated, 152 struct page **pages, int page_start, int page_end)
170 int page_start, int page_end)
171{ 153{
172 unsigned int cpu; 154 unsigned int cpu;
173 int i; 155 int i;
@@ -183,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
183 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), 165 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
184 page_end - page_start); 166 page_end - page_start);
185 } 167 }
186
187 bitmap_clear(populated, page_start, page_end - page_start);
188} 168}
189 169
190/** 170/**
@@ -219,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
219 * pcpu_map_pages - map pages into a pcpu_chunk 199 * pcpu_map_pages - map pages into a pcpu_chunk
220 * @chunk: chunk of interest 200 * @chunk: chunk of interest
221 * @pages: pages array containing pages to be mapped 201 * @pages: pages array containing pages to be mapped
222 * @populated: populated bitmap
223 * @page_start: page index of the first page to map 202 * @page_start: page index of the first page to map
224 * @page_end: page index of the last page to map + 1 203 * @page_end: page index of the last page to map + 1
225 * 204 *
@@ -227,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
227 * caller is responsible for calling pcpu_post_map_flush() after all 206 * caller is responsible for calling pcpu_post_map_flush() after all
228 * mappings are complete. 207 * mappings are complete.
229 * 208 *
230 * This function is responsible for setting corresponding bits in 209 * This function is responsible for setting up whatever is necessary for
231 * @chunk->populated bitmap and whatever is necessary for reverse 210 * reverse lookup (addr -> chunk).
232 * lookup (addr -> chunk).
233 */ 211 */
234static int pcpu_map_pages(struct pcpu_chunk *chunk, 212static int pcpu_map_pages(struct pcpu_chunk *chunk,
235 struct page **pages, unsigned long *populated, 213 struct page **pages, int page_start, int page_end)
236 int page_start, int page_end)
237{ 214{
238 unsigned int cpu, tcpu; 215 unsigned int cpu, tcpu;
239 int i, err; 216 int i, err;
@@ -244,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk,
244 page_end - page_start); 221 page_end - page_start);
245 if (err < 0) 222 if (err < 0)
246 goto err; 223 goto err;
247 }
248 224
249 /* mapping successful, link chunk and mark populated */ 225 for (i = page_start; i < page_end; i++)
250 for (i = page_start; i < page_end; i++) {
251 for_each_possible_cpu(cpu)
252 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], 226 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
253 chunk); 227 chunk);
254 __set_bit(i, populated);
255 } 228 }
256
257 return 0; 229 return 0;
258
259err: 230err:
260 for_each_possible_cpu(tcpu) { 231 for_each_possible_cpu(tcpu) {
261 if (tcpu == cpu) 232 if (tcpu == cpu)
@@ -263,6 +234,7 @@ err:
263 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), 234 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
264 page_end - page_start); 235 page_end - page_start);
265 } 236 }
237 pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
266 return err; 238 return err;
267} 239}
268 240
@@ -289,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
289/** 261/**
290 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk 262 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
291 * @chunk: chunk of interest 263 * @chunk: chunk of interest
292 * @off: offset to the area to populate 264 * @page_start: the start page
293 * @size: size of the area to populate in bytes 265 * @page_end: the end page
294 * 266 *
295 * For each cpu, populate and map pages [@page_start,@page_end) into 267 * For each cpu, populate and map pages [@page_start,@page_end) into
296 * @chunk. The area is cleared on return. 268 * @chunk.
297 * 269 *
298 * CONTEXT: 270 * CONTEXT:
299 * pcpu_alloc_mutex, does GFP_KERNEL allocation. 271 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
300 */ 272 */
301static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 273static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
274 int page_start, int page_end)
302{ 275{
303 int page_start = PFN_DOWN(off);
304 int page_end = PFN_UP(off + size);
305 int free_end = page_start, unmap_end = page_start;
306 struct page **pages; 276 struct page **pages;
307 unsigned long *populated;
308 unsigned int cpu;
309 int rs, re, rc;
310
311 /* quick path, check whether all pages are already there */
312 rs = page_start;
313 pcpu_next_pop(chunk, &rs, &re, page_end);
314 if (rs == page_start && re == page_end)
315 goto clear;
316 277
317 /* need to allocate and map pages, this chunk can't be immutable */ 278 pages = pcpu_get_pages(chunk);
318 WARN_ON(chunk->immutable);
319
320 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
321 if (!pages) 279 if (!pages)
322 return -ENOMEM; 280 return -ENOMEM;
323 281
324 /* alloc and map */ 282 if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
325 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 283 return -ENOMEM;
326 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
327 if (rc)
328 goto err_free;
329 free_end = re;
330 }
331 284
332 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 285 if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
333 rc = pcpu_map_pages(chunk, pages, populated, rs, re); 286 pcpu_free_pages(chunk, pages, page_start, page_end);
334 if (rc) 287 return -ENOMEM;
335 goto err_unmap;
336 unmap_end = re;
337 } 288 }
338 pcpu_post_map_flush(chunk, page_start, page_end); 289 pcpu_post_map_flush(chunk, page_start, page_end);
339 290
340 /* commit new bitmap */
341 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
342clear:
343 for_each_possible_cpu(cpu)
344 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
345 return 0; 291 return 0;
346
347err_unmap:
348 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
349 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
350 pcpu_unmap_pages(chunk, pages, populated, rs, re);
351 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
352err_free:
353 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
354 pcpu_free_pages(chunk, pages, populated, rs, re);
355 return rc;
356} 292}
357 293
358/** 294/**
359 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 295 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
360 * @chunk: chunk to depopulate 296 * @chunk: chunk to depopulate
361 * @off: offset to the area to depopulate 297 * @page_start: the start page
362 * @size: size of the area to depopulate in bytes 298 * @page_end: the end page
363 * 299 *
364 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 300 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
365 * from @chunk. If @flush is true, vcache is flushed before unmapping 301 * from @chunk.
366 * and tlb after.
367 * 302 *
368 * CONTEXT: 303 * CONTEXT:
369 * pcpu_alloc_mutex. 304 * pcpu_alloc_mutex.
370 */ 305 */
371static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 306static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
307 int page_start, int page_end)
372{ 308{
373 int page_start = PFN_DOWN(off);
374 int page_end = PFN_UP(off + size);
375 struct page **pages; 309 struct page **pages;
376 unsigned long *populated;
377 int rs, re;
378
379 /* quick path, check whether it's empty already */
380 rs = page_start;
381 pcpu_next_unpop(chunk, &rs, &re, page_end);
382 if (rs == page_start && re == page_end)
383 return;
384
385 /* immutable chunks can't be depopulated */
386 WARN_ON(chunk->immutable);
387 310
388 /* 311 /*
389 * If control reaches here, there must have been at least one 312 * If control reaches here, there must have been at least one
390 * successful population attempt so the temp pages array must 313 * successful population attempt so the temp pages array must
391 * be available now. 314 * be available now.
392 */ 315 */
393 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); 316 pages = pcpu_get_pages(chunk);
394 BUG_ON(!pages); 317 BUG_ON(!pages);
395 318
396 /* unmap and free */ 319 /* unmap and free */
397 pcpu_pre_unmap_flush(chunk, page_start, page_end); 320 pcpu_pre_unmap_flush(chunk, page_start, page_end);
398 321
399 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 322 pcpu_unmap_pages(chunk, pages, page_start, page_end);
400 pcpu_unmap_pages(chunk, pages, populated, rs, re);
401 323
402 /* no need to flush tlb, vmalloc will handle it lazily */ 324 /* no need to flush tlb, vmalloc will handle it lazily */
403 325
404 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 326 pcpu_free_pages(chunk, pages, page_start, page_end);
405 pcpu_free_pages(chunk, pages, populated, rs, re);
406
407 /* commit new bitmap */
408 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
409} 327}
410 328
411static struct pcpu_chunk *pcpu_create_chunk(void) 329static struct pcpu_chunk *pcpu_create_chunk(void)
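With the bitmap and memset responsibilities moved out, pcpu_populate_chunk() above is a plain allocate, map and flush over a caller-chosen page range, and the caller records the result afterwards under pcpu_lock. A sketch of the expected calling pattern; populate_range() is an invented wrapper, the actual callers (pcpu_alloc() and the balance worker) are outside this section, and error handling is trimmed:

static int populate_range(struct pcpu_chunk *chunk, int rs, int re)
{
	int ret;

	lockdep_assert_held(&pcpu_alloc_mutex);

	ret = pcpu_populate_chunk(chunk, rs, re);	/* may sleep, GFP_KERNEL */

	spin_lock_irq(&pcpu_lock);
	if (!ret)
		pcpu_chunk_populated(chunk, rs, re);	/* bitmap + pop counters */
	spin_unlock_irq(&pcpu_lock);

	return ret;
}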
diff --git a/mm/percpu.c b/mm/percpu.c
index 2139e30a4b44..014bab65e0ff 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,10 @@
76 76
77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
79#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
80#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
81#define PCPU_EMPTY_POP_PAGES_LOW 2
82#define PCPU_EMPTY_POP_PAGES_HIGH 4
79 83
80#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
81/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 85/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -102,12 +106,16 @@ struct pcpu_chunk {
102 int free_size; /* free bytes in the chunk */ 106 int free_size; /* free bytes in the chunk */
103 int contig_hint; /* max contiguous size hint */ 107 int contig_hint; /* max contiguous size hint */
104 void *base_addr; /* base address of this chunk */ 108 void *base_addr; /* base address of this chunk */
109
105 int map_used; /* # of map entries used before the sentry */ 110 int map_used; /* # of map entries used before the sentry */
106 int map_alloc; /* # of map entries allocated */ 111 int map_alloc; /* # of map entries allocated */
107 int *map; /* allocation map */ 112 int *map; /* allocation map */
113 struct work_struct map_extend_work;/* async ->map[] extension */
114
108 void *data; /* chunk data */ 115 void *data; /* chunk data */
109 int first_free; /* no free below this */ 116 int first_free; /* no free below this */
110 bool immutable; /* no [de]population allowed */ 117 bool immutable; /* no [de]population allowed */
118 int nr_populated; /* # of populated pages */
111 unsigned long populated[]; /* populated bitmap */ 119 unsigned long populated[]; /* populated bitmap */
112}; 120};
113 121
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk;
151static struct pcpu_chunk *pcpu_reserved_chunk; 159static struct pcpu_chunk *pcpu_reserved_chunk;
152static int pcpu_reserved_chunk_limit; 160static int pcpu_reserved_chunk_limit;
153 161
162static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
163static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
164
165static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
166
154/* 167/*
155 * Synchronization rules. 168 * The number of empty populated pages, protected by pcpu_lock. The
156 * 169 * reserved chunk doesn't contribute to the count.
157 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
158 * protects allocation/reclaim paths, chunks, populated bitmap and
159 * vmalloc mapping. The latter is a spinlock and protects the index
160 * data structures - chunk slots, chunks and area maps in chunks.
161 *
162 * During allocation, pcpu_alloc_mutex is kept locked all the time and
163 * pcpu_lock is grabbed and released as necessary. All actual memory
164 * allocations are done using GFP_KERNEL with pcpu_lock released. In
165 * general, percpu memory can't be allocated with irq off but
166 * irqsave/restore are still used in alloc path so that it can be used
167 * from early init path - sched_init() specifically.
168 *
169 * Free path accesses and alters only the index data structures, so it
170 * can be safely called from atomic context. When memory needs to be
171 * returned to the system, free path schedules reclaim_work which
172 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
173 * reclaimed, release both locks and frees the chunks. Note that it's
174 * necessary to grab both locks to remove a chunk from circulation as
175 * allocation path might be referencing the chunk with only
176 * pcpu_alloc_mutex locked.
177 */ 170 */
178static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ 171static int pcpu_nr_empty_pop_pages;
179static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
180 172
181static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 173/*
174 * Balance work is used to populate or destroy chunks asynchronously. We
175 * try to keep the number of populated free pages between
176 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
177 * empty chunk.
178 */
179static void pcpu_balance_workfn(struct work_struct *work);
180static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
181static bool pcpu_async_enabled __read_mostly;
182static bool pcpu_atomic_alloc_failed;
182 183
183/* reclaim work to release fully free chunks, scheduled from free path */ 184static void pcpu_schedule_balance_work(void)
184static void pcpu_reclaim(struct work_struct *work); 185{
185static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 186 if (pcpu_async_enabled)
187 schedule_work(&pcpu_balance_work);
188}
186 189
187static bool pcpu_addr_in_first_chunk(void *addr) 190static bool pcpu_addr_in_first_chunk(void *addr)
188{ 191{
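The single-purpose reclaim work becomes a balance worker with two jobs: keep at most one fully empty chunk around, and keep pcpu_nr_empty_pop_pages between the LOW and HIGH watermarks so atomic allocations usually find pre-populated pages. pcpu_balance_workfn() itself is not part of this section, so the following is only an outline with stand-in helpers (free_excess_empty_chunks(), populate_one_range()):

static void balance_outline(void)
{
	mutex_lock(&pcpu_alloc_mutex);

	free_excess_empty_chunks();		/* stand-in: keep one spare chunk */

	for (;;) {
		bool enough;

		spin_lock_irq(&pcpu_lock);
		enough = pcpu_nr_empty_pop_pages >= PCPU_EMPTY_POP_PAGES_HIGH;
		spin_unlock_irq(&pcpu_lock);
		if (enough)
			break;

		/* stand-in: may sleep; the real worker also bails out when
		 * no chunk can currently be populated */
		populate_one_range();
	}

	mutex_unlock(&pcpu_alloc_mutex);
}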
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size)
315} 318}
316 319
317/** 320/**
321 * pcpu_count_occupied_pages - count the number of pages an area occupies
322 * @chunk: chunk of interest
323 * @i: index of the area in question
324 *
325 * Count the number of pages chunk's @i'th area occupies. When the area's
326 * start and/or end address isn't aligned to page boundary, the straddled
327 * page is included in the count iff the rest of the page is free.
328 */
329static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
330{
331 int off = chunk->map[i] & ~1;
332 int end = chunk->map[i + 1] & ~1;
333
334 if (!PAGE_ALIGNED(off) && i > 0) {
335 int prev = chunk->map[i - 1];
336
337 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
338 off = round_down(off, PAGE_SIZE);
339 }
340
341 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
342 int next = chunk->map[i + 1];
343 int nend = chunk->map[i + 2] & ~1;
344
345 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
346 end = round_up(end, PAGE_SIZE);
347 }
348
349 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
350}
351
352/**
318 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 353 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
319 * @chunk: chunk of interest 354 * @chunk: chunk of interest
320 * @oslot: the previous slot it was on 355 * @oslot: the previous slot it was on
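A worked example of the straddle rule described above, as standalone C with 4 KiB pages: the partially covered first and last pages of an area only count when the neighbouring free space completes them. The map contents are invented; the arithmetic mirrors pcpu_count_occupied_pages():

#include <stdio.h>

#define PAGE_SZ		4096UL
#define PFN_UP(x)	(((x) + PAGE_SZ - 1) / PAGE_SZ)
#define PFN_DOWN(x)	((x) / PAGE_SZ)

/*
 * map[] mimics chunk->map: each entry is an offset, bit 0 set = in use.
 * Area i spans [map[i] & ~1, map[i+1] & ~1).
 */
static int count_occupied(const int *map, int map_used, int i)
{
	unsigned long off = map[i] & ~1;
	unsigned long end = map[i + 1] & ~1;
	long n;

	/* first page: count it if the free area before reaches its start */
	if (off % PAGE_SZ && i > 0) {
		unsigned long prev = map[i - 1];

		if (!(prev & 1) && prev <= off - off % PAGE_SZ)
			off -= off % PAGE_SZ;
	}

	/* last page: count it if the free area after reaches its end */
	if (end % PAGE_SZ && i + 1 < map_used) {
		unsigned long next = map[i + 1];
		unsigned long nend = map[i + 2] & ~1;

		if (!(next & 1) && nend >= end + (PAGE_SZ - end % PAGE_SZ))
			end += PAGE_SZ - end % PAGE_SZ;
	}

	n = (long)PFN_DOWN(end) - (long)PFN_UP(off);
	return n > 0 ? (int)n : 0;
}

int main(void)
{
	/* free [0,1024), used [1024,9216), free [9216,12288), end sentinel */
	int map[] = { 0, 1024 | 1, 9216, 12288 | 1 };

	/* both straddled pages are completed by free space: prints 3 */
	printf("%d\n", count_occupied(map, 3, 1));
	return 0;
}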
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
342/** 377/**
343 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 378 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
344 * @chunk: chunk of interest 379 * @chunk: chunk of interest
380 * @is_atomic: the allocation context
345 * 381 *
346 * Determine whether area map of @chunk needs to be extended to 382 * Determine whether area map of @chunk needs to be extended. If
347 * accommodate a new allocation. 383 * @is_atomic, only the amount necessary for a new allocation is
384 * considered; however, async extension is scheduled if the amount left is
385 * low. If !@is_atomic, it aims for more empty space. Combined, this
386 * ensures that the map is likely to have enough available space to
387 * accommodate atomic allocations which can't extend maps directly.
348 * 388 *
349 * CONTEXT: 389 * CONTEXT:
350 * pcpu_lock. 390 * pcpu_lock.
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
353 * New target map allocation length if extension is necessary, 0 393 * New target map allocation length if extension is necessary, 0
354 * otherwise. 394 * otherwise.
355 */ 395 */
356static int pcpu_need_to_extend(struct pcpu_chunk *chunk) 396static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
357{ 397{
358 int new_alloc; 398 int margin, new_alloc;
399
400 if (is_atomic) {
401 margin = 3;
402
403 if (chunk->map_alloc <
404 chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
405 pcpu_async_enabled)
406 schedule_work(&chunk->map_extend_work);
407 } else {
408 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
409 }
359 410
360 if (chunk->map_alloc >= chunk->map_used + 3) 411 if (chunk->map_alloc >= chunk->map_used + margin)
361 return 0; 412 return 0;
362 413
363 new_alloc = PCPU_DFL_MAP_ALLOC; 414 new_alloc = PCPU_DFL_MAP_ALLOC;
364 while (new_alloc < chunk->map_used + 3) 415 while (new_alloc < chunk->map_used + margin)
365 new_alloc *= 2; 416 new_alloc *= 2;
366 417
367 return new_alloc; 418 return new_alloc;
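In short: atomic callers cannot extend the map themselves (that takes a GFP_KERNEL allocation), so they only insist on the minimal 3 spare slots and rely on the async worker once fewer than PCPU_ATOMIC_MAP_MARGIN_LOW entries remain, while sleepable callers grow the map eagerly up to a PCPU_ATOMIC_MAP_MARGIN_HIGH margin. A standalone restatement of the sizing rule with a couple of worked values (hypothetical helper, constants copied from the hunk):

#define PCPU_DFL_MAP_ALLOC		16
#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64

/* Returns 0 if the map already fits, else the new power-of-two map_alloc. */
static int map_extend_target(int map_alloc, int map_used, int is_atomic)
{
	int margin = is_atomic ? 3 : PCPU_ATOMIC_MAP_MARGIN_HIGH;
	int new_alloc = PCPU_DFL_MAP_ALLOC;

	if (map_alloc >= map_used + margin)
		return 0;

	while (new_alloc < map_used + margin)
		new_alloc *= 2;
	return new_alloc;
}

/*
 * map_extend_target(128, 120, 1) == 0    -- atomic: 3 spare slots are enough
 * map_extend_target(128, 120, 0) == 256  -- sleepable: wants 120 + 64 slots
 */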
@@ -418,11 +469,76 @@ out_unlock:
418 return 0; 469 return 0;
419} 470}
420 471
472static void pcpu_map_extend_workfn(struct work_struct *work)
473{
474 struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
475 map_extend_work);
476 int new_alloc;
477
478 spin_lock_irq(&pcpu_lock);
479 new_alloc = pcpu_need_to_extend(chunk, false);
480 spin_unlock_irq(&pcpu_lock);
481
482 if (new_alloc)
483 pcpu_extend_area_map(chunk, new_alloc);
484}
485
486/**
487 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
488 * @chunk: chunk the candidate area belongs to
489 * @off: the offset to the start of the candidate area
490 * @this_size: the size of the candidate area
491 * @size: the size of the target allocation
492 * @align: the alignment of the target allocation
493 * @pop_only: only allocate from already populated region
494 *
495 * We're trying to allocate @size bytes aligned at @align. @chunk's area
496 * at @off sized @this_size is a candidate. This function determines
497 * whether the target allocation fits in the candidate area and returns the
498 * number of bytes to pad after @off. If the target area doesn't fit, -1
499 * is returned.
500 *
501 * If @pop_only is %true, this function only considers the already
502 * populated part of the candidate area.
503 */
504static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
505 int size, int align, bool pop_only)
506{
507 int cand_off = off;
508
509 while (true) {
510 int head = ALIGN(cand_off, align) - off;
511 int page_start, page_end, rs, re;
512
513 if (this_size < head + size)
514 return -1;
515
516 if (!pop_only)
517 return head;
518
519 /*
520 * If the first unpopulated page is beyond the end of the
521 * allocation, the whole allocation is populated;
522 * otherwise, retry from the end of the unpopulated area.
523 */
524 page_start = PFN_DOWN(head + off);
525 page_end = PFN_UP(head + off + size);
526
527 rs = page_start;
528 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
529 if (rs >= page_end)
530 return head;
531 cand_off = re * PAGE_SIZE;
532 }
533}
534
421/** 535/**
422 * pcpu_alloc_area - allocate area from a pcpu_chunk 536 * pcpu_alloc_area - allocate area from a pcpu_chunk
423 * @chunk: chunk of interest 537 * @chunk: chunk of interest
424 * @size: wanted size in bytes 538 * @size: wanted size in bytes
425 * @align: wanted align 539 * @align: wanted align
540 * @pop_only: allocate only from the populated area
541 * @occ_pages_p: out param for the number of pages the area occupies
426 * 542 *
427 * Try to allocate @size bytes area aligned at @align from @chunk. 543 * Try to allocate @size bytes area aligned at @align from @chunk.
428 * Note that this function only allocates the offset. It doesn't 544 * Note that this function only allocates the offset. It doesn't
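To see what @pop_only buys: when the candidate free area straddles unpopulated pages, the search does not give up, it skips to the end of the unpopulated run and retries the alignment there, so restricted allocations are only placed on pages that already have backing. A standalone toy model of that retry rule (offsets, sizes and helper names invented):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ 4096

/* pop[] says which pages of the candidate area are populated. */
static int fit_in_area(const bool *pop, int npages, int off, int this_size,
		       int size, int align)
{
	int cand_off = off;

	for (;;) {
		int head = (cand_off + align - 1) / align * align - off;
		int ps, pe, p;

		if (this_size < head + size)
			return -1;		/* can never fit */

		ps = (off + head) / PAGE_SZ;
		pe = (off + head + size + PAGE_SZ - 1) / PAGE_SZ;

		for (p = ps; p < pe && p < npages && pop[p]; p++)
			;
		if (p >= pe)
			return head;		/* lands on populated pages only */

		while (p < npages && !pop[p])	/* skip the unpopulated run */
			p++;
		cand_off = p * PAGE_SZ;
	}
}

int main(void)
{
	/* pages 0-1 populated, page 2 not, pages 3-5 populated */
	bool pop[6] = { true, true, false, true, true, true };

	/* a 12 KiB request in a 24 KiB free area at offset 0 is pushed past
	 * the hole: prints "head = 12288" (page 3). */
	printf("head = %d\n",
	       fit_in_area(pop, 6, 0, 6 * PAGE_SZ, 3 * PAGE_SZ, PAGE_SZ));
	return 0;
}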
@@ -437,7 +553,8 @@ out_unlock:
437 * Allocated offset in @chunk on success, -1 if no matching area is 553 * Allocated offset in @chunk on success, -1 if no matching area is
438 * found. 554 * found.
439 */ 555 */
440static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) 556static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
557 bool pop_only, int *occ_pages_p)
441{ 558{
442 int oslot = pcpu_chunk_slot(chunk); 559 int oslot = pcpu_chunk_slot(chunk);
443 int max_contig = 0; 560 int max_contig = 0;
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
453 if (off & 1) 570 if (off & 1)
454 continue; 571 continue;
455 572
456 /* extra for alignment requirement */
457 head = ALIGN(off, align) - off;
458
459 this_size = (p[1] & ~1) - off; 573 this_size = (p[1] & ~1) - off;
460 if (this_size < head + size) { 574
575 head = pcpu_fit_in_area(chunk, off, this_size, size, align,
576 pop_only);
577 if (head < 0) {
461 if (!seen_free) { 578 if (!seen_free) {
462 chunk->first_free = i; 579 chunk->first_free = i;
463 seen_free = true; 580 seen_free = true;
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
526 chunk->free_size -= size; 643 chunk->free_size -= size;
527 *p |= 1; 644 *p |= 1;
528 645
646 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
529 pcpu_chunk_relocate(chunk, oslot); 647 pcpu_chunk_relocate(chunk, oslot);
530 return off; 648 return off;
531 } 649 }
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
541 * pcpu_free_area - free area to a pcpu_chunk 659 * pcpu_free_area - free area to a pcpu_chunk
542 * @chunk: chunk of interest 660 * @chunk: chunk of interest
543 * @freeme: offset of area to free 661 * @freeme: offset of area to free
662 * @occ_pages_p: out param for the number of pages the area occupies
544 * 663 *
545 * Free area starting from @freeme to @chunk. Note that this function 664 * Free area starting from @freeme to @chunk. Note that this function
546 * only modifies the allocation map. It doesn't depopulate or unmap 665 * only modifies the allocation map. It doesn't depopulate or unmap
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
549 * CONTEXT: 668 * CONTEXT:
550 * pcpu_lock. 669 * pcpu_lock.
551 */ 670 */
552static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) 671static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
672 int *occ_pages_p)
553{ 673{
554 int oslot = pcpu_chunk_slot(chunk); 674 int oslot = pcpu_chunk_slot(chunk);
555 int off = 0; 675 int off = 0;
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
580 *p = off &= ~1; 700 *p = off &= ~1;
581 chunk->free_size += (p[1] & ~1) - off; 701 chunk->free_size += (p[1] & ~1) - off;
582 702
703 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
704
583 /* merge with next? */ 705 /* merge with next? */
584 if (!(p[1] & 1)) 706 if (!(p[1] & 1))
585 to_free++; 707 to_free++;
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
620 chunk->map_used = 1; 742 chunk->map_used = 1;
621 743
622 INIT_LIST_HEAD(&chunk->list); 744 INIT_LIST_HEAD(&chunk->list);
745 INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
623 chunk->free_size = pcpu_unit_size; 746 chunk->free_size = pcpu_unit_size;
624 chunk->contig_hint = pcpu_unit_size; 747 chunk->contig_hint = pcpu_unit_size;
625 748
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
634 pcpu_mem_free(chunk, pcpu_chunk_struct_size); 757 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 758}
636 759
760/**
761 * pcpu_chunk_populated - post-population bookkeeping
762 * @chunk: pcpu_chunk which got populated
763 * @page_start: the start page
764 * @page_end: the end page
765 *
766 * Pages in [@page_start,@page_end) have been populated to @chunk. Update
767 * the bookkeeping information accordingly. Must be called after each
768 * successful population.
769 */
770static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
771 int page_start, int page_end)
772{
773 int nr = page_end - page_start;
774
775 lockdep_assert_held(&pcpu_lock);
776
777 bitmap_set(chunk->populated, page_start, nr);
778 chunk->nr_populated += nr;
779 pcpu_nr_empty_pop_pages += nr;
780}
781
782/**
783 * pcpu_chunk_depopulated - post-depopulation bookkeeping
784 * @chunk: pcpu_chunk which got depopulated
785 * @page_start: the start page
786 * @page_end: the end page
787 *
788 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
789 * Update the bookkeeping information accordingly. Must be called after
790 * each successful depopulation.
791 */
792static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
793 int page_start, int page_end)
794{
795 int nr = page_end - page_start;
796
797 lockdep_assert_held(&pcpu_lock);
798
799 bitmap_clear(chunk->populated, page_start, nr);
800 chunk->nr_populated -= nr;
801 pcpu_nr_empty_pop_pages -= nr;
802}
803
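Together with the occ_pages accounting added to pcpu_alloc() and pcpu_free_area() below, these two helpers keep one global figure consistent: the number of pages that are populated but not backing a live allocation. A compact model of that bookkeeping, with illustrative names rather than the kernel's:

#include <stdio.h>

/* counters mirroring pcpu_nr_empty_pop_pages and chunk->nr_populated */
static int nr_empty_pop_pages;
static int nr_populated;

static void chunk_populated(int nr)   { nr_populated += nr; nr_empty_pop_pages += nr; }
static void chunk_depopulated(int nr) { nr_populated -= nr; nr_empty_pop_pages -= nr; }

/* an allocation occupying @occ pages consumes that many empty populated pages */
static void area_allocated(int occ)   { nr_empty_pop_pages -= occ; }
static void area_freed(int occ)       { nr_empty_pop_pages += occ; }

int main(void)
{
    chunk_populated(4);     /* pcpu_chunk_populated(chunk, 0, 4) */
    area_allocated(2);      /* pcpu_alloc(): occ_pages == 2 */
    area_freed(2);          /* free_percpu(): occ_pages == 2 */
    chunk_depopulated(4);   /* balance work reclaiming the free chunk */
    printf("empty populated pages: %d\n", nr_empty_pop_pages);   /* 0 */
    return 0;
}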
637/* 804/*
638 * Chunk management implementation. 805 * Chunk management implementation.
639 * 806 *
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
695 * @size: size of area to allocate in bytes 862 * @size: size of area to allocate in bytes
696 * @align: alignment of area (max PAGE_SIZE) 863 * @align: alignment of area (max PAGE_SIZE)
697 * @reserved: allocate from the reserved chunk if available 864 * @reserved: allocate from the reserved chunk if available
865 * @gfp: allocation flags
698 * 866 *
699 * Allocate percpu area of @size bytes aligned at @align. 867 * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
700 * 868 * contain %GFP_KERNEL, the allocation is atomic.
701 * CONTEXT:
702 * Does GFP_KERNEL allocation.
703 * 869 *
704 * RETURNS: 870 * RETURNS:
705 * Percpu pointer to the allocated area on success, NULL on failure. 871 * Percpu pointer to the allocated area on success, NULL on failure.
706 */ 872 */
707static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) 873static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
874 gfp_t gfp)
708{ 875{
709 static int warn_limit = 10; 876 static int warn_limit = 10;
710 struct pcpu_chunk *chunk; 877 struct pcpu_chunk *chunk;
711 const char *err; 878 const char *err;
712 int slot, off, new_alloc; 879 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
880 int occ_pages = 0;
881 int slot, off, new_alloc, cpu, ret;
713 unsigned long flags; 882 unsigned long flags;
714 void __percpu *ptr; 883 void __percpu *ptr;
715 884
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
728 return NULL; 897 return NULL;
729 } 898 }
730 899
731 mutex_lock(&pcpu_alloc_mutex);
732 spin_lock_irqsave(&pcpu_lock, flags); 900 spin_lock_irqsave(&pcpu_lock, flags);
733 901
734 /* serve reserved allocations from the reserved chunk if available */ 902 /* serve reserved allocations from the reserved chunk if available */
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
740 goto fail_unlock; 908 goto fail_unlock;
741 } 909 }
742 910
743 while ((new_alloc = pcpu_need_to_extend(chunk))) { 911 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
744 spin_unlock_irqrestore(&pcpu_lock, flags); 912 spin_unlock_irqrestore(&pcpu_lock, flags);
745 if (pcpu_extend_area_map(chunk, new_alloc) < 0) { 913 if (is_atomic ||
914 pcpu_extend_area_map(chunk, new_alloc) < 0) {
746 err = "failed to extend area map of reserved chunk"; 915 err = "failed to extend area map of reserved chunk";
747 goto fail_unlock_mutex; 916 goto fail;
748 } 917 }
749 spin_lock_irqsave(&pcpu_lock, flags); 918 spin_lock_irqsave(&pcpu_lock, flags);
750 } 919 }
751 920
752 off = pcpu_alloc_area(chunk, size, align); 921 off = pcpu_alloc_area(chunk, size, align, is_atomic,
922 &occ_pages);
753 if (off >= 0) 923 if (off >= 0)
754 goto area_found; 924 goto area_found;
755 925
@@ -764,13 +934,15 @@ restart:
764 if (size > chunk->contig_hint) 934 if (size > chunk->contig_hint)
765 continue; 935 continue;
766 936
767 new_alloc = pcpu_need_to_extend(chunk); 937 new_alloc = pcpu_need_to_extend(chunk, is_atomic);
768 if (new_alloc) { 938 if (new_alloc) {
939 if (is_atomic)
940 continue;
769 spin_unlock_irqrestore(&pcpu_lock, flags); 941 spin_unlock_irqrestore(&pcpu_lock, flags);
770 if (pcpu_extend_area_map(chunk, 942 if (pcpu_extend_area_map(chunk,
771 new_alloc) < 0) { 943 new_alloc) < 0) {
772 err = "failed to extend area map"; 944 err = "failed to extend area map";
773 goto fail_unlock_mutex; 945 goto fail;
774 } 946 }
775 spin_lock_irqsave(&pcpu_lock, flags); 947 spin_lock_irqsave(&pcpu_lock, flags);
776 /* 948 /*
@@ -780,74 +952,134 @@ restart:
780 goto restart; 952 goto restart;
781 } 953 }
782 954
783 off = pcpu_alloc_area(chunk, size, align); 955 off = pcpu_alloc_area(chunk, size, align, is_atomic,
956 &occ_pages);
784 if (off >= 0) 957 if (off >= 0)
785 goto area_found; 958 goto area_found;
786 } 959 }
787 } 960 }
788 961
789 /* hmmm... no space left, create a new chunk */
790 spin_unlock_irqrestore(&pcpu_lock, flags); 962 spin_unlock_irqrestore(&pcpu_lock, flags);
791 963
792 chunk = pcpu_create_chunk(); 964 /*
793 if (!chunk) { 965 * No space left. Create a new chunk. We don't want multiple
794 err = "failed to allocate new chunk"; 966 * tasks to create chunks simultaneously. Serialize and create iff
795 goto fail_unlock_mutex; 967 * there's still no empty chunk after grabbing the mutex.
968 */
969 if (is_atomic)
970 goto fail;
971
972 mutex_lock(&pcpu_alloc_mutex);
973
974 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
975 chunk = pcpu_create_chunk();
976 if (!chunk) {
977 mutex_unlock(&pcpu_alloc_mutex);
978 err = "failed to allocate new chunk";
979 goto fail;
980 }
981
982 spin_lock_irqsave(&pcpu_lock, flags);
983 pcpu_chunk_relocate(chunk, -1);
984 } else {
985 spin_lock_irqsave(&pcpu_lock, flags);
796 } 986 }
797 987
798 spin_lock_irqsave(&pcpu_lock, flags); 988 mutex_unlock(&pcpu_alloc_mutex);
799 pcpu_chunk_relocate(chunk, -1);
800 goto restart; 989 goto restart;
801 990
802area_found: 991area_found:
803 spin_unlock_irqrestore(&pcpu_lock, flags); 992 spin_unlock_irqrestore(&pcpu_lock, flags);
804 993
805 /* populate, map and clear the area */ 994 /* populate if not all pages are already there */
806 if (pcpu_populate_chunk(chunk, off, size)) { 995 if (!is_atomic) {
807 spin_lock_irqsave(&pcpu_lock, flags); 996 int page_start, page_end, rs, re;
808 pcpu_free_area(chunk, off); 997
809 err = "failed to populate"; 998 mutex_lock(&pcpu_alloc_mutex);
810 goto fail_unlock; 999
1000 page_start = PFN_DOWN(off);
1001 page_end = PFN_UP(off + size);
1002
1003 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
1004 WARN_ON(chunk->immutable);
1005
1006 ret = pcpu_populate_chunk(chunk, rs, re);
1007
1008 spin_lock_irqsave(&pcpu_lock, flags);
1009 if (ret) {
1010 mutex_unlock(&pcpu_alloc_mutex);
1011 pcpu_free_area(chunk, off, &occ_pages);
1012 err = "failed to populate";
1013 goto fail_unlock;
1014 }
1015 pcpu_chunk_populated(chunk, rs, re);
1016 spin_unlock_irqrestore(&pcpu_lock, flags);
1017 }
1018
1019 mutex_unlock(&pcpu_alloc_mutex);
811 } 1020 }
812 1021
813 mutex_unlock(&pcpu_alloc_mutex); 1022 if (chunk != pcpu_reserved_chunk)
1023 pcpu_nr_empty_pop_pages -= occ_pages;
1024
1025 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1026 pcpu_schedule_balance_work();
1027
1028 /* clear the areas and return address relative to base address */
1029 for_each_possible_cpu(cpu)
1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
814 1031
815 /* return address relative to base address */
816 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
817 kmemleak_alloc_percpu(ptr, size); 1033 kmemleak_alloc_percpu(ptr, size);
818 return ptr; 1034 return ptr;
819 1035
820fail_unlock: 1036fail_unlock:
821 spin_unlock_irqrestore(&pcpu_lock, flags); 1037 spin_unlock_irqrestore(&pcpu_lock, flags);
822fail_unlock_mutex: 1038fail:
823 mutex_unlock(&pcpu_alloc_mutex); 1039 if (!is_atomic && warn_limit) {
824 if (warn_limit) { 1040 pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
825 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " 1041 size, align, is_atomic, err);
826 "%s\n", size, align, err);
827 dump_stack(); 1042 dump_stack();
828 if (!--warn_limit) 1043 if (!--warn_limit)
829 pr_info("PERCPU: limit reached, disable warning\n"); 1044 pr_info("PERCPU: limit reached, disable warning\n");
830 } 1045 }
1046 if (is_atomic) {
 1047 /* see the flag handling in pcpu_balance_workfn() */
1048 pcpu_atomic_alloc_failed = true;
1049 pcpu_schedule_balance_work();
1050 }
831 return NULL; 1051 return NULL;
832} 1052}
833 1053
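The atomic path is chosen purely from the gfp mask: anything short of the full GFP_KERNEL bit set (GFP_NOWAIT, GFP_ATOMIC, even GFP_NOFS or GFP_NOIO) makes is_atomic true, which skips area-map extension, chunk creation and page population and only carves space out of already-populated pages. A standalone illustration of the test, with made-up flag values standing in for <linux/gfp.h>:

#include <stdio.h>

/* illustrative bit values only; the real masks live in <linux/gfp.h> */
#define __GFP_WAIT 0x10u
#define __GFP_IO   0x40u
#define __GFP_FS   0x80u
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_NOWAIT 0x0u
#define GFP_NOFS   (__GFP_WAIT | __GFP_IO)

static int is_atomic(unsigned int gfp)
{
    /* same test as pcpu_alloc(): only a full GFP_KERNEL may block */
    return (gfp & GFP_KERNEL) != GFP_KERNEL;
}

int main(void)
{
    printf("GFP_KERNEL -> %d\n", is_atomic(GFP_KERNEL)); /* 0: may sleep   */
    printf("GFP_NOWAIT -> %d\n", is_atomic(GFP_NOWAIT)); /* 1: atomic path */
    printf("GFP_NOFS   -> %d\n", is_atomic(GFP_NOFS));   /* 1: atomic path */
    return 0;
}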
834/** 1054/**
835 * __alloc_percpu - allocate dynamic percpu area 1055 * __alloc_percpu_gfp - allocate dynamic percpu area
836 * @size: size of area to allocate in bytes 1056 * @size: size of area to allocate in bytes
837 * @align: alignment of area (max PAGE_SIZE) 1057 * @align: alignment of area (max PAGE_SIZE)
1058 * @gfp: allocation flags
838 * 1059 *
839 * Allocate zero-filled percpu area of @size bytes aligned at @align. 1060 * Allocate zero-filled percpu area of @size bytes aligned at @align. If
840 * Might sleep. Might trigger writeouts. 1061 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
841 * 1062 * be called from any context but is a lot more likely to fail.
842 * CONTEXT:
843 * Does GFP_KERNEL allocation.
844 * 1063 *
845 * RETURNS: 1064 * RETURNS:
846 * Percpu pointer to the allocated area on success, NULL on failure. 1065 * Percpu pointer to the allocated area on success, NULL on failure.
847 */ 1066 */
1067void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1068{
1069 return pcpu_alloc(size, align, false, gfp);
1070}
1071EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1072
1073/**
1074 * __alloc_percpu - allocate dynamic percpu area
1075 * @size: size of area to allocate in bytes
1076 * @align: alignment of area (max PAGE_SIZE)
1077 *
1078 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
1079 */
848void __percpu *__alloc_percpu(size_t size, size_t align) 1080void __percpu *__alloc_percpu(size_t size, size_t align)
849{ 1081{
850 return pcpu_alloc(size, align, false); 1082 return pcpu_alloc(size, align, false, GFP_KERNEL);
851} 1083}
852EXPORT_SYMBOL_GPL(__alloc_percpu); 1084EXPORT_SYMBOL_GPL(__alloc_percpu);
853 1085
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
869 */ 1101 */
870void __percpu *__alloc_reserved_percpu(size_t size, size_t align) 1102void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
871{ 1103{
872 return pcpu_alloc(size, align, true); 1104 return pcpu_alloc(size, align, true, GFP_KERNEL);
873} 1105}
874 1106
875/** 1107/**
876 * pcpu_reclaim - reclaim fully free chunks, workqueue function 1108 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
877 * @work: unused 1109 * @work: unused
878 * 1110 *
879 * Reclaim all fully free chunks except for the first one. 1111 * Reclaim all fully free chunks except for the first one.
880 *
881 * CONTEXT:
882 * workqueue context.
883 */ 1112 */
884static void pcpu_reclaim(struct work_struct *work) 1113static void pcpu_balance_workfn(struct work_struct *work)
885{ 1114{
886 LIST_HEAD(todo); 1115 LIST_HEAD(to_free);
887 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 1116 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
888 struct pcpu_chunk *chunk, *next; 1117 struct pcpu_chunk *chunk, *next;
1118 int slot, nr_to_pop, ret;
889 1119
1120 /*
1121 * There's no reason to keep around multiple unused chunks and VM
1122 * areas can be scarce. Destroy all free chunks except for one.
1123 */
890 mutex_lock(&pcpu_alloc_mutex); 1124 mutex_lock(&pcpu_alloc_mutex);
891 spin_lock_irq(&pcpu_lock); 1125 spin_lock_irq(&pcpu_lock);
892 1126
893 list_for_each_entry_safe(chunk, next, head, list) { 1127 list_for_each_entry_safe(chunk, next, free_head, list) {
894 WARN_ON(chunk->immutable); 1128 WARN_ON(chunk->immutable);
895 1129
896 /* spare the first one */ 1130 /* spare the first one */
897 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 1131 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
898 continue; 1132 continue;
899 1133
900 list_move(&chunk->list, &todo); 1134 list_move(&chunk->list, &to_free);
901 } 1135 }
902 1136
903 spin_unlock_irq(&pcpu_lock); 1137 spin_unlock_irq(&pcpu_lock);
904 1138
905 list_for_each_entry_safe(chunk, next, &todo, list) { 1139 list_for_each_entry_safe(chunk, next, &to_free, list) {
906 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 1140 int rs, re;
1141
1142 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1143 pcpu_depopulate_chunk(chunk, rs, re);
1144 spin_lock_irq(&pcpu_lock);
1145 pcpu_chunk_depopulated(chunk, rs, re);
1146 spin_unlock_irq(&pcpu_lock);
1147 }
907 pcpu_destroy_chunk(chunk); 1148 pcpu_destroy_chunk(chunk);
908 } 1149 }
909 1150
1151 /*
1152 * Ensure there are certain number of free populated pages for
1153 * atomic allocs. Fill up from the most packed so that atomic
1154 * allocs don't increase fragmentation. If atomic allocation
1155 * failed previously, always populate the maximum amount. This
1156 * should prevent atomic allocs larger than PAGE_SIZE from keeping
1157 * failing indefinitely; however, large atomic allocs are not
1158 * something we support properly and can be highly unreliable and
1159 * inefficient.
1160 */
1161retry_pop:
1162 if (pcpu_atomic_alloc_failed) {
1163 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1164 /* best effort anyway, don't worry about synchronization */
1165 pcpu_atomic_alloc_failed = false;
1166 } else {
1167 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1168 pcpu_nr_empty_pop_pages,
1169 0, PCPU_EMPTY_POP_PAGES_HIGH);
1170 }
1171
1172 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1173 int nr_unpop = 0, rs, re;
1174
1175 if (!nr_to_pop)
1176 break;
1177
1178 spin_lock_irq(&pcpu_lock);
1179 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1180 nr_unpop = pcpu_unit_pages - chunk->nr_populated;
1181 if (nr_unpop)
1182 break;
1183 }
1184 spin_unlock_irq(&pcpu_lock);
1185
1186 if (!nr_unpop)
1187 continue;
1188
1189 /* @chunk can't go away while pcpu_alloc_mutex is held */
1190 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1191 int nr = min(re - rs, nr_to_pop);
1192
1193 ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1194 if (!ret) {
1195 nr_to_pop -= nr;
1196 spin_lock_irq(&pcpu_lock);
1197 pcpu_chunk_populated(chunk, rs, rs + nr);
1198 spin_unlock_irq(&pcpu_lock);
1199 } else {
1200 nr_to_pop = 0;
1201 }
1202
1203 if (!nr_to_pop)
1204 break;
1205 }
1206 }
1207
1208 if (nr_to_pop) {
1209 /* ran out of chunks to populate, create a new one and retry */
1210 chunk = pcpu_create_chunk();
1211 if (chunk) {
1212 spin_lock_irq(&pcpu_lock);
1213 pcpu_chunk_relocate(chunk, -1);
1214 spin_unlock_irq(&pcpu_lock);
1215 goto retry_pop;
1216 }
1217 }
1218
910 mutex_unlock(&pcpu_alloc_mutex); 1219 mutex_unlock(&pcpu_alloc_mutex);
911} 1220}
912 1221
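The population target is a plain clamp against the high watermark, and the LOW watermark is what made pcpu_alloc() schedule this worker in the first place. With assumed watermark values (the real constants are defined near the top of percpu.c), the arithmetic looks like this:

#include <stdio.h>

/* assumed watermarks; the real constants live near the top of percpu.c */
#define PCPU_EMPTY_POP_PAGES_LOW  2
#define PCPU_EMPTY_POP_PAGES_HIGH 4

static int clampi(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

/* the nr_to_pop computation from pcpu_balance_workfn() */
static int pages_to_populate(int empty_pop_pages, int atomic_alloc_failed)
{
    if (atomic_alloc_failed)
        return PCPU_EMPTY_POP_PAGES_HIGH;            /* go straight to the maximum */
    return clampi(PCPU_EMPTY_POP_PAGES_HIGH - empty_pop_pages,
                  0, PCPU_EMPTY_POP_PAGES_HIGH);
}

int main(void)
{
    printf("%d\n", pages_to_populate(1, 0));  /* 3: top back up to HIGH    */
    printf("%d\n", pages_to_populate(9, 0));  /* 0: already above HIGH     */
    printf("%d\n", pages_to_populate(3, 1));  /* 4: an atomic alloc failed */
    return 0;
}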
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr)
924 void *addr; 1233 void *addr;
925 struct pcpu_chunk *chunk; 1234 struct pcpu_chunk *chunk;
926 unsigned long flags; 1235 unsigned long flags;
927 int off; 1236 int off, occ_pages;
928 1237
929 if (!ptr) 1238 if (!ptr)
930 return; 1239 return;
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr)
938 chunk = pcpu_chunk_addr_search(addr); 1247 chunk = pcpu_chunk_addr_search(addr);
939 off = addr - chunk->base_addr; 1248 off = addr - chunk->base_addr;
940 1249
941 pcpu_free_area(chunk, off); 1250 pcpu_free_area(chunk, off, &occ_pages);
1251
1252 if (chunk != pcpu_reserved_chunk)
1253 pcpu_nr_empty_pop_pages += occ_pages;
942 1254
943 /* if there are more than one fully free chunks, wake up grim reaper */ 1255 /* if there are more than one fully free chunks, wake up grim reaper */
944 if (chunk->free_size == pcpu_unit_size) { 1256 if (chunk->free_size == pcpu_unit_size) {
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr)
946 1258
947 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1259 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
948 if (pos != chunk) { 1260 if (pos != chunk) {
949 schedule_work(&pcpu_reclaim_work); 1261 pcpu_schedule_balance_work();
950 break; 1262 break;
951 } 1263 }
952 } 1264 }
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1336 */ 1648 */
1337 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1649 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1338 INIT_LIST_HEAD(&schunk->list); 1650 INIT_LIST_HEAD(&schunk->list);
1651 INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
1339 schunk->base_addr = base_addr; 1652 schunk->base_addr = base_addr;
1340 schunk->map = smap; 1653 schunk->map = smap;
1341 schunk->map_alloc = ARRAY_SIZE(smap); 1654 schunk->map_alloc = ARRAY_SIZE(smap);
1342 schunk->immutable = true; 1655 schunk->immutable = true;
1343 bitmap_fill(schunk->populated, pcpu_unit_pages); 1656 bitmap_fill(schunk->populated, pcpu_unit_pages);
1657 schunk->nr_populated = pcpu_unit_pages;
1344 1658
1345 if (ai->reserved_size) { 1659 if (ai->reserved_size) {
1346 schunk->free_size = ai->reserved_size; 1660 schunk->free_size = ai->reserved_size;
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1364 if (dyn_size) { 1678 if (dyn_size) {
1365 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1679 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1366 INIT_LIST_HEAD(&dchunk->list); 1680 INIT_LIST_HEAD(&dchunk->list);
1681 INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
1367 dchunk->base_addr = base_addr; 1682 dchunk->base_addr = base_addr;
1368 dchunk->map = dmap; 1683 dchunk->map = dmap;
1369 dchunk->map_alloc = ARRAY_SIZE(dmap); 1684 dchunk->map_alloc = ARRAY_SIZE(dmap);
1370 dchunk->immutable = true; 1685 dchunk->immutable = true;
1371 bitmap_fill(dchunk->populated, pcpu_unit_pages); 1686 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1687 dchunk->nr_populated = pcpu_unit_pages;
1372 1688
1373 dchunk->contig_hint = dchunk->free_size = dyn_size; 1689 dchunk->contig_hint = dchunk->free_size = dyn_size;
1374 dchunk->map[0] = 1; 1690 dchunk->map[0] = 1;
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1379 1695
1380 /* link the first chunk in */ 1696 /* link the first chunk in */
1381 pcpu_first_chunk = dchunk ?: schunk; 1697 pcpu_first_chunk = dchunk ?: schunk;
1698 pcpu_nr_empty_pop_pages +=
1699 pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1382 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1700 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1383 1701
1384 /* we're done */ 1702 /* we're done */
@@ -1965,3 +2283,15 @@ void __init percpu_init_late(void)
1965 spin_unlock_irqrestore(&pcpu_lock, flags); 2283 spin_unlock_irqrestore(&pcpu_lock, flags);
1966 } 2284 }
1967} 2285}
2286
2287/*
2288 * Percpu allocator is initialized early during boot when neither slab nor
2289 * workqueue is available. Plug async management until everything is up
2290 * and running.
2291 */
2292static int __init percpu_enable_async(void)
2293{
2294 pcpu_async_enabled = true;
2295 return 0;
2296}
2297subsys_initcall(percpu_enable_async);
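This is the usual boot-ordering trick: a flag that starts false keeps the scheduling helpers from touching workqueues during early boot, and a subsys_initcall flips it once the infrastructure exists. A generic kernel-context sketch of the pattern (not this file's actual helpers; the names below are invented):

/* sketch of the gating pattern */
static bool async_enabled;          /* stays false until subsys_initcall time */

static void maybe_schedule(struct work_struct *work)
{
    if (async_enabled)              /* silently a no-op during early boot */
        schedule_work(work);
}

static int __init enable_async(void)
{
    async_enabled = true;           /* workqueues are up from here on */
    return 0;
}
subsys_initcall(enable_async);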
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a8b919925934..dfb79e028ecb 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
195 pmd_t entry = *pmdp; 195 pmd_t entry = *pmdp;
196 if (pmd_numa(entry)) 196 if (pmd_numa(entry))
197 entry = pmd_mknonnuma(entry); 197 entry = pmd_mknonnuma(entry);
198 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); 198 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
199 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 199 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
200} 200}
201#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 201#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
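The one-word fix matters because the function first copies the pmd, conditionally clears the NUMA bit in that copy, and must then transform the corrected copy; re-reading *pmdp throws the pmd_mknonnuma() adjustment away. The same read-modify-write pitfall reduced to plain C with invented flag bits:

#include <stdio.h>

#define FLAG_NUMA    0x1u
#define FLAG_PRESENT 0x2u

static unsigned int clear_numa(unsigned int e)   { return e & ~FLAG_NUMA; }
static unsigned int mknotpresent(unsigned int e) { return e & ~FLAG_PRESENT; }

int main(void)
{
    unsigned int pmd = FLAG_NUMA | FLAG_PRESENT;   /* stand-in for *pmdp */
    unsigned int entry = pmd;

    if (entry & FLAG_NUMA)
        entry = clear_numa(entry);                 /* pmd_mknonnuma(entry) */

    /* buggy: transforms the stale value, losing the clear_numa() above */
    unsigned int buggy = mknotpresent(pmd);
    /* fixed: transforms the adjusted local copy */
    unsigned int fixed = mknotpresent(entry);

    printf("buggy=%#x fixed=%#x\n", buggy, fixed); /* 0x1 vs 0 */
    return 0;
}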
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e8491c504f8..116a5053415b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
527 unsigned long address = __vma_address(page, vma); 527 unsigned long address = __vma_address(page, vma);
528 528
529 /* page should be within @vma mapping range */ 529 /* page should be within @vma mapping range */
530 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 530 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
531 531
532 return address; 532 return address;
533} 533}
@@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page,
897 struct anon_vma *anon_vma = vma->anon_vma; 897 struct anon_vma *anon_vma = vma->anon_vma;
898 898
899 VM_BUG_ON_PAGE(!PageLocked(page), page); 899 VM_BUG_ON_PAGE(!PageLocked(page), page);
900 VM_BUG_ON(!anon_vma); 900 VM_BUG_ON_VMA(!anon_vma, vma);
901 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); 901 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
902 902
903 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 903 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page,
1024void page_add_new_anon_rmap(struct page *page, 1024void page_add_new_anon_rmap(struct page *page,
1025 struct vm_area_struct *vma, unsigned long address) 1025 struct vm_area_struct *vma, unsigned long address)
1026{ 1026{
1027 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1027 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1028 SetPageSwapBacked(page); 1028 SetPageSwapBacked(page);
1029 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1029 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
1030 if (PageTransHuge(page)) 1030 if (PageTransHuge(page))
@@ -1355,7 +1355,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1355 continue; /* don't unmap */ 1355 continue; /* don't unmap */
1356 } 1356 }
1357 1357
1358 if (ptep_clear_flush_young_notify(vma, address, pte)) 1358 /*
1359 * No need for _notify because we're within an
1360 * mmu_notifier_invalidate_range_ {start|end} scope.
1361 */
1362 if (ptep_clear_flush_young(vma, address, pte))
1359 continue; 1363 continue;
1360 1364
1361 /* Nuke the page table entry. */ 1365 /* Nuke the page table entry. */
@@ -1666,7 +1670,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1666 * structure at mapping cannot be freed and reused yet, 1670 * structure at mapping cannot be freed and reused yet,
1667 * so we can safely take mapping->i_mmap_mutex. 1671 * so we can safely take mapping->i_mmap_mutex.
1668 */ 1672 */
1669 VM_BUG_ON(!PageLocked(page)); 1673 VM_BUG_ON_PAGE(!PageLocked(page), page);
1670 1674
1671 if (!mapping) 1675 if (!mapping)
1672 return ret; 1676 return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 0e5fb225007c..cd6fc7590e54 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2367,8 +2367,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
2367 2367
2368 if (new_dentry->d_inode) { 2368 if (new_dentry->d_inode) {
2369 (void) shmem_unlink(new_dir, new_dentry); 2369 (void) shmem_unlink(new_dir, new_dentry);
2370 if (they_are_dirs) 2370 if (they_are_dirs) {
2371 drop_nlink(new_dentry->d_inode);
2371 drop_nlink(old_dir); 2372 drop_nlink(old_dir);
2373 }
2372 } else if (they_are_dirs) { 2374 } else if (they_are_dirs) {
2373 drop_nlink(old_dir); 2375 drop_nlink(old_dir);
2374 inc_nlink(new_dir); 2376 inc_nlink(new_dir);
@@ -2993,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2993#endif 2995#endif
2994 2996
2995 spin_lock_init(&sbinfo->stat_lock); 2997 spin_lock_init(&sbinfo->stat_lock);
2996 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2998 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
2997 goto failed; 2999 goto failed;
2998 sbinfo->free_inodes = sbinfo->max_inodes; 3000 sbinfo->free_inodes = sbinfo->max_inodes;
2999 3001
@@ -3075,7 +3077,9 @@ static const struct address_space_operations shmem_aops = {
3075 .write_begin = shmem_write_begin, 3077 .write_begin = shmem_write_begin,
3076 .write_end = shmem_write_end, 3078 .write_end = shmem_write_end,
3077#endif 3079#endif
3080#ifdef CONFIG_MIGRATION
3078 .migratepage = migrate_page, 3081 .migratepage = migrate_page,
3082#endif
3079 .error_remove_page = generic_error_remove_page, 3083 .error_remove_page = generic_error_remove_page,
3080}; 3084};
3081 3085
diff --git a/mm/slab.c b/mm/slab.c
index a467b308c682..154aac8411c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -237,11 +237,10 @@ struct arraycache_init {
237/* 237/*
238 * Need this for bootstrapping a per node allocator. 238 * Need this for bootstrapping a per node allocator.
239 */ 239 */
240#define NUM_INIT_LISTS (3 * MAX_NUMNODES) 240#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
241static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; 241static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
242#define CACHE_CACHE 0 242#define CACHE_CACHE 0
243#define SIZE_AC MAX_NUMNODES 243#define SIZE_NODE (MAX_NUMNODES)
244#define SIZE_NODE (2 * MAX_NUMNODES)
245 244
246static int drain_freelist(struct kmem_cache *cache, 245static int drain_freelist(struct kmem_cache *cache,
247 struct kmem_cache_node *n, int tofree); 246 struct kmem_cache_node *n, int tofree);
@@ -253,7 +252,6 @@ static void cache_reap(struct work_struct *unused);
253 252
254static int slab_early_init = 1; 253static int slab_early_init = 1;
255 254
256#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
257#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) 255#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
258 256
259static void kmem_cache_node_init(struct kmem_cache_node *parent) 257static void kmem_cache_node_init(struct kmem_cache_node *parent)
@@ -458,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
458 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 456 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
459} 457}
460 458
461static struct arraycache_init initarray_generic =
462 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
463
464/* internal cache of cache description objs */ 459/* internal cache of cache description objs */
465static struct kmem_cache kmem_cache_boot = { 460static struct kmem_cache kmem_cache_boot = {
466 .batchcount = 1, 461 .batchcount = 1,
@@ -476,7 +471,7 @@ static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
476 471
477static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 472static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
478{ 473{
479 return cachep->array[smp_processor_id()]; 474 return this_cpu_ptr(cachep->cpu_cache);
480} 475}
481 476
482static size_t calculate_freelist_size(int nr_objs, size_t align) 477static size_t calculate_freelist_size(int nr_objs, size_t align)
@@ -785,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep,
785 return objp; 780 return objp;
786} 781}
787 782
788static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 783static noinline void *__ac_put_obj(struct kmem_cache *cachep,
789 void *objp) 784 struct array_cache *ac, void *objp)
790{ 785{
791 if (unlikely(pfmemalloc_active)) { 786 if (unlikely(pfmemalloc_active)) {
792 /* Some pfmemalloc slabs exist, check if this is one */ 787 /* Some pfmemalloc slabs exist, check if this is one */
@@ -984,46 +979,50 @@ static void drain_alien_cache(struct kmem_cache *cachep,
984 } 979 }
985} 980}
986 981
987static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 982static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
983 int node, int page_node)
988{ 984{
989 int nodeid = page_to_nid(virt_to_page(objp));
990 struct kmem_cache_node *n; 985 struct kmem_cache_node *n;
991 struct alien_cache *alien = NULL; 986 struct alien_cache *alien = NULL;
992 struct array_cache *ac; 987 struct array_cache *ac;
993 int node;
994 LIST_HEAD(list); 988 LIST_HEAD(list);
995 989
996 node = numa_mem_id();
997
998 /*
999 * Make sure we are not freeing a object from another node to the array
1000 * cache on this cpu.
1001 */
1002 if (likely(nodeid == node))
1003 return 0;
1004
1005 n = get_node(cachep, node); 990 n = get_node(cachep, node);
1006 STATS_INC_NODEFREES(cachep); 991 STATS_INC_NODEFREES(cachep);
1007 if (n->alien && n->alien[nodeid]) { 992 if (n->alien && n->alien[page_node]) {
1008 alien = n->alien[nodeid]; 993 alien = n->alien[page_node];
1009 ac = &alien->ac; 994 ac = &alien->ac;
1010 spin_lock(&alien->lock); 995 spin_lock(&alien->lock);
1011 if (unlikely(ac->avail == ac->limit)) { 996 if (unlikely(ac->avail == ac->limit)) {
1012 STATS_INC_ACOVERFLOW(cachep); 997 STATS_INC_ACOVERFLOW(cachep);
1013 __drain_alien_cache(cachep, ac, nodeid, &list); 998 __drain_alien_cache(cachep, ac, page_node, &list);
1014 } 999 }
1015 ac_put_obj(cachep, ac, objp); 1000 ac_put_obj(cachep, ac, objp);
1016 spin_unlock(&alien->lock); 1001 spin_unlock(&alien->lock);
1017 slabs_destroy(cachep, &list); 1002 slabs_destroy(cachep, &list);
1018 } else { 1003 } else {
1019 n = get_node(cachep, nodeid); 1004 n = get_node(cachep, page_node);
1020 spin_lock(&n->list_lock); 1005 spin_lock(&n->list_lock);
1021 free_block(cachep, &objp, 1, nodeid, &list); 1006 free_block(cachep, &objp, 1, page_node, &list);
1022 spin_unlock(&n->list_lock); 1007 spin_unlock(&n->list_lock);
1023 slabs_destroy(cachep, &list); 1008 slabs_destroy(cachep, &list);
1024 } 1009 }
1025 return 1; 1010 return 1;
1026} 1011}
1012
1013static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1014{
1015 int page_node = page_to_nid(virt_to_page(objp));
1016 int node = numa_mem_id();
1017 /*
 1018 * Make sure we are not freeing an object from another node to the array
1019 * cache on this cpu.
1020 */
1021 if (likely(node == page_node))
1022 return 0;
1023
1024 return __cache_free_alien(cachep, objp, node, page_node);
1025}
1027#endif 1026#endif
1028 1027
1029/* 1028/*
@@ -1092,24 +1091,25 @@ static void cpuup_canceled(long cpu)
1092 struct alien_cache **alien; 1091 struct alien_cache **alien;
1093 LIST_HEAD(list); 1092 LIST_HEAD(list);
1094 1093
1095 /* cpu is dead; no one can alloc from it. */
1096 nc = cachep->array[cpu];
1097 cachep->array[cpu] = NULL;
1098 n = get_node(cachep, node); 1094 n = get_node(cachep, node);
1099
1100 if (!n) 1095 if (!n)
1101 goto free_array_cache; 1096 continue;
1102 1097
1103 spin_lock_irq(&n->list_lock); 1098 spin_lock_irq(&n->list_lock);
1104 1099
1105 /* Free limit for this kmem_cache_node */ 1100 /* Free limit for this kmem_cache_node */
1106 n->free_limit -= cachep->batchcount; 1101 n->free_limit -= cachep->batchcount;
1107 if (nc) 1102
1103 /* cpu is dead; no one can alloc from it. */
1104 nc = per_cpu_ptr(cachep->cpu_cache, cpu);
1105 if (nc) {
1108 free_block(cachep, nc->entry, nc->avail, node, &list); 1106 free_block(cachep, nc->entry, nc->avail, node, &list);
1107 nc->avail = 0;
1108 }
1109 1109
1110 if (!cpumask_empty(mask)) { 1110 if (!cpumask_empty(mask)) {
1111 spin_unlock_irq(&n->list_lock); 1111 spin_unlock_irq(&n->list_lock);
1112 goto free_array_cache; 1112 goto free_slab;
1113 } 1113 }
1114 1114
1115 shared = n->shared; 1115 shared = n->shared;
@@ -1129,9 +1129,9 @@ static void cpuup_canceled(long cpu)
1129 drain_alien_cache(cachep, alien); 1129 drain_alien_cache(cachep, alien);
1130 free_alien_cache(alien); 1130 free_alien_cache(alien);
1131 } 1131 }
1132free_array_cache: 1132
1133free_slab:
1133 slabs_destroy(cachep, &list); 1134 slabs_destroy(cachep, &list);
1134 kfree(nc);
1135 } 1135 }
1136 /* 1136 /*
1137 * In the previous loop, all the objects were freed to 1137 * In the previous loop, all the objects were freed to
@@ -1168,32 +1168,23 @@ static int cpuup_prepare(long cpu)
1168 * array caches 1168 * array caches
1169 */ 1169 */
1170 list_for_each_entry(cachep, &slab_caches, list) { 1170 list_for_each_entry(cachep, &slab_caches, list) {
1171 struct array_cache *nc;
1172 struct array_cache *shared = NULL; 1171 struct array_cache *shared = NULL;
1173 struct alien_cache **alien = NULL; 1172 struct alien_cache **alien = NULL;
1174 1173
1175 nc = alloc_arraycache(node, cachep->limit,
1176 cachep->batchcount, GFP_KERNEL);
1177 if (!nc)
1178 goto bad;
1179 if (cachep->shared) { 1174 if (cachep->shared) {
1180 shared = alloc_arraycache(node, 1175 shared = alloc_arraycache(node,
1181 cachep->shared * cachep->batchcount, 1176 cachep->shared * cachep->batchcount,
1182 0xbaadf00d, GFP_KERNEL); 1177 0xbaadf00d, GFP_KERNEL);
1183 if (!shared) { 1178 if (!shared)
1184 kfree(nc);
1185 goto bad; 1179 goto bad;
1186 }
1187 } 1180 }
1188 if (use_alien_caches) { 1181 if (use_alien_caches) {
1189 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1182 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1190 if (!alien) { 1183 if (!alien) {
1191 kfree(shared); 1184 kfree(shared);
1192 kfree(nc);
1193 goto bad; 1185 goto bad;
1194 } 1186 }
1195 } 1187 }
1196 cachep->array[cpu] = nc;
1197 n = get_node(cachep, node); 1188 n = get_node(cachep, node);
1198 BUG_ON(!n); 1189 BUG_ON(!n);
1199 1190
@@ -1385,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
1385} 1376}
1386 1377
1387/* 1378/*
1388 * The memory after the last cpu cache pointer is used for the
1389 * the node pointer.
1390 */
1391static void setup_node_pointer(struct kmem_cache *cachep)
1392{
1393 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
1394}
1395
1396/*
1397 * Initialisation. Called after the page allocator have been initialised and 1379 * Initialisation. Called after the page allocator have been initialised and
1398 * before smp_init(). 1380 * before smp_init().
1399 */ 1381 */
@@ -1404,7 +1386,6 @@ void __init kmem_cache_init(void)
1404 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < 1386 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
1405 sizeof(struct rcu_head)); 1387 sizeof(struct rcu_head));
1406 kmem_cache = &kmem_cache_boot; 1388 kmem_cache = &kmem_cache_boot;
1407 setup_node_pointer(kmem_cache);
1408 1389
1409 if (num_possible_nodes() == 1) 1390 if (num_possible_nodes() == 1)
1410 use_alien_caches = 0; 1391 use_alien_caches = 0;
@@ -1412,8 +1393,6 @@ void __init kmem_cache_init(void)
1412 for (i = 0; i < NUM_INIT_LISTS; i++) 1393 for (i = 0; i < NUM_INIT_LISTS; i++)
1413 kmem_cache_node_init(&init_kmem_cache_node[i]); 1394 kmem_cache_node_init(&init_kmem_cache_node[i]);
1414 1395
1415 set_up_node(kmem_cache, CACHE_CACHE);
1416
1417 /* 1396 /*
1418 * Fragmentation resistance on low memory - only use bigger 1397 * Fragmentation resistance on low memory - only use bigger
1419 * page orders on machines with more than 32MB of memory if 1398 * page orders on machines with more than 32MB of memory if
@@ -1448,49 +1427,22 @@ void __init kmem_cache_init(void)
1448 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1427 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1449 */ 1428 */
1450 create_boot_cache(kmem_cache, "kmem_cache", 1429 create_boot_cache(kmem_cache, "kmem_cache",
1451 offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1430 offsetof(struct kmem_cache, node) +
1452 nr_node_ids * sizeof(struct kmem_cache_node *), 1431 nr_node_ids * sizeof(struct kmem_cache_node *),
1453 SLAB_HWCACHE_ALIGN); 1432 SLAB_HWCACHE_ALIGN);
1454 list_add(&kmem_cache->list, &slab_caches); 1433 list_add(&kmem_cache->list, &slab_caches);
1455 1434 slab_state = PARTIAL;
1456 /* 2+3) create the kmalloc caches */
1457 1435
1458 /* 1436 /*
1459 * Initialize the caches that provide memory for the array cache and the 1437 * Initialize the caches that provide memory for the kmem_cache_node
1460 * kmem_cache_node structures first. Without this, further allocations will 1438 * structures first. Without this, further allocations will bug.
1461 * bug.
1462 */ 1439 */
1463 1440 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
1464 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
1465 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
1466
1467 if (INDEX_AC != INDEX_NODE)
1468 kmalloc_caches[INDEX_NODE] =
1469 create_kmalloc_cache("kmalloc-node",
1470 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1441 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1442 slab_state = PARTIAL_NODE;
1471 1443
1472 slab_early_init = 0; 1444 slab_early_init = 0;
1473 1445
1474 /* 4) Replace the bootstrap head arrays */
1475 {
1476 struct array_cache *ptr;
1477
1478 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1479
1480 memcpy(ptr, cpu_cache_get(kmem_cache),
1481 sizeof(struct arraycache_init));
1482
1483 kmem_cache->array[smp_processor_id()] = ptr;
1484
1485 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1486
1487 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
1488 != &initarray_generic.cache);
1489 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1490 sizeof(struct arraycache_init));
1491
1492 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1493 }
1494 /* 5) Replace the bootstrap kmem_cache_node */ 1446 /* 5) Replace the bootstrap kmem_cache_node */
1495 { 1447 {
1496 int nid; 1448 int nid;
@@ -1498,13 +1450,8 @@ void __init kmem_cache_init(void)
1498 for_each_online_node(nid) { 1450 for_each_online_node(nid) {
1499 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1451 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1500 1452
1501 init_list(kmalloc_caches[INDEX_AC], 1453 init_list(kmalloc_caches[INDEX_NODE],
1502 &init_kmem_cache_node[SIZE_AC + nid], nid);
1503
1504 if (INDEX_AC != INDEX_NODE) {
1505 init_list(kmalloc_caches[INDEX_NODE],
1506 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1454 &init_kmem_cache_node[SIZE_NODE + nid], nid);
1507 }
1508 } 1455 }
1509 } 1456 }
1510 1457
@@ -2037,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2037 return left_over; 1984 return left_over;
2038} 1985}
2039 1986
1987static struct array_cache __percpu *alloc_kmem_cache_cpus(
1988 struct kmem_cache *cachep, int entries, int batchcount)
1989{
1990 int cpu;
1991 size_t size;
1992 struct array_cache __percpu *cpu_cache;
1993
1994 size = sizeof(void *) * entries + sizeof(struct array_cache);
1995 cpu_cache = __alloc_percpu(size, 0);
1996
1997 if (!cpu_cache)
1998 return NULL;
1999
2000 for_each_possible_cpu(cpu) {
2001 init_arraycache(per_cpu_ptr(cpu_cache, cpu),
2002 entries, batchcount);
2003 }
2004
2005 return cpu_cache;
2006}
2007
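Each per-CPU copy is sized as the struct array_cache header plus `entries` trailing object pointers, the usual flexible-array layout. The sizing arithmetic on its own, with a cut-down stand-in for the real structure:

#include <stdio.h>
#include <stdlib.h>

/* cut-down stand-in for struct array_cache: header plus trailing pointers */
struct acache {
    unsigned int avail, limit, batchcount, touched;
    void *entry[];                         /* flexible array member */
};

static struct acache *alloc_acache(int entries)
{
    size_t size = sizeof(struct acache) + sizeof(void *) * entries;
    struct acache *ac = calloc(1, size);   /* one copy; the kernel does this per CPU */

    if (ac)
        ac->limit = entries;
    return ac;
}

int main(void)
{
    struct acache *ac = alloc_acache(120);

    printf("header %zu + 120 slots -> %zu bytes\n",
           sizeof(struct acache), sizeof(struct acache) + 120 * sizeof(void *));
    free(ac);
    return 0;
}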
2040static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2008static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2041{ 2009{
2042 if (slab_state >= FULL) 2010 if (slab_state >= FULL)
2043 return enable_cpucache(cachep, gfp); 2011 return enable_cpucache(cachep, gfp);
2044 2012
2013 cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
2014 if (!cachep->cpu_cache)
2015 return 1;
2016
2045 if (slab_state == DOWN) { 2017 if (slab_state == DOWN) {
2046 /* 2018 /* Creation of first cache (kmem_cache). */
2047 * Note: Creation of first cache (kmem_cache). 2019 set_up_node(kmem_cache, CACHE_CACHE);
2048 * The setup_node is taken care
2049 * of by the caller of __kmem_cache_create
2050 */
2051 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2052 slab_state = PARTIAL;
2053 } else if (slab_state == PARTIAL) { 2020 } else if (slab_state == PARTIAL) {
2054 /* 2021 /* For kmem_cache_node */
2055 * Note: the second kmem_cache_create must create the cache 2022 set_up_node(cachep, SIZE_NODE);
2056 * that's used by kmalloc(24), otherwise the creation of
2057 * further caches will BUG().
2058 */
2059 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2060
2061 /*
2062 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
2063 * the second cache, then we need to set up all its node/,
2064 * otherwise the creation of further caches will BUG().
2065 */
2066 set_up_node(cachep, SIZE_AC);
2067 if (INDEX_AC == INDEX_NODE)
2068 slab_state = PARTIAL_NODE;
2069 else
2070 slab_state = PARTIAL_ARRAYCACHE;
2071 } else { 2023 } else {
2072 /* Remaining boot caches */ 2024 int node;
2073 cachep->array[smp_processor_id()] =
2074 kmalloc(sizeof(struct arraycache_init), gfp);
2075 2025
2076 if (slab_state == PARTIAL_ARRAYCACHE) { 2026 for_each_online_node(node) {
2077 set_up_node(cachep, SIZE_NODE); 2027 cachep->node[node] = kmalloc_node(
2078 slab_state = PARTIAL_NODE; 2028 sizeof(struct kmem_cache_node), gfp, node);
2079 } else { 2029 BUG_ON(!cachep->node[node]);
2080 int node; 2030 kmem_cache_node_init(cachep->node[node]);
2081 for_each_online_node(node) {
2082 cachep->node[node] =
2083 kmalloc_node(sizeof(struct kmem_cache_node),
2084 gfp, node);
2085 BUG_ON(!cachep->node[node]);
2086 kmem_cache_node_init(cachep->node[node]);
2087 }
2088 } 2031 }
2089 } 2032 }
2033
2090 cachep->node[numa_mem_id()]->next_reap = 2034 cachep->node[numa_mem_id()]->next_reap =
2091 jiffies + REAPTIMEOUT_NODE + 2035 jiffies + REAPTIMEOUT_NODE +
2092 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 2036 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
@@ -2100,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2100 return 0; 2044 return 0;
2101} 2045}
2102 2046
2047unsigned long kmem_cache_flags(unsigned long object_size,
2048 unsigned long flags, const char *name,
2049 void (*ctor)(void *))
2050{
2051 return flags;
2052}
2053
2054struct kmem_cache *
2055__kmem_cache_alias(const char *name, size_t size, size_t align,
2056 unsigned long flags, void (*ctor)(void *))
2057{
2058 struct kmem_cache *cachep;
2059
2060 cachep = find_mergeable(size, align, flags, name, ctor);
2061 if (cachep) {
2062 cachep->refcount++;
2063
2064 /*
2065 * Adjust the object sizes so that we clear
2066 * the complete object on kzalloc.
2067 */
2068 cachep->object_size = max_t(int, cachep->object_size, size);
2069 }
2070 return cachep;
2071}
2072
2103/** 2073/**
2104 * __kmem_cache_create - Create a cache. 2074 * __kmem_cache_create - Create a cache.
2105 * @cachep: cache management descriptor 2075 * @cachep: cache management descriptor
@@ -2124,7 +2094,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2124int 2094int
2125__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2095__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2126{ 2096{
2127 size_t left_over, freelist_size, ralign; 2097 size_t left_over, freelist_size;
2098 size_t ralign = BYTES_PER_WORD;
2128 gfp_t gfp; 2099 gfp_t gfp;
2129 int err; 2100 int err;
2130 size_t size = cachep->size; 2101 size_t size = cachep->size;
@@ -2157,14 +2128,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2157 size &= ~(BYTES_PER_WORD - 1); 2128 size &= ~(BYTES_PER_WORD - 1);
2158 } 2129 }
2159 2130
2160 /*
2161 * Redzoning and user store require word alignment or possibly larger.
2162 * Note this will be overridden by architecture or caller mandated
2163 * alignment if either is greater than BYTES_PER_WORD.
2164 */
2165 if (flags & SLAB_STORE_USER)
2166 ralign = BYTES_PER_WORD;
2167
2168 if (flags & SLAB_RED_ZONE) { 2131 if (flags & SLAB_RED_ZONE) {
2169 ralign = REDZONE_ALIGN; 2132 ralign = REDZONE_ALIGN;
2170 /* If redzoning, ensure that the second redzone is suitably 2133 /* If redzoning, ensure that the second redzone is suitably
@@ -2190,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2190 else 2153 else
2191 gfp = GFP_NOWAIT; 2154 gfp = GFP_NOWAIT;
2192 2155
2193 setup_node_pointer(cachep);
2194#if DEBUG 2156#if DEBUG
2195 2157
2196 /* 2158 /*
@@ -2447,8 +2409,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2447 if (rc) 2409 if (rc)
2448 return rc; 2410 return rc;
2449 2411
2450 for_each_online_cpu(i) 2412 free_percpu(cachep->cpu_cache);
2451 kfree(cachep->array[i]);
2452 2413
2453 /* NUMA: free the node structures */ 2414 /* NUMA: free the node structures */
2454 for_each_kmem_cache_node(cachep, i, n) { 2415 for_each_kmem_cache_node(cachep, i, n) {
@@ -2994,7 +2955,7 @@ out:
2994 2955
2995#ifdef CONFIG_NUMA 2956#ifdef CONFIG_NUMA
2996/* 2957/*
2997 * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set. 2958 * Try allocating on another node if PFA_SPREAD_SLAB or a mempolicy is set.
2998 * 2959 *
2999 * If we are in_interrupt, then process context, including cpusets and 2960 * If we are in_interrupt, then process context, including cpusets and
3000 * mempolicy, may not apply and should not be used for allocation policy. 2961 * mempolicy, may not apply and should not be used for allocation policy.
@@ -3226,7 +3187,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3226{ 3187{
3227 void *objp; 3188 void *objp;
3228 3189
3229 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { 3190 if (current->mempolicy || cpuset_do_slab_mem_spread()) {
3230 objp = alternate_node_alloc(cache, flags); 3191 objp = alternate_node_alloc(cache, flags);
3231 if (objp) 3192 if (objp)
3232 goto out; 3193 goto out;
@@ -3406,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3406 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3367 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3407 return; 3368 return;
3408 3369
3409 if (likely(ac->avail < ac->limit)) { 3370 if (ac->avail < ac->limit) {
3410 STATS_INC_FREEHIT(cachep); 3371 STATS_INC_FREEHIT(cachep);
3411 } else { 3372 } else {
3412 STATS_INC_FREEMISS(cachep); 3373 STATS_INC_FREEMISS(cachep);
@@ -3503,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3503 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3464 return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3504} 3465}
3505 3466
3506#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3507void *__kmalloc_node(size_t size, gfp_t flags, int node) 3467void *__kmalloc_node(size_t size, gfp_t flags, int node)
3508{ 3468{
3509 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3469 return __do_kmalloc_node(size, flags, node, _RET_IP_);
@@ -3516,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3516 return __do_kmalloc_node(size, flags, node, caller); 3476 return __do_kmalloc_node(size, flags, node, caller);
3517} 3477}
3518EXPORT_SYMBOL(__kmalloc_node_track_caller); 3478EXPORT_SYMBOL(__kmalloc_node_track_caller);
3519#else
3520void *__kmalloc_node(size_t size, gfp_t flags, int node)
3521{
3522 return __do_kmalloc_node(size, flags, node, 0);
3523}
3524EXPORT_SYMBOL(__kmalloc_node);
3525#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3526#endif /* CONFIG_NUMA */ 3479#endif /* CONFIG_NUMA */
3527 3480
3528/** 3481/**
@@ -3548,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3548 return ret; 3501 return ret;
3549} 3502}
3550 3503
3551
3552#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3553void *__kmalloc(size_t size, gfp_t flags) 3504void *__kmalloc(size_t size, gfp_t flags)
3554{ 3505{
3555 return __do_kmalloc(size, flags, _RET_IP_); 3506 return __do_kmalloc(size, flags, _RET_IP_);
@@ -3562,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3562} 3513}
3563EXPORT_SYMBOL(__kmalloc_track_caller); 3514EXPORT_SYMBOL(__kmalloc_track_caller);
3564 3515
3565#else
3566void *__kmalloc(size_t size, gfp_t flags)
3567{
3568 return __do_kmalloc(size, flags, 0);
3569}
3570EXPORT_SYMBOL(__kmalloc);
3571#endif
3572
3573/** 3516/**
3574 * kmem_cache_free - Deallocate an object 3517 * kmem_cache_free - Deallocate an object
3575 * @cachep: The cache the allocation was from. 3518 * @cachep: The cache the allocation was from.
@@ -3714,72 +3657,45 @@ fail:
3714 return -ENOMEM; 3657 return -ENOMEM;
3715} 3658}
3716 3659
3717struct ccupdate_struct {
3718 struct kmem_cache *cachep;
3719 struct array_cache *new[0];
3720};
3721
3722static void do_ccupdate_local(void *info)
3723{
3724 struct ccupdate_struct *new = info;
3725 struct array_cache *old;
3726
3727 check_irq_off();
3728 old = cpu_cache_get(new->cachep);
3729
3730 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3731 new->new[smp_processor_id()] = old;
3732}
3733
3734/* Always called with the slab_mutex held */ 3660/* Always called with the slab_mutex held */
3735static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, 3661static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3736 int batchcount, int shared, gfp_t gfp) 3662 int batchcount, int shared, gfp_t gfp)
3737{ 3663{
3738 struct ccupdate_struct *new; 3664 struct array_cache __percpu *cpu_cache, *prev;
3739 int i; 3665 int cpu;
3740 3666
3741 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), 3667 cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
3742 gfp); 3668 if (!cpu_cache)
3743 if (!new)
3744 return -ENOMEM; 3669 return -ENOMEM;
3745 3670
3746 for_each_online_cpu(i) { 3671 prev = cachep->cpu_cache;
3747 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, 3672 cachep->cpu_cache = cpu_cache;
3748 batchcount, gfp); 3673 kick_all_cpus_sync();
3749 if (!new->new[i]) {
3750 for (i--; i >= 0; i--)
3751 kfree(new->new[i]);
3752 kfree(new);
3753 return -ENOMEM;
3754 }
3755 }
3756 new->cachep = cachep;
3757
3758 on_each_cpu(do_ccupdate_local, (void *)new, 1);
3759 3674
3760 check_irq_on(); 3675 check_irq_on();
3761 cachep->batchcount = batchcount; 3676 cachep->batchcount = batchcount;
3762 cachep->limit = limit; 3677 cachep->limit = limit;
3763 cachep->shared = shared; 3678 cachep->shared = shared;
3764 3679
3765 for_each_online_cpu(i) { 3680 if (!prev)
3681 goto alloc_node;
3682
3683 for_each_online_cpu(cpu) {
3766 LIST_HEAD(list); 3684 LIST_HEAD(list);
3767 struct array_cache *ccold = new->new[i];
3768 int node; 3685 int node;
3769 struct kmem_cache_node *n; 3686 struct kmem_cache_node *n;
3687 struct array_cache *ac = per_cpu_ptr(prev, cpu);
3770 3688
3771 if (!ccold) 3689 node = cpu_to_mem(cpu);
3772 continue;
3773
3774 node = cpu_to_mem(i);
3775 n = get_node(cachep, node); 3690 n = get_node(cachep, node);
3776 spin_lock_irq(&n->list_lock); 3691 spin_lock_irq(&n->list_lock);
3777 free_block(cachep, ccold->entry, ccold->avail, node, &list); 3692 free_block(cachep, ac->entry, ac->avail, node, &list);
3778 spin_unlock_irq(&n->list_lock); 3693 spin_unlock_irq(&n->list_lock);
3779 slabs_destroy(cachep, &list); 3694 slabs_destroy(cachep, &list);
3780 kfree(ccold);
3781 } 3695 }
3782 kfree(new); 3696 free_percpu(prev);
3697
3698alloc_node:
3783 return alloc_kmem_cache_node(cachep, gfp); 3699 return alloc_kmem_cache_node(cachep, gfp);
3784} 3700}
3785 3701
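Retuning no longer IPIs a do_ccupdate_local() swap onto every CPU: it publishes the new percpu arrays, uses kick_all_cpus_sync() to wait until no CPU can still be using a pointer fetched from the old ones, and only then drains and frees the old copies. The shape of that publish-then-drain pattern as a hedged sketch (locking, node-list flushing and error handling trimmed; not the file's code):

/* sketch of the publish-then-drain replacement used above */
static int replace_cpu_caches(struct kmem_cache *cachep,
                              struct array_cache __percpu *newc)
{
    struct array_cache __percpu *prev = cachep->cpu_cache;
    int cpu;

    cachep->cpu_cache = newc;      /* new allocations/frees see the new arrays */
    kick_all_cpus_sync();          /* wait out anyone still using @prev */

    for_each_online_cpu(cpu) {
        struct array_cache *ac = per_cpu_ptr(prev, cpu);
        /* flush ac->entry[0..avail) back to the node lists here */
        ac->avail = 0;
    }
    free_percpu(prev);
    return 0;
}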
@@ -4262,19 +4178,15 @@ static const struct seq_operations slabstats_op = {
4262 4178
4263static int slabstats_open(struct inode *inode, struct file *file) 4179static int slabstats_open(struct inode *inode, struct file *file)
4264{ 4180{
4265 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); 4181 unsigned long *n;
4266 int ret = -ENOMEM; 4182
4267 if (n) { 4183 n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
4268 ret = seq_open(file, &slabstats_op); 4184 if (!n)
4269 if (!ret) { 4185 return -ENOMEM;
4270 struct seq_file *m = file->private_data; 4186
4271 *n = PAGE_SIZE / (2 * sizeof(unsigned long)); 4187 *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4272 m->private = n; 4188
4273 n = NULL; 4189 return 0;
4274 }
4275 kfree(n);
4276 }
4277 return ret;
4278} 4190}
4279 4191
4280static const struct file_operations proc_slabstats_operations = { 4192static const struct file_operations proc_slabstats_operations = {
diff --git a/mm/slab.h b/mm/slab.h
index 0e0fdd365840..ab019e63e3c2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,41 @@
4 * Internal slab definitions 4 * Internal slab definitions
5 */ 5 */
6 6
7#ifdef CONFIG_SLOB
8/*
9 * Common fields provided in kmem_cache by all slab allocators
10 * This struct is either used directly by the allocator (SLOB)
11 * or the allocator must include definitions for all fields
12 * provided in kmem_cache_common in their definition of kmem_cache.
13 *
14 * Once we can do anonymous structs (C11 standard) we could put an
15 * anonymous struct definition in these allocators so that the
16 * separate allocations in the kmem_cache structure of SLAB and
17 * SLUB are no longer needed.
18 */
19struct kmem_cache {
20 unsigned int object_size;/* The original size of the object */
21 unsigned int size; /* The aligned/padded/added on size */
22 unsigned int align; /* Alignment as calculated */
23 unsigned long flags; /* Active flags on the slab */
24 const char *name; /* Slab name for sysfs */
25 int refcount; /* Use counter */
26 void (*ctor)(void *); /* Called on object slot creation */
27 struct list_head list; /* List of all slab caches on the system */
28};
29
30#endif /* CONFIG_SLOB */
31
32#ifdef CONFIG_SLAB
33#include <linux/slab_def.h>
34#endif
35
36#ifdef CONFIG_SLUB
37#include <linux/slub_def.h>
38#endif
39
40#include <linux/memcontrol.h>
41
7/* 42/*
8 * State of the slab allocator. 43 * State of the slab allocator.
9 * 44 *
@@ -15,7 +50,6 @@
15enum slab_state { 50enum slab_state {
16 DOWN, /* No slab functionality yet */ 51 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */ 52 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ 53 PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
20 UP, /* Slab caches usable but not all extras yet */ 54 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */ 55 FULL /* Everything is working */
@@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
53 size_t size, unsigned long flags); 87 size_t size, unsigned long flags);
54 88
55struct mem_cgroup; 89struct mem_cgroup;
56#ifdef CONFIG_SLUB 90
91int slab_unmergeable(struct kmem_cache *s);
92struct kmem_cache *find_mergeable(size_t size, size_t align,
93 unsigned long flags, const char *name, void (*ctor)(void *));
94#ifndef CONFIG_SLOB
57struct kmem_cache * 95struct kmem_cache *
58__kmem_cache_alias(const char *name, size_t size, size_t align, 96__kmem_cache_alias(const char *name, size_t size, size_t align,
59 unsigned long flags, void (*ctor)(void *)); 97 unsigned long flags, void (*ctor)(void *));
98
99unsigned long kmem_cache_flags(unsigned long object_size,
100 unsigned long flags, const char *name,
101 void (*ctor)(void *));
60#else 102#else
61static inline struct kmem_cache * 103static inline struct kmem_cache *
62__kmem_cache_alias(const char *name, size_t size, size_t align, 104__kmem_cache_alias(const char *name, size_t size, size_t align,
63 unsigned long flags, void (*ctor)(void *)) 105 unsigned long flags, void (*ctor)(void *))
64{ return NULL; } 106{ return NULL; }
107
108static inline unsigned long kmem_cache_flags(unsigned long object_size,
109 unsigned long flags, const char *name,
110 void (*ctor)(void *))
111{
112 return flags;
113}
65#endif 114#endif
66 115
67 116
@@ -303,8 +352,8 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
303 * a kmem_cache_node structure allocated (which is true for all online nodes) 352 * a kmem_cache_node structure allocated (which is true for all online nodes)
304 */ 353 */
305#define for_each_kmem_cache_node(__s, __node, __n) \ 354#define for_each_kmem_cache_node(__s, __node, __n) \
306 for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ 355 for (__node = 0; __node < nr_node_ids; __node++) \
307 if (__n) 356 if ((__n = get_node(__s, __node)))
308 357
309#endif 358#endif
310 359
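The for_each_kmem_cache_node() fix above moves get_node() out of the loop condition, so it is only evaluated for node ids below nr_node_ids and the body only runs for nodes that actually have a kmem_cache_node. A small illustrative walker, assuming SLUB's kmem_cache_node with its nr_partial field (the helper name is ours):

static unsigned long count_partial_slabs(struct kmem_cache *s)
{
        int node;
        struct kmem_cache_node *n;
        unsigned long total = 0;

        for_each_kmem_cache_node(s, node, n)
                total += n->nr_partial;        /* n is guaranteed non-NULL here */

        return total;
}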
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d319502b2403..3a6e0cfdf03a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -30,6 +30,43 @@ LIST_HEAD(slab_caches);
30DEFINE_MUTEX(slab_mutex); 30DEFINE_MUTEX(slab_mutex);
31struct kmem_cache *kmem_cache; 31struct kmem_cache *kmem_cache;
32 32
33/*
34 * Set of flags that will prevent slab merging
35 */
36#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
37 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
38 SLAB_FAILSLAB)
39
40#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
41 SLAB_CACHE_DMA | SLAB_NOTRACK)
42
43/*
44 * Merge control. If this is set then no merging of slab caches will occur.
45 * (Could be removed. This was introduced to pacify the merge skeptics.)
46 */
47static int slab_nomerge;
48
49static int __init setup_slab_nomerge(char *str)
50{
51 slab_nomerge = 1;
52 return 1;
53}
54
55#ifdef CONFIG_SLUB
56__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
57#endif
58
59__setup("slab_nomerge", setup_slab_nomerge);
60
61/*
62 * Determine the size of a slab object
63 */
64unsigned int kmem_cache_size(struct kmem_cache *s)
65{
66 return s->object_size;
67}
68EXPORT_SYMBOL(kmem_cache_size);
69
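With the merge-control code hoisted into slab_common.c, booting with slab_nomerge (or the preserved slub_nomerge alias when SLUB is the allocator) disables cache merging regardless of which allocator is built in, and kmem_cache_size() reports a cache's original object size. A hypothetical caller, with an invented cache name and size:

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
        /* With slab_nomerge on the command line this cache is never merged
         * into a same-sized kmalloc cache.
         */
        example_cachep = kmem_cache_create("example_cache", 96, 0, 0, NULL);
        if (!example_cachep)
                return -ENOMEM;

        pr_info("example_cache object size: %u\n",
                kmem_cache_size(example_cachep));
        return 0;
}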
33#ifdef CONFIG_DEBUG_VM 70#ifdef CONFIG_DEBUG_VM
34static int kmem_cache_sanity_check(const char *name, size_t size) 71static int kmem_cache_sanity_check(const char *name, size_t size)
35{ 72{
@@ -79,6 +116,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
79#endif 116#endif
80 117
81#ifdef CONFIG_MEMCG_KMEM 118#ifdef CONFIG_MEMCG_KMEM
119static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
120 struct kmem_cache *s, struct kmem_cache *root_cache)
121{
122 size_t size;
123
124 if (!memcg_kmem_enabled())
125 return 0;
126
127 if (!memcg) {
128 size = offsetof(struct memcg_cache_params, memcg_caches);
129 size += memcg_limited_groups_array_size * sizeof(void *);
130 } else
131 size = sizeof(struct memcg_cache_params);
132
133 s->memcg_params = kzalloc(size, GFP_KERNEL);
134 if (!s->memcg_params)
135 return -ENOMEM;
136
137 if (memcg) {
138 s->memcg_params->memcg = memcg;
139 s->memcg_params->root_cache = root_cache;
140 } else
141 s->memcg_params->is_root_cache = true;
142
143 return 0;
144}
145
146static void memcg_free_cache_params(struct kmem_cache *s)
147{
148 kfree(s->memcg_params);
149}
150
151static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
152{
153 int size;
154 struct memcg_cache_params *new_params, *cur_params;
155
156 BUG_ON(!is_root_cache(s));
157
158 size = offsetof(struct memcg_cache_params, memcg_caches);
159 size += num_memcgs * sizeof(void *);
160
161 new_params = kzalloc(size, GFP_KERNEL);
162 if (!new_params)
163 return -ENOMEM;
164
165 cur_params = s->memcg_params;
166 memcpy(new_params->memcg_caches, cur_params->memcg_caches,
167 memcg_limited_groups_array_size * sizeof(void *));
168
169 new_params->is_root_cache = true;
170
171 rcu_assign_pointer(s->memcg_params, new_params);
172 if (cur_params)
173 kfree_rcu(cur_params, rcu_head);
174
175 return 0;
176}
177
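memcg_update_cache_params() above resizes the root cache's memcg_caches array with the usual RCU publish-and-reclaim sequence: allocate the larger copy, fill it, publish it with rcu_assign_pointer(), and hand the old array to kfree_rcu() so readers that still hold it stay safe until a grace period passes. A generic sketch of that pattern, with hypothetical types and an assumed external lock serializing writers:

struct table {
        struct rcu_head rcu;
        int nr;
        void *slots[];                  /* flexible array, like memcg_caches[] */
};

static int grow_table(struct table __rcu **tablep, int new_nr)
{
        struct table *old, *new;

        new = kzalloc(sizeof(*new) + new_nr * sizeof(void *), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        old = rcu_dereference_protected(*tablep, 1);   /* writers are serialized */
        if (old)
                memcpy(new->slots, old->slots, old->nr * sizeof(void *));
        new->nr = new_nr;

        rcu_assign_pointer(*tablep, new);              /* publish the new copy */
        if (old)
                kfree_rcu(old, rcu);                   /* free after a grace period */
        return 0;
}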
82int memcg_update_all_caches(int num_memcgs) 178int memcg_update_all_caches(int num_memcgs)
83{ 179{
84 struct kmem_cache *s; 180 struct kmem_cache *s;
@@ -89,9 +185,8 @@ int memcg_update_all_caches(int num_memcgs)
89 if (!is_root_cache(s)) 185 if (!is_root_cache(s))
90 continue; 186 continue;
91 187
92 ret = memcg_update_cache_size(s, num_memcgs); 188 ret = memcg_update_cache_params(s, num_memcgs);
93 /* 189 /*
94 * See comment in memcontrol.c, memcg_update_cache_size:
95 * Instead of freeing the memory, we'll just leave the caches 190 * Instead of freeing the memory, we'll just leave the caches
96 * up to this point in an updated state. 191 * up to this point in an updated state.
97 */ 192 */
@@ -104,7 +199,80 @@ out:
104 mutex_unlock(&slab_mutex); 199 mutex_unlock(&slab_mutex);
105 return ret; 200 return ret;
106} 201}
107#endif 202#else
203static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
204 struct kmem_cache *s, struct kmem_cache *root_cache)
205{
206 return 0;
207}
208
209static inline void memcg_free_cache_params(struct kmem_cache *s)
210{
211}
212#endif /* CONFIG_MEMCG_KMEM */
213
214/*
215 * Find a mergeable slab cache
216 */
217int slab_unmergeable(struct kmem_cache *s)
218{
219 if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
220 return 1;
221
222 if (!is_root_cache(s))
223 return 1;
224
225 if (s->ctor)
226 return 1;
227
228 /*
229 * We may have set a slab to be unmergeable during bootstrap.
230 */
231 if (s->refcount < 0)
232 return 1;
233
234 return 0;
235}
236
237struct kmem_cache *find_mergeable(size_t size, size_t align,
238 unsigned long flags, const char *name, void (*ctor)(void *))
239{
240 struct kmem_cache *s;
241
242 if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
243 return NULL;
244
245 if (ctor)
246 return NULL;
247
248 size = ALIGN(size, sizeof(void *));
249 align = calculate_alignment(flags, align, size);
250 size = ALIGN(size, align);
251 flags = kmem_cache_flags(size, flags, name, NULL);
252
253 list_for_each_entry(s, &slab_caches, list) {
254 if (slab_unmergeable(s))
255 continue;
256
257 if (size > s->size)
258 continue;
259
260 if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
261 continue;
262 /*
263 * Check if alignment is compatible.
264 * Courtesy of Adrian Drzewiecki
265 */
266 if ((s->size & ~(align - 1)) != s->size)
267 continue;
268
269 if (s->size - size >= sizeof(void *))
270 continue;
271
272 return s;
273 }
274 return NULL;
275}
108 276
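find_mergeable() is what lets a later kmem_cache_create() reuse an existing compatible cache: the requested size is rounded, alignment recomputed, debug flags folded in through kmem_cache_flags(), and a candidate only matches if it is at least as large, wastes less than one pointer of slack, agrees on the SLAB_MERGE_SAME bits, and is itself mergeable. A hedged sketch of how an allocator's __kmem_cache_alias() can sit on top of it (the real SLUB/SLAB versions also adjust object sizes and sysfs aliases):

struct kmem_cache *
example_cache_alias(const char *name, size_t size, size_t align,
                    unsigned long flags, void (*ctor)(void *))
{
        struct kmem_cache *s;

        s = find_mergeable(size, align, flags, name, ctor);
        if (s)
                s->refcount++;          /* keep the shared cache pinned */

        return s;
}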
109/* 277/*
110 * Figure out what the alignment of the objects will be given a set of 278 * Figure out what the alignment of the objects will be given a set of
@@ -211,8 +379,10 @@ kmem_cache_create(const char *name, size_t size, size_t align,
211 mutex_lock(&slab_mutex); 379 mutex_lock(&slab_mutex);
212 380
213 err = kmem_cache_sanity_check(name, size); 381 err = kmem_cache_sanity_check(name, size);
214 if (err) 382 if (err) {
383 s = NULL; /* suppress uninit var warning */
215 goto out_unlock; 384 goto out_unlock;
385 }
216 386
217 /* 387 /*
218 * Some allocators will constraint the set of valid flags to a subset 388 * Some allocators will constraint the set of valid flags to a subset
diff --git a/mm/slob.c b/mm/slob.c
index 21980e0f39a8..96a86206a26b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp)
468} 468}
469EXPORT_SYMBOL(__kmalloc); 469EXPORT_SYMBOL(__kmalloc);
470 470
471#ifdef CONFIG_TRACING
472void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) 471void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
473{ 472{
474 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); 473 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
@@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
481 return __do_kmalloc_node(size, gfp, node, caller); 480 return __do_kmalloc_node(size, gfp, node, caller);
482} 481}
483#endif 482#endif
484#endif
485 483
486void kfree(const void *block) 484void kfree(const void *block)
487{ 485{
diff --git a/mm/slub.c b/mm/slub.c
index 3e8afcc07a76..ae7b9f1ad394 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
169 */ 169 */
170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
171 171
172/*
173 * Set of flags that will prevent slab merging
174 */
175#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
176 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
177 SLAB_FAILSLAB)
178
179#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
180 SLAB_CACHE_DMA | SLAB_NOTRACK)
181
182#define OO_SHIFT 16 172#define OO_SHIFT 16
183#define OO_MASK ((1 << OO_SHIFT) - 1) 173#define OO_MASK ((1 << OO_SHIFT) - 1)
184#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 174#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
@@ -1176,7 +1166,7 @@ out:
1176 1166
1177__setup("slub_debug", setup_slub_debug); 1167__setup("slub_debug", setup_slub_debug);
1178 1168
1179static unsigned long kmem_cache_flags(unsigned long object_size, 1169unsigned long kmem_cache_flags(unsigned long object_size,
1180 unsigned long flags, const char *name, 1170 unsigned long flags, const char *name,
1181 void (*ctor)(void *)) 1171 void (*ctor)(void *))
1182{ 1172{
@@ -1208,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1208 struct page *page) {} 1198 struct page *page) {}
1209static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, 1199static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1210 struct page *page) {} 1200 struct page *page) {}
1211static inline unsigned long kmem_cache_flags(unsigned long object_size, 1201unsigned long kmem_cache_flags(unsigned long object_size,
1212 unsigned long flags, const char *name, 1202 unsigned long flags, const char *name,
1213 void (*ctor)(void *)) 1203 void (*ctor)(void *))
1214{ 1204{
@@ -1699,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1699 struct kmem_cache_cpu *c) 1689 struct kmem_cache_cpu *c)
1700{ 1690{
1701 void *object; 1691 void *object;
1702 int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; 1692 int searchnode = node;
1693
1694 if (node == NUMA_NO_NODE)
1695 searchnode = numa_mem_id();
1696 else if (!node_present_pages(node))
1697 searchnode = node_to_mem_node(node);
1703 1698
1704 object = get_partial_node(s, get_node(s, searchnode), c, flags); 1699 object = get_partial_node(s, get_node(s, searchnode), c, flags);
1705 if (object || node != NUMA_NO_NODE) 1700 if (object || node != NUMA_NO_NODE)
@@ -2280,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2280redo: 2275redo:
2281 2276
2282 if (unlikely(!node_match(page, node))) { 2277 if (unlikely(!node_match(page, node))) {
2283 stat(s, ALLOC_NODE_MISMATCH); 2278 int searchnode = node;
2284 deactivate_slab(s, page, c->freelist); 2279
2285 c->page = NULL; 2280 if (node != NUMA_NO_NODE && !node_present_pages(node))
2286 c->freelist = NULL; 2281 searchnode = node_to_mem_node(node);
2287 goto new_slab; 2282
2283 if (unlikely(!node_match(page, searchnode))) {
2284 stat(s, ALLOC_NODE_MISMATCH);
2285 deactivate_slab(s, page, c->freelist);
2286 c->page = NULL;
2287 c->freelist = NULL;
2288 goto new_slab;
2289 }
2288 } 2290 }
2289 2291
2290 /* 2292 /*
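Both slub.c hunks above handle memoryless NUMA nodes: when the requested node has no present pages, the search is redirected to its fallback node with memory via node_to_mem_node() instead of repeatedly deactivating the cpu slab. A minimal sketch of the shared fallback, assuming a NUMA build (the helper name is ours):

static int pick_search_node(int node)
{
        if (node == NUMA_NO_NODE)
                return numa_mem_id();            /* nearest node with memory to this cpu */
        if (!node_present_pages(node))
                return node_to_mem_node(node);   /* fallback for a memoryless node */
        return node;
}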
@@ -2707,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2707static int slub_min_objects; 2709static int slub_min_objects;
2708 2710
2709/* 2711/*
2710 * Merge control. If this is set then no merging of slab caches will occur.
2711 * (Could be removed. This was introduced to pacify the merge skeptics.)
2712 */
2713static int slub_nomerge;
2714
2715/*
2716 * Calculate the order of allocation given an slab object size. 2712 * Calculate the order of allocation given an slab object size.
2717 * 2713 *
2718 * The order of allocation has significant impact on performance and other 2714 * The order of allocation has significant impact on performance and other
@@ -3240,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str)
3240 3236
3241__setup("slub_min_objects=", setup_slub_min_objects); 3237__setup("slub_min_objects=", setup_slub_min_objects);
3242 3238
3243static int __init setup_slub_nomerge(char *str)
3244{
3245 slub_nomerge = 1;
3246 return 1;
3247}
3248
3249__setup("slub_nomerge", setup_slub_nomerge);
3250
3251void *__kmalloc(size_t size, gfp_t flags) 3239void *__kmalloc(size_t size, gfp_t flags)
3252{ 3240{
3253 struct kmem_cache *s; 3241 struct kmem_cache *s;
@@ -3625,69 +3613,6 @@ void __init kmem_cache_init_late(void)
3625{ 3613{
3626} 3614}
3627 3615
3628/*
3629 * Find a mergeable slab cache
3630 */
3631static int slab_unmergeable(struct kmem_cache *s)
3632{
3633 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3634 return 1;
3635
3636 if (!is_root_cache(s))
3637 return 1;
3638
3639 if (s->ctor)
3640 return 1;
3641
3642 /*
3643 * We may have set a slab to be unmergeable during bootstrap.
3644 */
3645 if (s->refcount < 0)
3646 return 1;
3647
3648 return 0;
3649}
3650
3651static struct kmem_cache *find_mergeable(size_t size, size_t align,
3652 unsigned long flags, const char *name, void (*ctor)(void *))
3653{
3654 struct kmem_cache *s;
3655
3656 if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3657 return NULL;
3658
3659 if (ctor)
3660 return NULL;
3661
3662 size = ALIGN(size, sizeof(void *));
3663 align = calculate_alignment(flags, align, size);
3664 size = ALIGN(size, align);
3665 flags = kmem_cache_flags(size, flags, name, NULL);
3666
3667 list_for_each_entry(s, &slab_caches, list) {
3668 if (slab_unmergeable(s))
3669 continue;
3670
3671 if (size > s->size)
3672 continue;
3673
3674 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3675 continue;
3676 /*
3677 * Check if alignment is compatible.
3678 * Courtesy of Adrian Drzewiecki
3679 */
3680 if ((s->size & ~(align - 1)) != s->size)
3681 continue;
3682
3683 if (s->size - size >= sizeof(void *))
3684 continue;
3685
3686 return s;
3687 }
3688 return NULL;
3689}
3690
3691struct kmem_cache * 3616struct kmem_cache *
3692__kmem_cache_alias(const char *name, size_t size, size_t align, 3617__kmem_cache_alias(const char *name, size_t size, size_t align,
3693 unsigned long flags, void (*ctor)(void *)) 3618 unsigned long flags, void (*ctor)(void *))
@@ -4604,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf)
4604static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4529static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4605 size_t length) 4530 size_t length)
4606{ 4531{
4532 /*
4533 * Tracing a merged cache is going to give confusing results
4534 * as well as cause other issues like converting a mergeable
 4535 * cache into an unmergeable one.
4536 */
4537 if (s->refcount > 1)
4538 return -EINVAL;
4539
4607 s->flags &= ~SLAB_TRACE; 4540 s->flags &= ~SLAB_TRACE;
4608 if (buf[0] == '1') { 4541 if (buf[0] == '1') {
4609 s->flags &= ~__CMPXCHG_DOUBLE; 4542 s->flags &= ~__CMPXCHG_DOUBLE;
@@ -4721,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4721static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4654static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4722 size_t length) 4655 size_t length)
4723{ 4656{
4657 if (s->refcount > 1)
4658 return -EINVAL;
4659
4724 s->flags &= ~SLAB_FAILSLAB; 4660 s->flags &= ~SLAB_FAILSLAB;
4725 if (buf[0] == '1') 4661 if (buf[0] == '1')
4726 s->flags |= SLAB_FAILSLAB; 4662 s->flags |= SLAB_FAILSLAB;
diff --git a/mm/swap.c b/mm/swap.c
index 6b2dc3897cd5..8a12b33936b4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -887,18 +887,14 @@ void lru_add_drain_all(void)
887 mutex_unlock(&lock); 887 mutex_unlock(&lock);
888} 888}
889 889
890/* 890/**
891 * Batched page_cache_release(). Decrement the reference count on all the 891 * release_pages - batched page_cache_release()
892 * passed pages. If it fell to zero then remove the page from the LRU and 892 * @pages: array of pages to release
893 * free it. 893 * @nr: number of pages
894 * 894 * @cold: whether the pages are cache cold
895 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
896 * for the remainder of the operation.
897 * 895 *
898 * The locking in this function is against shrink_inactive_list(): we recheck 896 * Decrement the reference count on all the pages in @pages. If it
899 * the page count inside the lock to see whether shrink_inactive_list() 897 * fell to zero, remove the page from the LRU and free it.
900 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
901 * will free it.
902 */ 898 */
903void release_pages(struct page **pages, int nr, bool cold) 899void release_pages(struct page **pages, int nr, bool cold)
904{ 900{
@@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold)
907 struct zone *zone = NULL; 903 struct zone *zone = NULL;
908 struct lruvec *lruvec; 904 struct lruvec *lruvec;
909 unsigned long uninitialized_var(flags); 905 unsigned long uninitialized_var(flags);
906 unsigned int uninitialized_var(lock_batch);
910 907
911 for (i = 0; i < nr; i++) { 908 for (i = 0; i < nr; i++) {
912 struct page *page = pages[i]; 909 struct page *page = pages[i];
@@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold)
920 continue; 917 continue;
921 } 918 }
922 919
920 /*
921 * Make sure the IRQ-safe lock-holding time does not get
922 * excessive with a continuous string of pages from the
923 * same zone. The lock is held only if zone != NULL.
924 */
925 if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
926 spin_unlock_irqrestore(&zone->lru_lock, flags);
927 zone = NULL;
928 }
929
923 if (!put_page_testzero(page)) 930 if (!put_page_testzero(page))
924 continue; 931 continue;
925 932
@@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold)
930 if (zone) 937 if (zone)
931 spin_unlock_irqrestore(&zone->lru_lock, 938 spin_unlock_irqrestore(&zone->lru_lock,
932 flags); 939 flags);
940 lock_batch = 0;
933 zone = pagezone; 941 zone = pagezone;
934 spin_lock_irqsave(&zone->lru_lock, flags); 942 spin_lock_irqsave(&zone->lru_lock, flags);
935 } 943 }
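The release_pages() change above caps how long the IRQ-disabled zone->lru_lock is held: after SWAP_CLUSTER_MAX consecutive pages from the same zone the lock is dropped and retaken, and the batch counter restarts whenever the zone changes. A stripped-down sketch of the same batching shape, where struct item and item_lock() are hypothetical placeholders:

static void drain_items(struct item **items, int nr)
{
        spinlock_t *locked = NULL;
        unsigned long flags;
        unsigned int batch = 0;
        int i;

        for (i = 0; i < nr; i++) {
                spinlock_t *lock = item_lock(items[i]);

                /* Don't hold any one lock for too long a run of items. */
                if (locked && ++batch == SWAP_CLUSTER_MAX) {
                        spin_unlock_irqrestore(locked, flags);
                        locked = NULL;
                }
                if (lock != locked) {
                        if (locked)
                                spin_unlock_irqrestore(locked, flags);
                        batch = 0;
                        locked = lock;
                        spin_lock_irqsave(locked, flags);
                }
                /* ... operate on items[i] under 'locked' ... */
        }
        if (locked)
                spin_unlock_irqrestore(locked, flags);
}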
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3e0ec83d000c..154444918685 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -28,7 +28,9 @@
28static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
29 .writepage = swap_writepage, 29 .writepage = swap_writepage,
30 .set_page_dirty = swap_set_page_dirty, 30 .set_page_dirty = swap_set_page_dirty,
31#ifdef CONFIG_MIGRATION
31 .migratepage = migrate_page, 32 .migratepage = migrate_page,
33#endif
32}; 34};
33 35
34static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
@@ -263,18 +265,12 @@ void free_page_and_swap_cache(struct page *page)
263void free_pages_and_swap_cache(struct page **pages, int nr) 265void free_pages_and_swap_cache(struct page **pages, int nr)
264{ 266{
265 struct page **pagep = pages; 267 struct page **pagep = pages;
268 int i;
266 269
267 lru_add_drain(); 270 lru_add_drain();
268 while (nr) { 271 for (i = 0; i < nr; i++)
269 int todo = min(nr, PAGEVEC_SIZE); 272 free_swap_cache(pagep[i]);
270 int i; 273 release_pages(pagep, nr, false);
271
272 for (i = 0; i < todo; i++)
273 free_swap_cache(pagep[i]);
274 release_pages(pagep, todo, false);
275 pagep += todo;
276 nr -= todo;
277 }
278} 274}
279 275
280/* 276/*
diff --git a/mm/util.c b/mm/util.c
index 093c973f1697..fec39d4509a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -170,32 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t,
170/* 170/*
171 * Check if the vma is being used as a stack. 171 * Check if the vma is being used as a stack.
172 * If is_group is non-zero, check in the entire thread group or else 172 * If is_group is non-zero, check in the entire thread group or else
173 * just check in the current task. Returns the pid of the task that 173 * just check in the current task. Returns the task_struct of the task
174 * the vma is stack for. 174 * that the vma is stack for. Must be called under rcu_read_lock().
175 */ 175 */
176pid_t vm_is_stack(struct task_struct *task, 176struct task_struct *task_of_stack(struct task_struct *task,
177 struct vm_area_struct *vma, int in_group) 177 struct vm_area_struct *vma, bool in_group)
178{ 178{
179 pid_t ret = 0;
180
181 if (vm_is_stack_for_task(task, vma)) 179 if (vm_is_stack_for_task(task, vma))
182 return task->pid; 180 return task;
183 181
184 if (in_group) { 182 if (in_group) {
185 struct task_struct *t; 183 struct task_struct *t;
186 184
187 rcu_read_lock();
188 for_each_thread(task, t) { 185 for_each_thread(task, t) {
189 if (vm_is_stack_for_task(t, vma)) { 186 if (vm_is_stack_for_task(t, vma))
190 ret = t->pid; 187 return t;
191 goto done;
192 }
193 } 188 }
194done:
195 rcu_read_unlock();
196 } 189 }
197 190
198 return ret; 191 return NULL;
199} 192}
200 193
201#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 194#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
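task_of_stack() now returns a task_struct pointer rather than a pid and no longer takes the RCU read lock itself, so callers must hold rcu_read_lock() across the call and across any use of the returned task. A hedged caller sketch (the helper name is ours):

static pid_t stack_owner_pid(struct task_struct *task,
                             struct vm_area_struct *vma)
{
        struct task_struct *t;
        pid_t pid = 0;

        rcu_read_lock();
        t = task_of_stack(task, vma, true);     /* search the whole thread group */
        if (t)
                pid = t->pid;
        rcu_read_unlock();

        return pid;
}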
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2b0aa5486092..90520af7f186 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = {
2646 2646
2647static int vmalloc_open(struct inode *inode, struct file *file) 2647static int vmalloc_open(struct inode *inode, struct file *file)
2648{ 2648{
2649 unsigned int *ptr = NULL; 2649 if (IS_ENABLED(CONFIG_NUMA))
2650 int ret; 2650 return seq_open_private(file, &vmalloc_op,
2651 2651 nr_node_ids * sizeof(unsigned int));
2652 if (IS_ENABLED(CONFIG_NUMA)) { 2652 else
2653 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2653 return seq_open(file, &vmalloc_op);
2654 if (ptr == NULL)
2655 return -ENOMEM;
2656 }
2657 ret = seq_open(file, &vmalloc_op);
2658 if (!ret) {
2659 struct seq_file *m = file->private_data;
2660 m->private = ptr;
2661 } else
2662 kfree(ptr);
2663 return ret;
2664} 2654}
2665 2655
2666static const struct file_operations proc_vmalloc_operations = { 2656static const struct file_operations proc_vmalloc_operations = {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2836b5373b2e..dcb47074ae03 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
920 /* Case 1 above */ 920 /* Case 1 above */
921 if (current_is_kswapd() && 921 if (current_is_kswapd() &&
922 PageReclaim(page) && 922 PageReclaim(page) &&
923 zone_is_reclaim_writeback(zone)) { 923 test_bit(ZONE_WRITEBACK, &zone->flags)) {
924 nr_immediate++; 924 nr_immediate++;
925 goto keep_locked; 925 goto keep_locked;
926 926
@@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1002 */ 1002 */
1003 if (page_is_file_cache(page) && 1003 if (page_is_file_cache(page) &&
1004 (!current_is_kswapd() || 1004 (!current_is_kswapd() ||
1005 !zone_is_reclaim_dirty(zone))) { 1005 !test_bit(ZONE_DIRTY, &zone->flags))) {
1006 /* 1006 /*
1007 * Immediately reclaim when written back. 1007 * Immediately reclaim when written back.
1008 * Similar in principal to deactivate_page() 1008 * Similar in principal to deactivate_page()
@@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1563 * are encountered in the nr_immediate check below. 1563 * are encountered in the nr_immediate check below.
1564 */ 1564 */
1565 if (nr_writeback && nr_writeback == nr_taken) 1565 if (nr_writeback && nr_writeback == nr_taken)
1566 zone_set_flag(zone, ZONE_WRITEBACK); 1566 set_bit(ZONE_WRITEBACK, &zone->flags);
1567 1567
1568 /* 1568 /*
1569 * memcg will stall in page writeback so only consider forcibly 1569 * memcg will stall in page writeback so only consider forcibly
@@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1575 * backed by a congested BDI and wait_iff_congested will stall. 1575 * backed by a congested BDI and wait_iff_congested will stall.
1576 */ 1576 */
1577 if (nr_dirty && nr_dirty == nr_congested) 1577 if (nr_dirty && nr_dirty == nr_congested)
1578 zone_set_flag(zone, ZONE_CONGESTED); 1578 set_bit(ZONE_CONGESTED, &zone->flags);
1579 1579
1580 /* 1580 /*
1581 * If dirty pages are scanned that are not queued for IO, it 1581 * If dirty pages are scanned that are not queued for IO, it
1582 * implies that flushers are not keeping up. In this case, flag 1582 * implies that flushers are not keeping up. In this case, flag
1583 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing 1583 * the zone ZONE_DIRTY and kswapd will start writing pages from
1584 * pages from reclaim context. 1584 * reclaim context.
1585 */ 1585 */
1586 if (nr_unqueued_dirty == nr_taken) 1586 if (nr_unqueued_dirty == nr_taken)
1587 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); 1587 set_bit(ZONE_DIRTY, &zone->flags);
1588 1588
1589 /* 1589 /*
1590 * If kswapd scans pages marked marked for immediate 1590 * If kswapd scans pages marked marked for immediate
@@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2315 return reclaimable; 2315 return reclaimable;
2316} 2316}
2317 2317
2318/* Returns true if compaction should go ahead for a high-order request */ 2318/*
2319 * Returns true if compaction should go ahead for a high-order request, or
2320 * the high-order allocation would succeed without compaction.
2321 */
2319static inline bool compaction_ready(struct zone *zone, int order) 2322static inline bool compaction_ready(struct zone *zone, int order)
2320{ 2323{
2321 unsigned long balance_gap, watermark; 2324 unsigned long balance_gap, watermark;
@@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order)
2339 if (compaction_deferred(zone, order)) 2342 if (compaction_deferred(zone, order))
2340 return watermark_ok; 2343 return watermark_ok;
2341 2344
2342 /* If compaction is not ready to start, keep reclaiming */ 2345 /*
2343 if (!compaction_suitable(zone, order)) 2346 * If compaction is not ready to start and allocation is not likely
2347 * to succeed without it, then keep reclaiming.
2348 */
2349 if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
2344 return false; 2350 return false;
2345 2351
2346 return watermark_ok; 2352 return watermark_ok;
@@ -2753,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2753} 2759}
2754 2760
2755unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 2761unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2762 unsigned long nr_pages,
2756 gfp_t gfp_mask, 2763 gfp_t gfp_mask,
2757 bool noswap) 2764 bool may_swap)
2758{ 2765{
2759 struct zonelist *zonelist; 2766 struct zonelist *zonelist;
2760 unsigned long nr_reclaimed; 2767 unsigned long nr_reclaimed;
2761 int nid; 2768 int nid;
2762 struct scan_control sc = { 2769 struct scan_control sc = {
2763 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2770 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
2764 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2771 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2765 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2772 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2766 .target_mem_cgroup = memcg, 2773 .target_mem_cgroup = memcg,
2767 .priority = DEF_PRIORITY, 2774 .priority = DEF_PRIORITY,
2768 .may_writepage = !laptop_mode, 2775 .may_writepage = !laptop_mode,
2769 .may_unmap = 1, 2776 .may_unmap = 1,
2770 .may_swap = !noswap, 2777 .may_swap = may_swap,
2771 }; 2778 };
2772 2779
2773 /* 2780 /*
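try_to_free_mem_cgroup_pages() gains an explicit nr_pages target (clamped to at least SWAP_CLUSTER_MAX) and a may_swap flag that replaces the old inverted noswap argument. A hypothetical caller asking for up to 64 pages from a group while allowing swap:

static unsigned long shrink_group(struct mem_cgroup *memcg)
{
        /* Reclaim up to 64 pages; anon pages may be swapped out. */
        return try_to_free_mem_cgroup_pages(memcg, 64, GFP_KERNEL, true);
}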
@@ -2818,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order,
2818 return false; 2825 return false;
2819 2826
2820 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2827 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2821 !compaction_suitable(zone, order)) 2828 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2822 return false; 2829 return false;
2823 2830
2824 return true; 2831 return true;
@@ -2978,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2978 /* Account for the number of pages attempted to reclaim */ 2985 /* Account for the number of pages attempted to reclaim */
2979 *nr_attempted += sc->nr_to_reclaim; 2986 *nr_attempted += sc->nr_to_reclaim;
2980 2987
2981 zone_clear_flag(zone, ZONE_WRITEBACK); 2988 clear_bit(ZONE_WRITEBACK, &zone->flags);
2982 2989
2983 /* 2990 /*
2984 * If a zone reaches its high watermark, consider it to be no longer 2991 * If a zone reaches its high watermark, consider it to be no longer
@@ -2988,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2988 */ 2995 */
2989 if (zone_reclaimable(zone) && 2996 if (zone_reclaimable(zone) &&
2990 zone_balanced(zone, testorder, 0, classzone_idx)) { 2997 zone_balanced(zone, testorder, 0, classzone_idx)) {
2991 zone_clear_flag(zone, ZONE_CONGESTED); 2998 clear_bit(ZONE_CONGESTED, &zone->flags);
2992 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2999 clear_bit(ZONE_DIRTY, &zone->flags);
2993 } 3000 }
2994 3001
2995 return sc->nr_scanned >= sc->nr_to_reclaim; 3002 return sc->nr_scanned >= sc->nr_to_reclaim;
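The vmscan.c hunks in this file replace the zone_set_flag()/zone_clear_flag()/zone_test_and_set_flag() wrappers with plain set_bit(), clear_bit() and test_and_set_bit() on zone->flags, and rename ZONE_TAIL_LRU_DIRTY to ZONE_DIRTY. The dropped wrappers were thin shims over the same bitops, roughly as sketched below, so the conversion is mechanical:

/* Approximate shape of the removed helpers (illustrative, not verbatim): */
static inline void zone_set_flag(struct zone *zone, unsigned int flag)
{
        set_bit(flag, &zone->flags);
}

static inline void zone_clear_flag(struct zone *zone, unsigned int flag)
{
        clear_bit(flag, &zone->flags);
}

static inline int zone_test_and_set_flag(struct zone *zone, unsigned int flag)
{
        return test_and_set_bit(flag, &zone->flags);
}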
@@ -3080,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3080 * If balanced, clear the dirty and congested 3087 * If balanced, clear the dirty and congested
3081 * flags 3088 * flags
3082 */ 3089 */
3083 zone_clear_flag(zone, ZONE_CONGESTED); 3090 clear_bit(ZONE_CONGESTED, &zone->flags);
3084 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 3091 clear_bit(ZONE_DIRTY, &zone->flags);
3085 } 3092 }
3086 } 3093 }
3087 3094
@@ -3708,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3708 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3715 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3709 return ZONE_RECLAIM_NOSCAN; 3716 return ZONE_RECLAIM_NOSCAN;
3710 3717
3711 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3718 if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
3712 return ZONE_RECLAIM_NOSCAN; 3719 return ZONE_RECLAIM_NOSCAN;
3713 3720
3714 ret = __zone_reclaim(zone, gfp_mask, order); 3721 ret = __zone_reclaim(zone, gfp_mask, order);
3715 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3722 clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
3716 3723
3717 if (!ret) 3724 if (!ret)
3718 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3725 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3791,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3791 } 3798 }
3792} 3799}
3793#endif /* CONFIG_SHMEM */ 3800#endif /* CONFIG_SHMEM */
3794
3795static void warn_scan_unevictable_pages(void)
3796{
3797 printk_once(KERN_WARNING
3798 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3799 "disabled for lack of a legitimate use case. If you have "
3800 "one, please send an email to linux-mm@kvack.org.\n",
3801 current->comm);
3802}
3803
3804/*
3805 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
3806 * all nodes' unevictable lists for evictable pages
3807 */
3808unsigned long scan_unevictable_pages;
3809
3810int scan_unevictable_handler(struct ctl_table *table, int write,
3811 void __user *buffer,
3812 size_t *length, loff_t *ppos)
3813{
3814 warn_scan_unevictable_pages();
3815 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3816 scan_unevictable_pages = 0;
3817 return 0;
3818}
3819
3820#ifdef CONFIG_NUMA
3821/*
3822 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
3823 * a specified node's per zone unevictable lists for evictable pages.
3824 */
3825
3826static ssize_t read_scan_unevictable_node(struct device *dev,
3827 struct device_attribute *attr,
3828 char *buf)
3829{
3830 warn_scan_unevictable_pages();
3831 return sprintf(buf, "0\n"); /* always zero; should fit... */
3832}
3833
3834static ssize_t write_scan_unevictable_node(struct device *dev,
3835 struct device_attribute *attr,
3836 const char *buf, size_t count)
3837{
3838 warn_scan_unevictable_pages();
3839 return 1;
3840}
3841
3842
3843static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3844 read_scan_unevictable_node,
3845 write_scan_unevictable_node);
3846
3847int scan_unevictable_register_node(struct node *node)
3848{
3849 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3850}
3851
3852void scan_unevictable_unregister_node(struct node *node)
3853{
3854 device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3855}
3856#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e9ab104b956f..1b12d390dc68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,6 +7,7 @@
7 * zoned VM statistics 7 * zoned VM statistics
8 * Copyright (C) 2006 Silicon Graphics, Inc., 8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com> 9 * Christoph Lameter <christoph@lameter.com>
10 * Copyright (C) 2008-2014 Christoph Lameter
10 */ 11 */
11#include <linux/fs.h> 12#include <linux/fs.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
@@ -14,6 +15,7 @@
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/cpumask.h>
17#include <linux/vmstat.h> 19#include <linux/vmstat.h>
18#include <linux/sched.h> 20#include <linux/sched.h>
19#include <linux/math64.h> 21#include <linux/math64.h>
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
419EXPORT_SYMBOL(dec_zone_page_state); 421EXPORT_SYMBOL(dec_zone_page_state);
420#endif 422#endif
421 423
422static inline void fold_diff(int *diff) 424
425/*
426 * Fold a differential into the global counters.
427 * Returns the number of counters updated.
428 */
429static int fold_diff(int *diff)
423{ 430{
424 int i; 431 int i;
432 int changes = 0;
425 433
426 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 434 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
427 if (diff[i]) 435 if (diff[i]) {
428 atomic_long_add(diff[i], &vm_stat[i]); 436 atomic_long_add(diff[i], &vm_stat[i]);
437 changes++;
438 }
439 return changes;
429} 440}
430 441
431/* 442/*
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff)
441 * statistics in the remote zone struct as well as the global cachelines 452 * statistics in the remote zone struct as well as the global cachelines
442 * with the global counters. These could cause remote node cache line 453 * with the global counters. These could cause remote node cache line
443 * bouncing and will have to be only done when necessary. 454 * bouncing and will have to be only done when necessary.
455 *
456 * The function returns the number of global counters updated.
444 */ 457 */
445static void refresh_cpu_vm_stats(void) 458static int refresh_cpu_vm_stats(void)
446{ 459{
447 struct zone *zone; 460 struct zone *zone;
448 int i; 461 int i;
449 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 462 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
463 int changes = 0;
450 464
451 for_each_populated_zone(zone) { 465 for_each_populated_zone(zone) {
452 struct per_cpu_pageset __percpu *p = zone->pageset; 466 struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void)
486 continue; 500 continue;
487 } 501 }
488 502
489
490 if (__this_cpu_dec_return(p->expire)) 503 if (__this_cpu_dec_return(p->expire))
491 continue; 504 continue;
492 505
493 if (__this_cpu_read(p->pcp.count)) 506 if (__this_cpu_read(p->pcp.count)) {
494 drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); 507 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
508 changes++;
509 }
495#endif 510#endif
496 } 511 }
497 fold_diff(global_diff); 512 changes += fold_diff(global_diff);
513 return changes;
498} 514}
499 515
500/* 516/*
@@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
735 TEXT_FOR_HIGHMEM(xx) xx "_movable", 751 TEXT_FOR_HIGHMEM(xx) xx "_movable",
736 752
737const char * const vmstat_text[] = { 753const char * const vmstat_text[] = {
 738 /* Zoned VM counters */ 754 /* enum zone_stat_item counters */
739 "nr_free_pages", 755 "nr_free_pages",
740 "nr_alloc_batch", 756 "nr_alloc_batch",
741 "nr_inactive_anon", 757 "nr_inactive_anon",
@@ -778,10 +794,13 @@ const char * const vmstat_text[] = {
778 "workingset_nodereclaim", 794 "workingset_nodereclaim",
779 "nr_anon_transparent_hugepages", 795 "nr_anon_transparent_hugepages",
780 "nr_free_cma", 796 "nr_free_cma",
797
798 /* enum writeback_stat_item counters */
781 "nr_dirty_threshold", 799 "nr_dirty_threshold",
782 "nr_dirty_background_threshold", 800 "nr_dirty_background_threshold",
783 801
784#ifdef CONFIG_VM_EVENT_COUNTERS 802#ifdef CONFIG_VM_EVENT_COUNTERS
803 /* enum vm_event_item counters */
785 "pgpgin", 804 "pgpgin",
786 "pgpgout", 805 "pgpgout",
787 "pswpin", 806 "pswpin",
@@ -860,6 +879,13 @@ const char * const vmstat_text[] = {
860 "thp_zero_page_alloc", 879 "thp_zero_page_alloc",
861 "thp_zero_page_alloc_failed", 880 "thp_zero_page_alloc_failed",
862#endif 881#endif
882#ifdef CONFIG_MEMORY_BALLOON
883 "balloon_inflate",
884 "balloon_deflate",
885#ifdef CONFIG_BALLOON_COMPACTION
886 "balloon_migrate",
887#endif
888#endif /* CONFIG_MEMORY_BALLOON */
863#ifdef CONFIG_DEBUG_TLBFLUSH 889#ifdef CONFIG_DEBUG_TLBFLUSH
864#ifdef CONFIG_SMP 890#ifdef CONFIG_SMP
865 "nr_tlb_remote_flush", 891 "nr_tlb_remote_flush",
@@ -1229,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = {
1229#ifdef CONFIG_SMP 1255#ifdef CONFIG_SMP
1230static DEFINE_PER_CPU(struct delayed_work, vmstat_work); 1256static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1231int sysctl_stat_interval __read_mostly = HZ; 1257int sysctl_stat_interval __read_mostly = HZ;
1258static cpumask_var_t cpu_stat_off;
1232 1259
1233static void vmstat_update(struct work_struct *w) 1260static void vmstat_update(struct work_struct *w)
1234{ 1261{
1235 refresh_cpu_vm_stats(); 1262 if (refresh_cpu_vm_stats())
1236 schedule_delayed_work(this_cpu_ptr(&vmstat_work), 1263 /*
1264 * Counters were updated so we expect more updates
1265 * to occur in the future. Keep on running the
1266 * update worker thread.
1267 */
1268 schedule_delayed_work(this_cpu_ptr(&vmstat_work),
1269 round_jiffies_relative(sysctl_stat_interval));
1270 else {
1271 /*
1272 * We did not update any counters so the app may be in
1273 * a mode where it does not cause counter updates.
1274 * We may be uselessly running vmstat_update.
1275 * Defer the checking for differentials to the
1276 * shepherd thread on a different processor.
1277 */
1278 int r;
1279 /*
1280 * Shepherd work thread does not race since it never
 1281 * changes the bit if it is zero, but the cpu
 1282 * online / offline code may race if
1283 * worker threads are still allowed during
1284 * shutdown / startup.
1285 */
1286 r = cpumask_test_and_set_cpu(smp_processor_id(),
1287 cpu_stat_off);
1288 VM_BUG_ON(r);
1289 }
1290}
1291
1292/*
1293 * Check if the diffs for a certain cpu indicate that
1294 * an update is needed.
1295 */
1296static bool need_update(int cpu)
1297{
1298 struct zone *zone;
1299
1300 for_each_populated_zone(zone) {
1301 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1302
1303 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
1304 /*
1305 * The fast way of checking if there are any vmstat diffs.
1306 * This works because the diffs are byte sized items.
1307 */
1308 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
1309 return true;
1310
1311 }
1312 return false;
1313}
1314
1315
1316/*
1317 * Shepherd worker thread that checks the
1318 * differentials of processors that have their worker
1319 * threads for vm statistics updates disabled because of
1320 * inactivity.
1321 */
1322static void vmstat_shepherd(struct work_struct *w);
1323
1324static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
1325
1326static void vmstat_shepherd(struct work_struct *w)
1327{
1328 int cpu;
1329
1330 get_online_cpus();
1331 /* Check processors whose vmstat worker threads have been disabled */
1332 for_each_cpu(cpu, cpu_stat_off)
1333 if (need_update(cpu) &&
1334 cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
1335
1336 schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
1337 __round_jiffies_relative(sysctl_stat_interval, cpu));
1338
1339 put_online_cpus();
1340
1341 schedule_delayed_work(&shepherd,
1237 round_jiffies_relative(sysctl_stat_interval)); 1342 round_jiffies_relative(sysctl_stat_interval));
1343
1238} 1344}
1239 1345
1240static void start_cpu_timer(int cpu) 1346static void __init start_shepherd_timer(void)
1241{ 1347{
1242 struct delayed_work *work = &per_cpu(vmstat_work, cpu); 1348 int cpu;
1349
1350 for_each_possible_cpu(cpu)
1351 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
1352 vmstat_update);
1353
1354 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
1355 BUG();
1356 cpumask_copy(cpu_stat_off, cpu_online_mask);
1243 1357
1244 INIT_DEFERRABLE_WORK(work, vmstat_update); 1358 schedule_delayed_work(&shepherd,
1245 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1359 round_jiffies_relative(sysctl_stat_interval));
1246} 1360}
1247 1361
1248static void vmstat_cpu_dead(int node) 1362static void vmstat_cpu_dead(int node)
@@ -1273,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
1273 case CPU_ONLINE: 1387 case CPU_ONLINE:
1274 case CPU_ONLINE_FROZEN: 1388 case CPU_ONLINE_FROZEN:
1275 refresh_zone_stat_thresholds(); 1389 refresh_zone_stat_thresholds();
1276 start_cpu_timer(cpu);
1277 node_set_state(cpu_to_node(cpu), N_CPU); 1390 node_set_state(cpu_to_node(cpu), N_CPU);
1391 cpumask_set_cpu(cpu, cpu_stat_off);
1278 break; 1392 break;
1279 case CPU_DOWN_PREPARE: 1393 case CPU_DOWN_PREPARE:
1280 case CPU_DOWN_PREPARE_FROZEN: 1394 case CPU_DOWN_PREPARE_FROZEN:
1281 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); 1395 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1282 per_cpu(vmstat_work, cpu).work.func = NULL; 1396 cpumask_clear_cpu(cpu, cpu_stat_off);
1283 break; 1397 break;
1284 case CPU_DOWN_FAILED: 1398 case CPU_DOWN_FAILED:
1285 case CPU_DOWN_FAILED_FROZEN: 1399 case CPU_DOWN_FAILED_FROZEN:
1286 start_cpu_timer(cpu); 1400 cpumask_set_cpu(cpu, cpu_stat_off);
1287 break; 1401 break;
1288 case CPU_DEAD: 1402 case CPU_DEAD:
1289 case CPU_DEAD_FROZEN: 1403 case CPU_DEAD_FROZEN:
@@ -1303,15 +1417,10 @@ static struct notifier_block vmstat_notifier =
1303static int __init setup_vmstat(void) 1417static int __init setup_vmstat(void)
1304{ 1418{
1305#ifdef CONFIG_SMP 1419#ifdef CONFIG_SMP
1306 int cpu;
1307
1308 cpu_notifier_register_begin(); 1420 cpu_notifier_register_begin();
1309 __register_cpu_notifier(&vmstat_notifier); 1421 __register_cpu_notifier(&vmstat_notifier);
1310 1422
1311 for_each_online_cpu(cpu) { 1423 start_shepherd_timer();
1312 start_cpu_timer(cpu);
1313 node_set_state(cpu_to_node(cpu), N_CPU);
1314 }
1315 cpu_notifier_register_done(); 1424 cpu_notifier_register_done();
1316#endif 1425#endif
1317#ifdef CONFIG_PROC_FS 1426#ifdef CONFIG_PROC_FS
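The vmstat rework above stops running an update worker on quiet CPUs: vmstat_update() parks its CPU in cpu_stat_off once refresh_cpu_vm_stats() finds nothing to fold, and the single shepherd work later re-arms only CPUs whose per-cpu, byte-sized diff arrays have gone non-zero, which need_update() detects with memchr_inv(). A tiny sketch of that detection idiom (names are ours):

static bool diffs_pending(const s8 *diff, size_t nr)
{
        /* memchr_inv() returns the first byte that differs from 0, or NULL. */
        return memchr_inv(diff, 0, nr) != NULL;
}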
diff --git a/mm/zbud.c b/mm/zbud.c
index a05790b1915e..ecf1dbef6983 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -60,15 +60,17 @@
60 * NCHUNKS_ORDER determines the internal allocation granularity, effectively 60 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
61 * adjusting internal fragmentation. It also determines the number of 61 * adjusting internal fragmentation. It also determines the number of
62 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the 62 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
63 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there 63 * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
64 * will be 64 freelists per pool. 64 * in an allocated page is occupied by the zbud header, NCHUNKS works out to
65 * 63, the maximum number of free chunks in a zbud page; there will also be
66 * 63 freelists per pool.
65 */ 67 */
66#define NCHUNKS_ORDER 6 68#define NCHUNKS_ORDER 6
67 69
68#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) 70#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
69#define CHUNK_SIZE (1 << CHUNK_SHIFT) 71#define CHUNK_SIZE (1 << CHUNK_SHIFT)
70#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
71#define ZHDR_SIZE_ALIGNED CHUNK_SIZE 72#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
73#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
72 74
73/** 75/**
74 * struct zbud_pool - stores metadata for each zbud pool 76 * struct zbud_pool - stores metadata for each zbud pool
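The NCHUNKS change above accounts for the chunk consumed by the zbud header in the constant itself instead of subtracting 1 inside num_free_chunks(). Working the definitions through for 4 KiB pages:

        CHUNK_SHIFT       = PAGE_SHIFT - NCHUNKS_ORDER = 12 - 6 = 6
        CHUNK_SIZE        = 1 << 6 = 64 bytes
        ZHDR_SIZE_ALIGNED = CHUNK_SIZE = 64 bytes   (the header fills one chunk)
        NCHUNKS           = (4096 - 64) >> 6 = 63 free chunks per zbud page

so num_free_chunks() = NCHUNKS - first_chunks - last_chunks yields the same values as the old "NCHUNKS - ... - 1" form.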
@@ -195,6 +197,7 @@ static struct zpool_driver zbud_zpool_driver = {
195 .total_size = zbud_zpool_total_size, 197 .total_size = zbud_zpool_total_size,
196}; 198};
197 199
200MODULE_ALIAS("zpool-zbud");
198#endif /* CONFIG_ZPOOL */ 201#endif /* CONFIG_ZPOOL */
199 202
200/***************** 203/*****************
@@ -267,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr)
267{ 270{
268 /* 271 /*
269 * Rather than branch for different situations, just use the fact that 272 * Rather than branch for different situations, just use the fact that
270 * free buddies have a length of zero to simplify everything. -1 at the 273 * free buddies have a length of zero to simplify everything.
271 * end for the zbud header.
272 */ 274 */
273 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; 275 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
274} 276}
275 277
276/***************** 278/*****************
diff --git a/mm/zpool.c b/mm/zpool.c
index e40612a1df00..739cdf0d183a 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -150,7 +150,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
150 driver = zpool_get_driver(type); 150 driver = zpool_get_driver(type);
151 151
152 if (!driver) { 152 if (!driver) {
153 request_module(type); 153 request_module("zpool-%s", type);
154 driver = zpool_get_driver(type); 154 driver = zpool_get_driver(type);
155 } 155 }
156 156
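The request_module("zpool-%s", type) call pairs with the MODULE_ALIAS("zpool-zbud") and MODULE_ALIAS("zpool-zsmalloc") lines added elsewhere in this series: asking zpool for type "zbud" now loads the module that declares the matching alias rather than a module literally named "zbud". A hypothetical extra backend would follow the same pattern (only the alias line matters here; the ops are elided on purpose):

static struct zpool_driver zfoo_zpool_driver = {
        .type = "zfoo",
        /* .create/.destroy/.malloc/.free/... as in the zbud and zsmalloc drivers */
};

MODULE_ALIAS("zpool-zfoo");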
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 4e2fc83cb394..839a48c3ca27 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -175,7 +175,7 @@ enum fullness_group {
175 * n <= N / f, where 175 * n <= N / f, where
176 * n = number of allocated objects 176 * n = number of allocated objects
177 * N = total number of objects zspage can store 177 * N = total number of objects zspage can store
178 * f = 1/fullness_threshold_frac 178 * f = fullness_threshold_frac
179 * 179 *
180 * Similarly, we assign zspage to: 180 * Similarly, we assign zspage to:
181 * ZS_ALMOST_FULL when n > N / f 181 * ZS_ALMOST_FULL when n > N / f
@@ -199,9 +199,6 @@ struct size_class {
199 199
200 spinlock_t lock; 200 spinlock_t lock;
201 201
202 /* stats */
203 u64 pages_allocated;
204
205 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 202 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
206}; 203};
207 204
@@ -220,6 +217,7 @@ struct zs_pool {
220 struct size_class size_class[ZS_SIZE_CLASSES]; 217 struct size_class size_class[ZS_SIZE_CLASSES];
221 218
222 gfp_t flags; /* allocation flags used when growing pool */ 219 gfp_t flags; /* allocation flags used when growing pool */
220 atomic_long_t pages_allocated;
223}; 221};
224 222
225/* 223/*
@@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
299 297
300static u64 zs_zpool_total_size(void *pool) 298static u64 zs_zpool_total_size(void *pool)
301{ 299{
302 return zs_get_total_size_bytes(pool); 300 return zs_get_total_pages(pool) << PAGE_SHIFT;
303} 301}
304 302
305static struct zpool_driver zs_zpool_driver = { 303static struct zpool_driver zs_zpool_driver = {
@@ -315,6 +313,7 @@ static struct zpool_driver zs_zpool_driver = {
315 .total_size = zs_zpool_total_size, 313 .total_size = zs_zpool_total_size,
316}; 314};
317 315
316MODULE_ALIAS("zpool-zsmalloc");
318#endif /* CONFIG_ZPOOL */ 317#endif /* CONFIG_ZPOOL */
319 318
320/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 319/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
@@ -629,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
629 while (page) { 628 while (page) {
630 struct page *next_page; 629 struct page *next_page;
631 struct link_free *link; 630 struct link_free *link;
632 unsigned int i, objs_on_page; 631 unsigned int i = 1;
633 632
634 /* 633 /*
635 * page->index stores offset of first object starting 634 * page->index stores offset of first object starting
@@ -642,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class)
642 641
643 link = (struct link_free *)kmap_atomic(page) + 642 link = (struct link_free *)kmap_atomic(page) +
644 off / sizeof(*link); 643 off / sizeof(*link);
645 objs_on_page = (PAGE_SIZE - off) / class->size;
646 644
647 for (i = 1; i <= objs_on_page; i++) { 645 while ((off += class->size) < PAGE_SIZE) {
648 off += class->size; 646 link->next = obj_location_to_handle(page, i++);
649 if (off < PAGE_SIZE) { 647 link += class->size / sizeof(*link);
650 link->next = obj_location_to_handle(page, i);
651 link += class->size / sizeof(*link);
652 }
653 } 648 }
654 649
655 /* 650 /*
@@ -661,7 +656,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
661 link->next = obj_location_to_handle(next_page, 0); 656 link->next = obj_location_to_handle(next_page, 0);
662 kunmap_atomic(link); 657 kunmap_atomic(link);
663 page = next_page; 658 page = next_page;
664 off = (off + class->size) % PAGE_SIZE; 659 off %= PAGE_SIZE;
665 } 660 }
666} 661}
667 662
@@ -1027,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1027 return 0; 1022 return 0;
1028 1023
1029 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1024 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1025 atomic_long_add(class->pages_per_zspage,
1026 &pool->pages_allocated);
1030 spin_lock(&class->lock); 1027 spin_lock(&class->lock);
1031 class->pages_allocated += class->pages_per_zspage;
1032 } 1028 }
1033 1029
1034 obj = (unsigned long)first_page->freelist; 1030 obj = (unsigned long)first_page->freelist;
@@ -1081,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1081 1077
1082 first_page->inuse--; 1078 first_page->inuse--;
1083 fullness = fix_fullness_group(pool, first_page); 1079 fullness = fix_fullness_group(pool, first_page);
1084
1085 if (fullness == ZS_EMPTY)
1086 class->pages_allocated -= class->pages_per_zspage;
1087
1088 spin_unlock(&class->lock); 1080 spin_unlock(&class->lock);
1089 1081
1090 if (fullness == ZS_EMPTY) 1082 if (fullness == ZS_EMPTY) {
1083 atomic_long_sub(class->pages_per_zspage,
1084 &pool->pages_allocated);
1091 free_zspage(first_page); 1085 free_zspage(first_page);
1086 }
1092} 1087}
1093EXPORT_SYMBOL_GPL(zs_free); 1088EXPORT_SYMBOL_GPL(zs_free);
1094 1089
@@ -1182,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1182} 1177}
1183EXPORT_SYMBOL_GPL(zs_unmap_object); 1178EXPORT_SYMBOL_GPL(zs_unmap_object);
1184 1179
1185u64 zs_get_total_size_bytes(struct zs_pool *pool) 1180unsigned long zs_get_total_pages(struct zs_pool *pool)
1186{ 1181{
1187 int i; 1182 return atomic_long_read(&pool->pages_allocated);
1188 u64 npages = 0;
1189
1190 for (i = 0; i < ZS_SIZE_CLASSES; i++)
1191 npages += pool->size_class[i].pages_allocated;
1192
1193 return npages << PAGE_SHIFT;
1194} 1183}
1195EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); 1184EXPORT_SYMBOL_GPL(zs_get_total_pages);
1196 1185
1197module_init(zs_init); 1186module_init(zs_init);
1198module_exit(zs_exit); 1187module_exit(zs_exit);
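zs_get_total_size_bytes() is replaced by zs_get_total_pages(), backed by one pool-wide atomic_long_t instead of per-size-class counters summed under their locks. A caller that still wants a byte count converts the same way zs_zpool_total_size() does above (sketch, assuming a valid pool):

static u64 zs_pool_size_bytes(struct zs_pool *pool)
{
        return (u64)zs_get_total_pages(pool) << PAGE_SHIFT;
}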