author     Jiri Kosina <jkosina@suse.cz>   2012-10-28 14:28:52 -0400
committer  Jiri Kosina <jkosina@suse.cz>   2012-10-28 14:29:19 -0400
commit     3bd7bf1f0fe14f591c089ae61bbfa9bd356f178a (patch)
tree       0058693cc9e70b7461dae551f8a19aff2efd13ca /mm
parent     f16f84937d769c893492160b1a8c3672e3992beb (diff)
parent     e657e078d3dfa9f96976db7a2b5fd7d7c9f1f1a6 (diff)
Merge branch 'master' into for-next
Sync up with Linus' tree to be able to apply Cesar's patch against newer version of the code.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 3
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/backing-dev.c | 50
-rw-r--r--  mm/bootmem.c | 10
-rw-r--r--  mm/compaction.c | 562
-rw-r--r--  mm/fadvise.c | 34
-rw-r--r--  mm/filemap.c | 6
-rw-r--r--  mm/filemap_xip.c | 10
-rw-r--r--  mm/fremap.c | 19
-rw-r--r--  mm/frontswap.c | 34
-rw-r--r--  mm/huge_memory.c | 441
-rw-r--r--  mm/hugetlb.c | 34
-rw-r--r--  mm/internal.h | 52
-rw-r--r--  mm/interval_tree.c | 112
-rw-r--r--  mm/kmemleak.c | 106
-rw-r--r--  mm/ksm.c | 40
-rw-r--r--  mm/madvise.c | 8
-rw-r--r--  mm/memblock.c | 29
-rw-r--r--  mm/memcontrol.c | 29
-rw-r--r--  mm/memory-failure.c | 8
-rw-r--r--  mm/memory.c | 115
-rw-r--r--  mm/memory_hotplug.c | 77
-rw-r--r--  mm/mempolicy.c | 153
-rw-r--r--  mm/mlock.c | 27
-rw-r--r--  mm/mmap.c | 210
-rw-r--r--  mm/mmu_notifier.c | 89
-rw-r--r--  mm/mremap.c | 73
-rw-r--r--  mm/nobootmem.c | 5
-rw-r--r--  mm/nommu.c | 39
-rw-r--r--  mm/oom_kill.c | 4
-rw-r--r--  mm/page-writeback.c | 14
-rw-r--r--  mm/page_alloc.c | 319
-rw-r--r--  mm/page_isolation.c | 43
-rw-r--r--  mm/percpu.c | 2
-rw-r--r--  mm/pgtable-generic.c | 50
-rw-r--r--  mm/prio_tree.c | 208
-rw-r--r--  mm/readahead.c | 14
-rw-r--r--  mm/rmap.c | 179
-rw-r--r--  mm/shmem.c | 180
-rw-r--r--  mm/slab.c | 350
-rw-r--r--  mm/slab.h | 19
-rw-r--r--  mm/slab_common.c | 162
-rw-r--r--  mm/slob.c | 97
-rw-r--r--  mm/slub.c | 208
-rw-r--r--  mm/swap.c | 13
-rw-r--r--  mm/swapfile.c | 11
-rw-r--r--  mm/truncate.c | 3
-rw-r--r--  mm/util.c | 35
-rw-r--r--  mm/vmalloc.c | 5
-rw-r--r--  mm/vmscan.c | 111
-rw-r--r--  mm/vmstat.c | 16
51 files changed, 2446 insertions, 1976 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
191# support for memory compaction 191# support for memory compaction
192config COMPACTION 192config COMPACTION
193 bool "Allow for memory compaction" 193 bool "Allow for memory compaction"
194 def_bool y
194 select MIGRATION 195 select MIGRATION
195 depends on MMU 196 depends on MMU
196 help 197 help
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
318 319
319config TRANSPARENT_HUGEPAGE 320config TRANSPARENT_HUGEPAGE
320 bool "Transparent Hugepage Support" 321 bool "Transparent Hugepage Support"
321 depends on X86 && MMU 322 depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
322 select COMPACTION 323 select COMPACTION
323 help 324 help
324 Transparent Hugepages allows the kernel to use huge pages and 325 Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
15 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o $(mmu-y) 19 compaction.o interval_tree.o $(mmu-y)
20 20
21obj-y += init-mm.o 21obj-y += init-mm.o
22 22
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b41823cc05e6..d3ca2b3ee176 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -158,16 +158,16 @@ static ssize_t read_ahead_kb_store(struct device *dev,
158 const char *buf, size_t count) 158 const char *buf, size_t count)
159{ 159{
160 struct backing_dev_info *bdi = dev_get_drvdata(dev); 160 struct backing_dev_info *bdi = dev_get_drvdata(dev);
161 char *end;
162 unsigned long read_ahead_kb; 161 unsigned long read_ahead_kb;
163 ssize_t ret = -EINVAL; 162 ssize_t ret;
164 163
165 read_ahead_kb = simple_strtoul(buf, &end, 10); 164 ret = kstrtoul(buf, 10, &read_ahead_kb);
166 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { 165 if (ret < 0)
167 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); 166 return ret;
168 ret = count; 167
169 } 168 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
170 return ret; 169
170 return count;
171} 171}
172 172
173#define K(pages) ((pages) << (PAGE_SHIFT - 10)) 173#define K(pages) ((pages) << (PAGE_SHIFT - 10))
@@ -187,16 +187,17 @@ static ssize_t min_ratio_store(struct device *dev,
187 struct device_attribute *attr, const char *buf, size_t count) 187 struct device_attribute *attr, const char *buf, size_t count)
188{ 188{
189 struct backing_dev_info *bdi = dev_get_drvdata(dev); 189 struct backing_dev_info *bdi = dev_get_drvdata(dev);
190 char *end;
191 unsigned int ratio; 190 unsigned int ratio;
192 ssize_t ret = -EINVAL; 191 ssize_t ret;
192
193 ret = kstrtouint(buf, 10, &ratio);
194 if (ret < 0)
195 return ret;
196
197 ret = bdi_set_min_ratio(bdi, ratio);
198 if (!ret)
199 ret = count;
193 200
194 ratio = simple_strtoul(buf, &end, 10);
195 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
196 ret = bdi_set_min_ratio(bdi, ratio);
197 if (!ret)
198 ret = count;
199 }
200 return ret; 201 return ret;
201} 202}
202BDI_SHOW(min_ratio, bdi->min_ratio) 203BDI_SHOW(min_ratio, bdi->min_ratio)
@@ -205,16 +206,17 @@ static ssize_t max_ratio_store(struct device *dev,
205 struct device_attribute *attr, const char *buf, size_t count) 206 struct device_attribute *attr, const char *buf, size_t count)
206{ 207{
207 struct backing_dev_info *bdi = dev_get_drvdata(dev); 208 struct backing_dev_info *bdi = dev_get_drvdata(dev);
208 char *end;
209 unsigned int ratio; 209 unsigned int ratio;
210 ssize_t ret = -EINVAL; 210 ssize_t ret;
211
212 ret = kstrtouint(buf, 10, &ratio);
213 if (ret < 0)
214 return ret;
215
216 ret = bdi_set_max_ratio(bdi, ratio);
217 if (!ret)
218 ret = count;
211 219
212 ratio = simple_strtoul(buf, &end, 10);
213 if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
214 ret = bdi_set_max_ratio(bdi, ratio);
215 if (!ret)
216 ret = count;
217 }
218 return ret; 220 return ret;
219} 221}
220BDI_SHOW(max_ratio, bdi->max_ratio) 222BDI_SHOW(max_ratio, bdi->max_ratio)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
198 int order = ilog2(BITS_PER_LONG); 198 int order = ilog2(BITS_PER_LONG);
199 199
200 __free_pages_bootmem(pfn_to_page(start), order); 200 __free_pages_bootmem(pfn_to_page(start), order);
201 fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
202 start, start + BITS_PER_LONG);
201 count += BITS_PER_LONG; 203 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 204 start += BITS_PER_LONG;
203 } else { 205 } else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
208 if (vec & 1) { 210 if (vec & 1) {
209 page = pfn_to_page(start + off); 211 page = pfn_to_page(start + off);
210 __free_pages_bootmem(page, 0); 212 __free_pages_bootmem(page, 0);
213 fixup_zone_present_pages(
214 page_to_nid(page),
215 start + off, start + off + 1);
211 count++; 216 count++;
212 } 217 }
213 vec >>= 1; 218 vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
221 pages = bdata->node_low_pfn - bdata->node_min_pfn; 226 pages = bdata->node_low_pfn - bdata->node_min_pfn;
222 pages = bootmem_bootmap_pages(pages); 227 pages = bootmem_bootmap_pages(pages);
223 count += pages; 228 count += pages;
224 while (pages--) 229 while (pages--) {
230 fixup_zone_present_pages(page_to_nid(page),
231 page_to_pfn(page), page_to_pfn(page) + 1);
225 __free_pages_bootmem(page++, 0); 232 __free_pages_bootmem(page++, 0);
233 }
226 234
227 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); 235 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
228 236
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..9eef55838fca 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
51} 51}
52 52
53#ifdef CONFIG_COMPACTION
54/* Returns true if the pageblock should be scanned for pages to isolate. */
55static inline bool isolation_suitable(struct compact_control *cc,
56 struct page *page)
57{
58 if (cc->ignore_skip_hint)
59 return true;
60
61 return !get_pageblock_skip(page);
62}
63
64/*
65 * This function is called to clear all cached information on pageblocks that
66 * should be skipped for page isolation when the migrate and free page scanner
67 * meet.
68 */
69static void __reset_isolation_suitable(struct zone *zone)
70{
71 unsigned long start_pfn = zone->zone_start_pfn;
72 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
73 unsigned long pfn;
74
75 zone->compact_cached_migrate_pfn = start_pfn;
76 zone->compact_cached_free_pfn = end_pfn;
77 zone->compact_blockskip_flush = false;
78
79 /* Walk the zone and mark every pageblock as suitable for isolation */
80 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
81 struct page *page;
82
83 cond_resched();
84
85 if (!pfn_valid(pfn))
86 continue;
87
88 page = pfn_to_page(pfn);
89 if (zone != page_zone(page))
90 continue;
91
92 clear_pageblock_skip(page);
93 }
94}
95
96void reset_isolation_suitable(pg_data_t *pgdat)
97{
98 int zoneid;
99
100 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
101 struct zone *zone = &pgdat->node_zones[zoneid];
102 if (!populated_zone(zone))
103 continue;
104
105 /* Only flush if a full compaction finished recently */
106 if (zone->compact_blockskip_flush)
107 __reset_isolation_suitable(zone);
108 }
109}
110
111/*
112 * If no pages were isolated then mark this pageblock to be skipped in the
113 * future. The information is later cleared by __reset_isolation_suitable().
114 */
115static void update_pageblock_skip(struct compact_control *cc,
116 struct page *page, unsigned long nr_isolated,
117 bool migrate_scanner)
118{
119 struct zone *zone = cc->zone;
120 if (!page)
121 return;
122
123 if (!nr_isolated) {
124 unsigned long pfn = page_to_pfn(page);
125 set_pageblock_skip(page);
126
127 /* Update where compaction should restart */
128 if (migrate_scanner) {
129 if (!cc->finished_update_migrate &&
130 pfn > zone->compact_cached_migrate_pfn)
131 zone->compact_cached_migrate_pfn = pfn;
132 } else {
133 if (!cc->finished_update_free &&
134 pfn < zone->compact_cached_free_pfn)
135 zone->compact_cached_free_pfn = pfn;
136 }
137 }
138}
139#else
140static inline bool isolation_suitable(struct compact_control *cc,
141 struct page *page)
142{
143 return true;
144}
145
146static void update_pageblock_skip(struct compact_control *cc,
147 struct page *page, unsigned long nr_isolated,
148 bool migrate_scanner)
149{
150}
151#endif /* CONFIG_COMPACTION */
152
153static inline bool should_release_lock(spinlock_t *lock)
154{
155 return need_resched() || spin_is_contended(lock);
156}
157
53/* 158/*
54 * Compaction requires the taking of some coarse locks that are potentially 159 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or 160 * very heavily contended. Check if the process needs to be scheduled or
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, 167static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc) 168 bool locked, struct compact_control *cc)
64{ 169{
65 if (need_resched() || spin_is_contended(lock)) { 170 if (should_release_lock(lock)) {
66 if (locked) { 171 if (locked) {
67 spin_unlock_irqrestore(lock, *flags); 172 spin_unlock_irqrestore(lock, *flags);
68 locked = false; 173 locked = false;
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
70 175
71 /* async aborts if taking too long or contended */ 176 /* async aborts if taking too long or contended */
72 if (!cc->sync) { 177 if (!cc->sync) {
73 if (cc->contended) 178 cc->contended = true;
74 *cc->contended = true;
75 return false; 179 return false;
76 } 180 }
77 181
78 cond_resched(); 182 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
81 } 183 }
82 184
83 if (!locked) 185 if (!locked)
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
91 return compact_checklock_irqsave(lock, flags, false, cc); 193 return compact_checklock_irqsave(lock, flags, false, cc);
92} 194}
93 195
196/* Returns true if the page is within a block suitable for migration to */
197static bool suitable_migration_target(struct page *page)
198{
199 int migratetype = get_pageblock_migratetype(page);
200
201 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
202 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
203 return false;
204
205 /* If the page is a large free page, then allow migration */
206 if (PageBuddy(page) && page_order(page) >= pageblock_order)
207 return true;
208
209 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
210 if (migrate_async_suitable(migratetype))
211 return true;
212
213 /* Otherwise skip the block */
214 return false;
215}
216
217static void compact_capture_page(struct compact_control *cc)
218{
219 unsigned long flags;
220 int mtype, mtype_low, mtype_high;
221
222 if (!cc->page || *cc->page)
223 return;
224
225 /*
226 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
227 * regardless of the migratetype of the freelist is is captured from.
228 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 * allocation is typically at least a pageblock size and overall
230 * fragmentation is not impaired. Other allocation types must
231 * capture pages from their own migratelist because otherwise they
232 * could pollute other pageblocks like MIGRATE_MOVABLE with
233 * difficult to move pages and making fragmentation worse overall.
234 */
235 if (cc->migratetype == MIGRATE_MOVABLE) {
236 mtype_low = 0;
237 mtype_high = MIGRATE_PCPTYPES;
238 } else {
239 mtype_low = cc->migratetype;
240 mtype_high = cc->migratetype + 1;
241 }
242
243 /* Speculatively examine the free lists without zone lock */
244 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 int order;
246 for (order = cc->order; order < MAX_ORDER; order++) {
247 struct page *page;
248 struct free_area *area;
249 area = &(cc->zone->free_area[order]);
250 if (list_empty(&area->free_list[mtype]))
251 continue;
252
253 /* Take the lock and attempt capture of the page */
254 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 return;
256 if (!list_empty(&area->free_list[mtype])) {
257 page = list_entry(area->free_list[mtype].next,
258 struct page, lru);
259 if (capture_free_page(page, cc->order, mtype)) {
260 spin_unlock_irqrestore(&cc->zone->lock,
261 flags);
262 *cc->page = page;
263 return;
264 }
265 }
266 spin_unlock_irqrestore(&cc->zone->lock, flags);
267 }
268 }
269}
270
94/* 271/*
95 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 272 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 273 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
97 * pages inside of the pageblock (even though it may still end up isolating 274 * pages inside of the pageblock (even though it may still end up isolating
98 * some pages). 275 * some pages).
99 */ 276 */
100static unsigned long isolate_freepages_block(unsigned long blockpfn, 277static unsigned long isolate_freepages_block(struct compact_control *cc,
278 unsigned long blockpfn,
101 unsigned long end_pfn, 279 unsigned long end_pfn,
102 struct list_head *freelist, 280 struct list_head *freelist,
103 bool strict) 281 bool strict)
104{ 282{
105 int nr_scanned = 0, total_isolated = 0; 283 int nr_scanned = 0, total_isolated = 0;
106 struct page *cursor; 284 struct page *cursor, *valid_page = NULL;
285 unsigned long nr_strict_required = end_pfn - blockpfn;
286 unsigned long flags;
287 bool locked = false;
107 288
108 cursor = pfn_to_page(blockpfn); 289 cursor = pfn_to_page(blockpfn);
109 290
110 /* Isolate free pages. This assumes the block is valid */ 291 /* Isolate free pages. */
111 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 292 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
112 int isolated, i; 293 int isolated, i;
113 struct page *page = cursor; 294 struct page *page = cursor;
114 295
115 if (!pfn_valid_within(blockpfn)) {
116 if (strict)
117 return 0;
118 continue;
119 }
120 nr_scanned++; 296 nr_scanned++;
297 if (!pfn_valid_within(blockpfn))
298 continue;
299 if (!valid_page)
300 valid_page = page;
301 if (!PageBuddy(page))
302 continue;
121 303
122 if (!PageBuddy(page)) { 304 /*
123 if (strict) 305 * The zone lock must be held to isolate freepages.
124 return 0; 306 * Unfortunately this is a very coarse lock and can be
307 * heavily contended if there are parallel allocations
308 * or parallel compactions. For async compaction do not
309 * spin on the lock and we acquire the lock as late as
310 * possible.
311 */
312 locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
313 locked, cc);
314 if (!locked)
315 break;
316
317 /* Recheck this is a suitable migration target under lock */
318 if (!strict && !suitable_migration_target(page))
319 break;
320
321 /* Recheck this is a buddy page under lock */
322 if (!PageBuddy(page))
125 continue; 323 continue;
126 }
127 324
128 /* Found a free page, break it into order-0 pages */ 325 /* Found a free page, break it into order-0 pages */
129 isolated = split_free_page(page); 326 isolated = split_free_page(page);
130 if (!isolated && strict) 327 if (!isolated && strict)
131 return 0; 328 break;
132 total_isolated += isolated; 329 total_isolated += isolated;
133 for (i = 0; i < isolated; i++) { 330 for (i = 0; i < isolated; i++) {
134 list_add(&page->lru, freelist); 331 list_add(&page->lru, freelist);
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
143 } 340 }
144 341
145 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 342 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
343
344 /*
345 * If strict isolation is requested by CMA then check that all the
346 * pages requested were isolated. If there were any failures, 0 is
347 * returned and CMA will fail.
348 */
349 if (strict && nr_strict_required > total_isolated)
350 total_isolated = 0;
351
352 if (locked)
353 spin_unlock_irqrestore(&cc->zone->lock, flags);
354
355 /* Update the pageblock-skip if the whole pageblock was scanned */
356 if (blockpfn == end_pfn)
357 update_pageblock_skip(cc, valid_page, total_isolated, false);
358
146 return total_isolated; 359 return total_isolated;
147} 360}
148 361
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
160 * a free page). 373 * a free page).
161 */ 374 */
162unsigned long 375unsigned long
163isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) 376isolate_freepages_range(struct compact_control *cc,
377 unsigned long start_pfn, unsigned long end_pfn)
164{ 378{
165 unsigned long isolated, pfn, block_end_pfn, flags; 379 unsigned long isolated, pfn, block_end_pfn;
166 struct zone *zone = NULL;
167 LIST_HEAD(freelist); 380 LIST_HEAD(freelist);
168 381
169 if (pfn_valid(start_pfn))
170 zone = page_zone(pfn_to_page(start_pfn));
171
172 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 382 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
173 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) 383 if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
174 break; 384 break;
175 385
176 /* 386 /*
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
180 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 390 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
181 block_end_pfn = min(block_end_pfn, end_pfn); 391 block_end_pfn = min(block_end_pfn, end_pfn);
182 392
183 spin_lock_irqsave(&zone->lock, flags); 393 isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
184 isolated = isolate_freepages_block(pfn, block_end_pfn,
185 &freelist, true); 394 &freelist, true);
186 spin_unlock_irqrestore(&zone->lock, flags);
187 395
188 /* 396 /*
189 * In strict mode, isolate_freepages_block() returns 0 if 397 * In strict mode, isolate_freepages_block() returns 0 if
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
253 * @cc: Compaction control structure. 461 * @cc: Compaction control structure.
254 * @low_pfn: The first PFN of the range. 462 * @low_pfn: The first PFN of the range.
255 * @end_pfn: The one-past-the-last PFN of the range. 463 * @end_pfn: The one-past-the-last PFN of the range.
464 * @unevictable: true if it allows to isolate unevictable pages
256 * 465 *
257 * Isolate all pages that can be migrated from the range specified by 466 * Isolate all pages that can be migrated from the range specified by
258 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 467 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
268 */ 477 */
269unsigned long 478unsigned long
270isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 479isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
271 unsigned long low_pfn, unsigned long end_pfn) 480 unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
272{ 481{
273 unsigned long last_pageblock_nr = 0, pageblock_nr; 482 unsigned long last_pageblock_nr = 0, pageblock_nr;
274 unsigned long nr_scanned = 0, nr_isolated = 0; 483 unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
276 isolate_mode_t mode = 0; 485 isolate_mode_t mode = 0;
277 struct lruvec *lruvec; 486 struct lruvec *lruvec;
278 unsigned long flags; 487 unsigned long flags;
279 bool locked; 488 bool locked = false;
489 struct page *page = NULL, *valid_page = NULL;
280 490
281 /* 491 /*
282 * Ensure that there are not too many pages isolated from the LRU 492 * Ensure that there are not too many pages isolated from the LRU
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
296 506
297 /* Time to isolate some pages for migration */ 507 /* Time to isolate some pages for migration */
298 cond_resched(); 508 cond_resched();
299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
301 for (; low_pfn < end_pfn; low_pfn++) { 509 for (; low_pfn < end_pfn; low_pfn++) {
302 struct page *page;
303
304 /* give a chance to irqs before checking need_resched() */ 510 /* give a chance to irqs before checking need_resched() */
305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 511 if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
306 spin_unlock_irqrestore(&zone->lru_lock, flags); 512 if (should_release_lock(&zone->lru_lock)) {
307 locked = false; 513 spin_unlock_irqrestore(&zone->lru_lock, flags);
514 locked = false;
515 }
308 } 516 }
309 517
310 /* Check if it is ok to still hold the lock */
311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
312 locked, cc);
313 if (!locked)
314 break;
315
316 /* 518 /*
317 * migrate_pfn does not necessarily start aligned to a 519 * migrate_pfn does not necessarily start aligned to a
318 * pageblock. Ensure that pfn_valid is called when moving 520 * pageblock. Ensure that pfn_valid is called when moving
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
340 if (page_zone(page) != zone) 542 if (page_zone(page) != zone)
341 continue; 543 continue;
342 544
545 if (!valid_page)
546 valid_page = page;
547
548 /* If isolation recently failed, do not retry */
549 pageblock_nr = low_pfn >> pageblock_order;
550 if (!isolation_suitable(cc, page))
551 goto next_pageblock;
552
343 /* Skip if free */ 553 /* Skip if free */
344 if (PageBuddy(page)) 554 if (PageBuddy(page))
345 continue; 555 continue;
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349 * migration is optimistic to see if the minimum amount of work 559 * migration is optimistic to see if the minimum amount of work
350 * satisfies the allocation 560 * satisfies the allocation
351 */ 561 */
352 pageblock_nr = low_pfn >> pageblock_order;
353 if (!cc->sync && last_pageblock_nr != pageblock_nr && 562 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
354 !migrate_async_suitable(get_pageblock_migratetype(page))) { 563 !migrate_async_suitable(get_pageblock_migratetype(page))) {
355 low_pfn += pageblock_nr_pages; 564 cc->finished_update_migrate = true;
356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 565 goto next_pageblock;
357 last_pageblock_nr = pageblock_nr;
358 continue;
359 } 566 }
360 567
568 /* Check may be lockless but that's ok as we recheck later */
361 if (!PageLRU(page)) 569 if (!PageLRU(page))
362 continue; 570 continue;
363 571
364 /* 572 /*
365 * PageLRU is set, and lru_lock excludes isolation, 573 * PageLRU is set. lru_lock normally excludes isolation
366 * splitting and collapsing (collapsing has already 574 * splitting and collapsing (collapsing has already happened
367 * happened if PageLRU is set). 575 * if PageLRU is set) but the lock is not necessarily taken
576 * here and it is wasteful to take it just to check transhuge.
577 * Check TransHuge without lock and skip the whole pageblock if
578 * it's either a transhuge or hugetlbfs page, as calling
579 * compound_order() without preventing THP from splitting the
580 * page underneath us may return surprising results.
368 */ 581 */
369 if (PageTransHuge(page)) { 582 if (PageTransHuge(page)) {
583 if (!locked)
584 goto next_pageblock;
585 low_pfn += (1 << compound_order(page)) - 1;
586 continue;
587 }
588
589 /* Check if it is ok to still hold the lock */
590 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
591 locked, cc);
592 if (!locked || fatal_signal_pending(current))
593 break;
594
595 /* Recheck PageLRU and PageTransHuge under lock */
596 if (!PageLRU(page))
597 continue;
598 if (PageTransHuge(page)) {
370 low_pfn += (1 << compound_order(page)) - 1; 599 low_pfn += (1 << compound_order(page)) - 1;
371 continue; 600 continue;
372 } 601 }
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
374 if (!cc->sync) 603 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE; 604 mode |= ISOLATE_ASYNC_MIGRATE;
376 605
606 if (unevictable)
607 mode |= ISOLATE_UNEVICTABLE;
608
377 lruvec = mem_cgroup_page_lruvec(page, zone); 609 lruvec = mem_cgroup_page_lruvec(page, zone);
378 610
379 /* Try isolate the page */ 611 /* Try isolate the page */
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
383 VM_BUG_ON(PageTransCompound(page)); 615 VM_BUG_ON(PageTransCompound(page));
384 616
385 /* Successfully isolated */ 617 /* Successfully isolated */
618 cc->finished_update_migrate = true;
386 del_page_from_lru_list(page, lruvec, page_lru(page)); 619 del_page_from_lru_list(page, lruvec, page_lru(page));
387 list_add(&page->lru, migratelist); 620 list_add(&page->lru, migratelist);
388 cc->nr_migratepages++; 621 cc->nr_migratepages++;
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
393 ++low_pfn; 626 ++low_pfn;
394 break; 627 break;
395 } 628 }
629
630 continue;
631
632next_pageblock:
633 low_pfn += pageblock_nr_pages;
634 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
635 last_pageblock_nr = pageblock_nr;
396 } 636 }
397 637
398 acct_isolated(zone, locked, cc); 638 acct_isolated(zone, locked, cc);
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
400 if (locked) 640 if (locked)
401 spin_unlock_irqrestore(&zone->lru_lock, flags); 641 spin_unlock_irqrestore(&zone->lru_lock, flags);
402 642
643 /* Update the pageblock-skip if the whole pageblock was scanned */
644 if (low_pfn == end_pfn)
645 update_pageblock_skip(cc, valid_page, nr_isolated, true);
646
403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 647 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
404 648
405 return low_pfn; 649 return low_pfn;
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
407 651
408#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 652#endif /* CONFIG_COMPACTION || CONFIG_CMA */
409#ifdef CONFIG_COMPACTION 653#ifdef CONFIG_COMPACTION
410
411/* Returns true if the page is within a block suitable for migration to */
412static bool suitable_migration_target(struct page *page)
413{
414
415 int migratetype = get_pageblock_migratetype(page);
416
417 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
418 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
419 return false;
420
421 /* If the page is a large free page, then allow migration */
422 if (PageBuddy(page) && page_order(page) >= pageblock_order)
423 return true;
424
425 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
426 if (migrate_async_suitable(migratetype))
427 return true;
428
429 /* Otherwise skip the block */
430 return false;
431}
432
433/*
434 * Returns the start pfn of the last page block in a zone. This is the starting
435 * point for full compaction of a zone. Compaction searches for free pages from
436 * the end of each zone, while isolate_freepages_block scans forward inside each
437 * page block.
438 */
439static unsigned long start_free_pfn(struct zone *zone)
440{
441 unsigned long free_pfn;
442 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
443 free_pfn &= ~(pageblock_nr_pages-1);
444 return free_pfn;
445}
446
447/* 654/*
448 * Based on information in the current compact_control, find blocks 655 * Based on information in the current compact_control, find blocks
449 * suitable for isolating free pages from and then isolate them. 656 * suitable for isolating free pages from and then isolate them.
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone,
453{ 660{
454 struct page *page; 661 struct page *page;
455 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 662 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
456 unsigned long flags;
457 int nr_freepages = cc->nr_freepages; 663 int nr_freepages = cc->nr_freepages;
458 struct list_head *freelist = &cc->freepages; 664 struct list_head *freelist = &cc->freepages;
459 665
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone,
501 if (!suitable_migration_target(page)) 707 if (!suitable_migration_target(page))
502 continue; 708 continue;
503 709
504 /* 710 /* If isolation recently failed, do not retry */
505 * Found a block suitable for isolating free pages from. Now 711 if (!isolation_suitable(cc, page))
506 * we disabled interrupts, double check things are ok and 712 continue;
507 * isolate the pages. This is to minimise the time IRQs
508 * are disabled
509 */
510 isolated = 0;
511 713
512 /* 714 /* Found a block suitable for isolating free pages from */
513 * The zone lock must be held to isolate freepages. This 715 isolated = 0;
514 * unfortunately this is a very coarse lock and can be 716 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
515 * heavily contended if there are parallel allocations 717 isolated = isolate_freepages_block(cc, pfn, end_pfn,
516 * or parallel compactions. For async compaction do not 718 freelist, false);
517 * spin on the lock 719 nr_freepages += isolated;
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
521 if (suitable_migration_target(page)) {
522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
523 isolated = isolate_freepages_block(pfn, end_pfn,
524 freelist, false);
525 nr_freepages += isolated;
526 }
527 spin_unlock_irqrestore(&zone->lock, flags);
528 720
529 /* 721 /*
530 * Record the highest PFN we isolated pages from. When next 722 * Record the highest PFN we isolated pages from. When next
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone,
532 * page migration may have returned some pages to the allocator 724 * page migration may have returned some pages to the allocator
533 */ 725 */
534 if (isolated) { 726 if (isolated) {
727 cc->finished_update_free = true;
535 high_pfn = max(high_pfn, pfn); 728 high_pfn = max(high_pfn, pfn);
536
537 /*
538 * If the free scanner has wrapped, update
539 * compact_cached_free_pfn to point to the highest
540 * pageblock with free pages. This reduces excessive
541 * scanning of full pageblocks near the end of the
542 * zone
543 */
544 if (cc->order > 0 && cc->wrapped)
545 zone->compact_cached_free_pfn = high_pfn;
546 } 729 }
547 } 730 }
548 731
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone,
551 734
552 cc->free_pfn = high_pfn; 735 cc->free_pfn = high_pfn;
553 cc->nr_freepages = nr_freepages; 736 cc->nr_freepages = nr_freepages;
554
555 /* If compact_cached_free_pfn is reset then set it now */
556 if (cc->order > 0 && !cc->wrapped &&
557 zone->compact_cached_free_pfn == start_free_pfn(zone))
558 zone->compact_cached_free_pfn = high_pfn;
559} 737}
560 738
561/* 739/*
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
633 } 811 }
634 812
635 /* Perform the isolation */ 813 /* Perform the isolation */
636 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); 814 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
637 if (!low_pfn) 815 if (!low_pfn || cc->contended)
638 return ISOLATE_ABORT; 816 return ISOLATE_ABORT;
639 817
640 cc->migrate_pfn = low_pfn; 818 cc->migrate_pfn = low_pfn;
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
645static int compact_finished(struct zone *zone, 823static int compact_finished(struct zone *zone,
646 struct compact_control *cc) 824 struct compact_control *cc)
647{ 825{
648 unsigned int order;
649 unsigned long watermark; 826 unsigned long watermark;
650 827
651 if (fatal_signal_pending(current)) 828 if (fatal_signal_pending(current))
652 return COMPACT_PARTIAL; 829 return COMPACT_PARTIAL;
653 830
654 /* 831 /* Compaction run completes if the migrate and free scanner meet */
655 * A full (order == -1) compaction run starts at the beginning and
656 * end of a zone; it completes when the migrate and free scanner meet.
657 * A partial (order > 0) compaction can start with the free scanner
658 * at a random point in the zone, and may have to restart.
659 */
660 if (cc->free_pfn <= cc->migrate_pfn) { 832 if (cc->free_pfn <= cc->migrate_pfn) {
661 if (cc->order > 0 && !cc->wrapped) { 833 /*
662 /* We started partway through; restart at the end. */ 834 * Mark that the PG_migrate_skip information should be cleared
663 unsigned long free_pfn = start_free_pfn(zone); 835 * by kswapd when it goes to sleep. kswapd does not set the
664 zone->compact_cached_free_pfn = free_pfn; 836 * flag itself as the decision to be clear should be directly
665 cc->free_pfn = free_pfn; 837 * based on an allocation request.
666 cc->wrapped = 1; 838 */
667 return COMPACT_CONTINUE; 839 if (!current_is_kswapd())
668 } 840 zone->compact_blockskip_flush = true;
669 return COMPACT_COMPLETE;
670 }
671 841
672 /* We wrapped around and ended up where we started. */
673 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
674 return COMPACT_COMPLETE; 842 return COMPACT_COMPLETE;
843 }
675 844
676 /* 845 /*
677 * order == -1 is expected when compacting via 846 * order == -1 is expected when compacting via
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone,
688 return COMPACT_CONTINUE; 857 return COMPACT_CONTINUE;
689 858
690 /* Direct compactor: Is a suitable page free? */ 859 /* Direct compactor: Is a suitable page free? */
691 for (order = cc->order; order < MAX_ORDER; order++) { 860 if (cc->page) {
692 /* Job done if page is free of the right migratetype */ 861 /* Was a suitable page captured? */
693 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) 862 if (*cc->page)
694 return COMPACT_PARTIAL;
695
696 /* Job done if allocation would set block type */
697 if (order >= pageblock_order && zone->free_area[order].nr_free)
698 return COMPACT_PARTIAL; 863 return COMPACT_PARTIAL;
864 } else {
865 unsigned int order;
866 for (order = cc->order; order < MAX_ORDER; order++) {
867 struct free_area *area = &zone->free_area[cc->order];
868 /* Job done if page is free of the right migratetype */
869 if (!list_empty(&area->free_list[cc->migratetype]))
870 return COMPACT_PARTIAL;
871
872 /* Job done if allocation would set block type */
873 if (cc->order >= pageblock_order && area->nr_free)
874 return COMPACT_PARTIAL;
875 }
699 } 876 }
700 877
701 return COMPACT_CONTINUE; 878 return COMPACT_CONTINUE;
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
754static int compact_zone(struct zone *zone, struct compact_control *cc) 931static int compact_zone(struct zone *zone, struct compact_control *cc)
755{ 932{
756 int ret; 933 int ret;
934 unsigned long start_pfn = zone->zone_start_pfn;
935 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
757 936
758 ret = compaction_suitable(zone, cc->order); 937 ret = compaction_suitable(zone, cc->order);
759 switch (ret) { 938 switch (ret) {
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
766 ; 945 ;
767 } 946 }
768 947
769 /* Setup to move all movable pages to the end of the zone */ 948 /*
770 cc->migrate_pfn = zone->zone_start_pfn; 949 * Setup to move all movable pages to the end of the zone. Used cached
771 950 * information on where the scanners should start but check that it
772 if (cc->order > 0) { 951 * is initialised by ensuring the values are within zone boundaries.
773 /* Incremental compaction. Start where the last one stopped. */ 952 */
774 cc->free_pfn = zone->compact_cached_free_pfn; 953 cc->migrate_pfn = zone->compact_cached_migrate_pfn;
775 cc->start_free_pfn = cc->free_pfn; 954 cc->free_pfn = zone->compact_cached_free_pfn;
776 } else { 955 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
777 /* Order == -1 starts at the end of the zone. */ 956 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
778 cc->free_pfn = start_free_pfn(zone); 957 zone->compact_cached_free_pfn = cc->free_pfn;
958 }
959 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
960 cc->migrate_pfn = start_pfn;
961 zone->compact_cached_migrate_pfn = cc->migrate_pfn;
779 } 962 }
780 963
964 /*
965 * Clear pageblock skip if there were failures recently and compaction
966 * is about to be retried after being deferred. kswapd does not do
967 * this reset as it'll reset the cached information when going to sleep.
968 */
969 if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
970 __reset_isolation_suitable(zone);
971
781 migrate_prep_local(); 972 migrate_prep_local();
782 973
783 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 974 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
787 switch (isolate_migratepages(zone, cc)) { 978 switch (isolate_migratepages(zone, cc)) {
788 case ISOLATE_ABORT: 979 case ISOLATE_ABORT:
789 ret = COMPACT_PARTIAL; 980 ret = COMPACT_PARTIAL;
981 putback_lru_pages(&cc->migratepages);
982 cc->nr_migratepages = 0;
790 goto out; 983 goto out;
791 case ISOLATE_NONE: 984 case ISOLATE_NONE:
792 continue; 985 continue;
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
817 goto out; 1010 goto out;
818 } 1011 }
819 } 1012 }
1013
1014 /* Capture a page now if it is a suitable size */
1015 compact_capture_page(cc);
820 } 1016 }
821 1017
822out: 1018out:
@@ -829,8 +1025,10 @@ out:
829 1025
830static unsigned long compact_zone_order(struct zone *zone, 1026static unsigned long compact_zone_order(struct zone *zone,
831 int order, gfp_t gfp_mask, 1027 int order, gfp_t gfp_mask,
832 bool sync, bool *contended) 1028 bool sync, bool *contended,
1029 struct page **page)
833{ 1030{
1031 unsigned long ret;
834 struct compact_control cc = { 1032 struct compact_control cc = {
835 .nr_freepages = 0, 1033 .nr_freepages = 0,
836 .nr_migratepages = 0, 1034 .nr_migratepages = 0,
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone,
838 .migratetype = allocflags_to_migratetype(gfp_mask), 1036 .migratetype = allocflags_to_migratetype(gfp_mask),
839 .zone = zone, 1037 .zone = zone,
840 .sync = sync, 1038 .sync = sync,
841 .contended = contended, 1039 .page = page,
842 }; 1040 };
843 INIT_LIST_HEAD(&cc.freepages); 1041 INIT_LIST_HEAD(&cc.freepages);
844 INIT_LIST_HEAD(&cc.migratepages); 1042 INIT_LIST_HEAD(&cc.migratepages);
845 1043
846 return compact_zone(zone, &cc); 1044 ret = compact_zone(zone, &cc);
1045
1046 VM_BUG_ON(!list_empty(&cc.freepages));
1047 VM_BUG_ON(!list_empty(&cc.migratepages));
1048
1049 *contended = cc.contended;
1050 return ret;
847} 1051}
848 1052
849int sysctl_extfrag_threshold = 500; 1053int sysctl_extfrag_threshold = 500;
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500;
855 * @gfp_mask: The GFP mask of the current allocation 1059 * @gfp_mask: The GFP mask of the current allocation
856 * @nodemask: The allowed nodes to allocate from 1060 * @nodemask: The allowed nodes to allocate from
857 * @sync: Whether migration is synchronous or not 1061 * @sync: Whether migration is synchronous or not
1062 * @contended: Return value that is true if compaction was aborted due to lock contention
1063 * @page: Optionally capture a free page of the requested order during compaction
858 * 1064 *
859 * This is the main entry point for direct page compaction. 1065 * This is the main entry point for direct page compaction.
860 */ 1066 */
861unsigned long try_to_compact_pages(struct zonelist *zonelist, 1067unsigned long try_to_compact_pages(struct zonelist *zonelist,
862 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1068 int order, gfp_t gfp_mask, nodemask_t *nodemask,
863 bool sync, bool *contended) 1069 bool sync, bool *contended, struct page **page)
864{ 1070{
865 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1071 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
866 int may_enter_fs = gfp_mask & __GFP_FS; 1072 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
868 struct zoneref *z; 1074 struct zoneref *z;
869 struct zone *zone; 1075 struct zone *zone;
870 int rc = COMPACT_SKIPPED; 1076 int rc = COMPACT_SKIPPED;
1077 int alloc_flags = 0;
871 1078
872 /* 1079 /* Check if the GFP flags allow compaction */
873 * Check whether it is worth even starting compaction. The order check is
874 * made because an assumption is made that the page allocator can satisfy
875 * the "cheaper" orders without taking special steps
876 */
877 if (!order || !may_enter_fs || !may_perform_io) 1080 if (!order || !may_enter_fs || !may_perform_io)
878 return rc; 1081 return rc;
879 1082
880 count_vm_event(COMPACTSTALL); 1083 count_vm_event(COMPACTSTALL);
881 1084
1085#ifdef CONFIG_CMA
1086 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1087 alloc_flags |= ALLOC_CMA;
1088#endif
882 /* Compact each zone in the list */ 1089 /* Compact each zone in the list */
883 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1090 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
884 nodemask) { 1091 nodemask) {
885 int status; 1092 int status;
886 1093
887 status = compact_zone_order(zone, order, gfp_mask, sync, 1094 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended); 1095 contended, page);
889 rc = max(status, rc); 1096 rc = max(status, rc);
890 1097
891 /* If a normal allocation would succeed, stop compacting */ 1098 /* If a normal allocation would succeed, stop compacting */
892 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) 1099 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1100 alloc_flags))
893 break; 1101 break;
894 } 1102 }
895 1103
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
940 struct compact_control cc = { 1148 struct compact_control cc = {
941 .order = order, 1149 .order = order,
942 .sync = false, 1150 .sync = false,
1151 .page = NULL,
943 }; 1152 };
944 1153
945 return __compact_pgdat(pgdat, &cc); 1154 return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1159,7 @@ static int compact_node(int nid)
950 struct compact_control cc = { 1159 struct compact_control cc = {
951 .order = -1, 1160 .order = -1,
952 .sync = true, 1161 .sync = true,
1162 .page = NULL,
953 }; 1163 };
954 1164
955 return __compact_pgdat(NODE_DATA(nid), &cc); 1165 return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 9b75a045dbf4..a47f0f50c89f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,7 +26,7 @@
26 */ 26 */
27SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) 27SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
28{ 28{
29 struct file *file = fget(fd); 29 struct fd f = fdget(fd);
30 struct address_space *mapping; 30 struct address_space *mapping;
31 struct backing_dev_info *bdi; 31 struct backing_dev_info *bdi;
32 loff_t endbyte; /* inclusive */ 32 loff_t endbyte; /* inclusive */
@@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
35 unsigned long nrpages; 35 unsigned long nrpages;
36 int ret = 0; 36 int ret = 0;
37 37
38 if (!file) 38 if (!f.file)
39 return -EBADF; 39 return -EBADF;
40 40
41 if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { 41 if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
42 ret = -ESPIPE; 42 ret = -ESPIPE;
43 goto out; 43 goto out;
44 } 44 }
45 45
46 mapping = file->f_mapping; 46 mapping = f.file->f_mapping;
47 if (!mapping || len < 0) { 47 if (!mapping || len < 0) {
48 ret = -EINVAL; 48 ret = -EINVAL;
49 goto out; 49 goto out;
@@ -76,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
76 76
77 switch (advice) { 77 switch (advice) {
78 case POSIX_FADV_NORMAL: 78 case POSIX_FADV_NORMAL:
79 file->f_ra.ra_pages = bdi->ra_pages; 79 f.file->f_ra.ra_pages = bdi->ra_pages;
80 spin_lock(&file->f_lock); 80 spin_lock(&f.file->f_lock);
81 file->f_mode &= ~FMODE_RANDOM; 81 f.file->f_mode &= ~FMODE_RANDOM;
82 spin_unlock(&file->f_lock); 82 spin_unlock(&f.file->f_lock);
83 break; 83 break;
84 case POSIX_FADV_RANDOM: 84 case POSIX_FADV_RANDOM:
85 spin_lock(&file->f_lock); 85 spin_lock(&f.file->f_lock);
86 file->f_mode |= FMODE_RANDOM; 86 f.file->f_mode |= FMODE_RANDOM;
87 spin_unlock(&file->f_lock); 87 spin_unlock(&f.file->f_lock);
88 break; 88 break;
89 case POSIX_FADV_SEQUENTIAL: 89 case POSIX_FADV_SEQUENTIAL:
90 file->f_ra.ra_pages = bdi->ra_pages * 2; 90 f.file->f_ra.ra_pages = bdi->ra_pages * 2;
91 spin_lock(&file->f_lock); 91 spin_lock(&f.file->f_lock);
92 file->f_mode &= ~FMODE_RANDOM; 92 f.file->f_mode &= ~FMODE_RANDOM;
93 spin_unlock(&file->f_lock); 93 spin_unlock(&f.file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
@@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 * Ignore return value because fadvise() shall return 106 * Ignore return value because fadvise() shall return
107 * success even if filesystem can't retrieve a hint, 107 * success even if filesystem can't retrieve a hint,
108 */ 108 */
109 force_page_cache_readahead(mapping, file, start_index, 109 force_page_cache_readahead(mapping, f.file, start_index,
110 nrpages); 110 nrpages);
111 break; 111 break;
112 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
@@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
128 ret = -EINVAL; 128 ret = -EINVAL;
129 } 129 }
130out: 130out:
131 fput(file); 131 fdput(f);
132 return ret; 132 return ret;
133} 133}
134#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS 134#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1607 * Do we have something in the page cache already? 1607 * Do we have something in the page cache already?
1608 */ 1608 */
1609 page = find_get_page(mapping, offset); 1609 page = find_get_page(mapping, offset);
1610 if (likely(page)) { 1610 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
1611 /* 1611 /*
1612 * We found the page, so try async readahead before 1612 * We found the page, so try async readahead before
1613 * waiting for the lock. 1613 * waiting for the lock.
1614 */ 1614 */
1615 do_async_mmap_readahead(vma, ra, file, page, offset); 1615 do_async_mmap_readahead(vma, ra, file, page, offset);
1616 } else { 1616 } else if (!page) {
1617 /* No page in the page cache at all */ 1617 /* No page in the page cache at all */
1618 do_sync_mmap_readahead(vma, ra, file, offset); 1618 do_sync_mmap_readahead(vma, ra, file, offset);
1619 count_vm_event(PGMAJFAULT); 1619 count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
1737const struct vm_operations_struct generic_file_vm_ops = { 1737const struct vm_operations_struct generic_file_vm_ops = {
1738 .fault = filemap_fault, 1738 .fault = filemap_fault,
1739 .page_mkwrite = filemap_page_mkwrite, 1739 .page_mkwrite = filemap_page_mkwrite,
1740 .remap_pages = generic_file_remap_pages,
1740}; 1741};
1741 1742
1742/* This is used for a general mmap of a disk file */ 1743/* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1749 return -ENOEXEC; 1750 return -ENOEXEC;
1750 file_accessed(file); 1751 file_accessed(file);
1751 vma->vm_ops = &generic_file_vm_ops; 1752 vma->vm_ops = &generic_file_vm_ops;
1752 vma->vm_flags |= VM_CAN_NONLINEAR;
1753 return 0; 1753 return 0;
1754} 1754}
1755 1755
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
167{ 167{
168 struct vm_area_struct *vma; 168 struct vm_area_struct *vma;
169 struct mm_struct *mm; 169 struct mm_struct *mm;
170 struct prio_tree_iter iter;
171 unsigned long address; 170 unsigned long address;
172 pte_t *pte; 171 pte_t *pte;
173 pte_t pteval; 172 pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
184 183
185retry: 184retry:
186 mutex_lock(&mapping->i_mmap_mutex); 185 mutex_lock(&mapping->i_mmap_mutex);
187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 186 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
188 mm = vma->vm_mm; 187 mm = vma->vm_mm;
189 address = vma->vm_start + 188 address = vma->vm_start +
190 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 189 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
193 if (pte) { 192 if (pte) {
194 /* Nuke the page table entry. */ 193 /* Nuke the page table entry. */
195 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
196 pteval = ptep_clear_flush_notify(vma, address, pte); 195 pteval = ptep_clear_flush(vma, address, pte);
197 page_remove_rmap(page); 196 page_remove_rmap(page);
198 dec_mm_counter(mm, MM_FILEPAGES); 197 dec_mm_counter(mm, MM_FILEPAGES);
199 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
200 pte_unmap_unlock(pte, ptl); 199 pte_unmap_unlock(pte, ptl);
200 /* must invalidate_page _before_ freeing the page */
201 mmu_notifier_invalidate_page(mm, address);
201 page_cache_release(page); 202 page_cache_release(page);
202 } 203 }
203 } 204 }
@@ -305,6 +306,7 @@ out:
305static const struct vm_operations_struct xip_file_vm_ops = { 306static const struct vm_operations_struct xip_file_vm_ops = {
306 .fault = xip_file_fault, 307 .fault = xip_file_fault,
307 .page_mkwrite = filemap_page_mkwrite, 308 .page_mkwrite = filemap_page_mkwrite,
309 .remap_pages = generic_file_remap_pages,
308}; 310};
309 311
310int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 312int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
313 315
314 file_accessed(file); 316 file_accessed(file);
315 vma->vm_ops = &xip_file_vm_ops; 317 vma->vm_ops = &xip_file_vm_ops;
316 vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; 318 vma->vm_flags |= VM_MIXEDMAP;
317 return 0; 319 return 0;
318} 320}
319EXPORT_SYMBOL_GPL(xip_file_mmap); 321EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 9ed4fd432467..a0aaf0e56800 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
5 * 5 *
6 * started by Ingo Molnar, Copyright (C) 2002, 2003 6 * started by Ingo Molnar, Copyright (C) 2002, 2003
7 */ 7 */
8#include <linux/export.h>
8#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/swap.h> 11#include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
80 return err; 81 return err;
81} 82}
82 83
83static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, 84int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
84 unsigned long addr, unsigned long size, pgoff_t pgoff) 85 unsigned long size, pgoff_t pgoff)
85{ 86{
87 struct mm_struct *mm = vma->vm_mm;
86 int err; 88 int err;
87 89
88 do { 90 do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
95 pgoff++; 97 pgoff++;
96 } while (size); 98 } while (size);
97 99
98 return 0; 100 return 0;
99
100} 101}
102EXPORT_SYMBOL(generic_file_remap_pages);
101 103
102/** 104/**
103 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma 105 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
167 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) 169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
168 goto out; 170 goto out;
169 171
170 if (!(vma->vm_flags & VM_CAN_NONLINEAR)) 172 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
171 goto out; 173 goto out;
172 174
173 if (start < vma->vm_start || start + size > vma->vm_end) 175 if (start < vma->vm_start || start + size > vma->vm_end)
@@ -195,10 +197,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
195 */ 197 */
196 if (mapping_cap_account_dirty(mapping)) { 198 if (mapping_cap_account_dirty(mapping)) {
197 unsigned long addr; 199 unsigned long addr;
198 struct file *file = vma->vm_file; 200 struct file *file = get_file(vma->vm_file);
199 201
200 flags &= MAP_NONBLOCK; 202 flags &= MAP_NONBLOCK;
201 get_file(file);
202 addr = mmap_region(file, start, size, 203 addr = mmap_region(file, start, size,
203 flags, vma->vm_flags, pgoff); 204 flags, vma->vm_flags, pgoff);
204 fput(file); 205 fput(file);
@@ -213,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
213 mutex_lock(&mapping->i_mmap_mutex); 214 mutex_lock(&mapping->i_mmap_mutex);
214 flush_dcache_mmap_lock(mapping); 215 flush_dcache_mmap_lock(mapping);
215 vma->vm_flags |= VM_NONLINEAR; 216 vma->vm_flags |= VM_NONLINEAR;
216 vma_prio_tree_remove(vma, &mapping->i_mmap); 217 vma_interval_tree_remove(vma, &mapping->i_mmap);
217 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
218 flush_dcache_mmap_unlock(mapping); 219 flush_dcache_mmap_unlock(mapping);
219 mutex_unlock(&mapping->i_mmap_mutex); 220 mutex_unlock(&mapping->i_mmap_mutex);
@@ -229,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
229 } 230 }
230 231
231 mmu_notifier_invalidate_range_start(mm, start, start + size); 232 mmu_notifier_invalidate_range_start(mm, start, start + size);
232 err = populate_range(mm, vma, start, size, pgoff); 233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
233 mmu_notifier_invalidate_range_end(mm, start, start + size); 234 mmu_notifier_invalidate_range_end(mm, start, start + size);
234 if (!err && !(flags & MAP_NONBLOCK)) { 235 if (!err && !(flags & MAP_NONBLOCK)) {
235 if (vma->vm_flags & VM_LOCKED) { 236 if (vma->vm_flags & VM_LOCKED) {
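
The fremap hunks above replace the VM_CAN_NONLINEAR flag test with a check for a ->remap_pages method and then dispatch through that method. A minimal user-space C sketch of the same dispatch pattern; every name here is an illustrative stand-in, not the kernel's structures:

#include <stdio.h>
#include <errno.h>

struct vma;                                     /* hypothetical stand-ins */
struct vm_ops {
        int (*remap_pages)(struct vma *vma, unsigned long addr,
                           unsigned long size, unsigned long pgoff);
};
struct vma {
        const struct vm_ops *vm_ops;
};

static int demo_remap_pages(struct vma *vma, unsigned long addr,
                            unsigned long size, unsigned long pgoff)
{
        printf("remap addr=%#lx size=%#lx pgoff=%lu\n", addr, size, pgoff);
        return 0;
}

/* Mirrors the new capability test: no flag bit, just "does the op exist?". */
static int do_remap(struct vma *vma, unsigned long addr,
                    unsigned long size, unsigned long pgoff)
{
        if (!vma->vm_ops || !vma->vm_ops->remap_pages)
                return -EINVAL;                 /* replaces the VM_CAN_NONLINEAR check */
        return vma->vm_ops->remap_pages(vma, addr, size, pgoff);
}

int main(void)
{
        const struct vm_ops ops = { .remap_pages = demo_remap_pages };
        struct vma supported = { .vm_ops = &ops };
        struct vma plain = { .vm_ops = NULL };

        printf("supported: %d\n", do_remap(&supported, 0x1000, 0x2000, 3));
        printf("plain:     %d\n", do_remap(&plain, 0x1000, 0x2000, 3));
        return 0;
}
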
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6b3e71a2cd48..2890e67d6026 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled);
44 */ 44 */
45static bool frontswap_writethrough_enabled __read_mostly; 45static bool frontswap_writethrough_enabled __read_mostly;
46 46
47/*
48 * If enabled, the underlying tmem implementation is capable of doing
49 * exclusive gets, so frontswap_load, on a successful tmem_get must
50 * mark the page as no longer in frontswap AND mark it dirty.
51 */
52static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
53
47#ifdef CONFIG_DEBUG_FS 54#ifdef CONFIG_DEBUG_FS
48/* 55/*
49 * Counters available via /sys/kernel/debug/frontswap (if debugfs is 56 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
@@ -97,6 +104,15 @@ void frontswap_writethrough(bool enable)
97EXPORT_SYMBOL(frontswap_writethrough); 104EXPORT_SYMBOL(frontswap_writethrough);
98 105
99/* 106/*
107 * Enable/disable frontswap exclusive gets (see above).
108 */
109void frontswap_tmem_exclusive_gets(bool enable)
110{
111 frontswap_tmem_exclusive_gets_enabled = enable;
112}
113EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
114
115/*
100 * Called when a swap device is swapon'd. 116 * Called when a swap device is swapon'd.
101 */ 117 */
102void __frontswap_init(unsigned type) 118void __frontswap_init(unsigned type)
@@ -174,8 +190,13 @@ int __frontswap_load(struct page *page)
174 BUG_ON(sis == NULL); 190 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset)) 191 if (frontswap_test(sis, offset))
176 ret = frontswap_ops.load(type, offset, page); 192 ret = frontswap_ops.load(type, offset, page);
177 if (ret == 0) 193 if (ret == 0) {
178 inc_frontswap_loads(); 194 inc_frontswap_loads();
195 if (frontswap_tmem_exclusive_gets_enabled) {
196 SetPageDirty(page);
197 frontswap_clear(sis, offset);
198 }
199 }
179 return ret; 200 return ret;
180} 201}
181EXPORT_SYMBOL(__frontswap_load); 202EXPORT_SYMBOL(__frontswap_load);
@@ -263,6 +284,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
263 return ret; 284 return ret;
264} 285}
265 286
287/*
 288 * Used to check if it's necessary and feasible to unuse pages.
 289 * Return 1 when nothing to do, 0 when we need to shrink pages,
290 * error code when there is an error.
291 */
266static int __frontswap_shrink(unsigned long target_pages, 292static int __frontswap_shrink(unsigned long target_pages,
267 unsigned long *pages_to_unuse, 293 unsigned long *pages_to_unuse,
268 int *type) 294 int *type)
@@ -275,7 +301,7 @@ static int __frontswap_shrink(unsigned long target_pages,
275 if (total_pages <= target_pages) { 301 if (total_pages <= target_pages) {
276 /* Nothing to do */ 302 /* Nothing to do */
277 *pages_to_unuse = 0; 303 *pages_to_unuse = 0;
278 return 0; 304 return 1;
279 } 305 }
280 total_pages_to_unuse = total_pages - target_pages; 306 total_pages_to_unuse = total_pages - target_pages;
281 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); 307 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
@@ -292,7 +318,7 @@ static int __frontswap_shrink(unsigned long target_pages,
292void frontswap_shrink(unsigned long target_pages) 318void frontswap_shrink(unsigned long target_pages)
293{ 319{
294 unsigned long pages_to_unuse = 0; 320 unsigned long pages_to_unuse = 0;
295 int type, ret; 321 int uninitialized_var(type), ret;
296 322
297 /* 323 /*
298 * we don't want to hold swap_lock while doing a very 324 * we don't want to hold swap_lock while doing a very
@@ -302,7 +328,7 @@ void frontswap_shrink(unsigned long target_pages)
302 spin_lock(&swap_lock); 328 spin_lock(&swap_lock);
303 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); 329 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
304 spin_unlock(&swap_lock); 330 spin_unlock(&swap_lock);
305 if (ret == 0 && pages_to_unuse) 331 if (ret == 0)
306 try_to_unuse(type, true, pages_to_unuse); 332 try_to_unuse(type, true, pages_to_unuse);
307 return; 333 return;
308} 334}
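
The frontswap hunks above add "exclusive gets": on a successful load the page is marked dirty and the slot is cleared from frontswap, so the read consumes the backend copy. A small user-space sketch of that load path, under the simplifying assumption that the backing store is a flat array of present/dirty flags:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 8

static bool exclusive_gets_enabled = true;      /* frontswap_tmem_exclusive_gets(true) */
static bool in_frontswap[NPAGES];
static bool page_dirty[NPAGES];

static int demo_load(unsigned long offset)
{
        if (!in_frontswap[offset])
                return -1;                      /* miss: fall back to the real swap device */
        /* successful "tmem_get" */
        if (exclusive_gets_enabled) {
                page_dirty[offset] = true;      /* SetPageDirty(): the data now lives only
                                                 * in the page and must be written back
                                                 * before the page can be reclaimed */
                in_frontswap[offset] = false;   /* frontswap_clear(): slot is consumed */
        }
        return 0;
}

int main(void)
{
        in_frontswap[3] = true;
        printf("first load : %d\n", demo_load(3));   /* hit, slot is consumed */
        printf("second load: %d\n", demo_load(3));   /* miss, the get was exclusive */
        printf("dirty      : %d\n", page_dirty[3]);
        return 0;
}
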
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..40f17c34b415 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -17,6 +17,7 @@
17#include <linux/khugepaged.h> 17#include <linux/khugepaged.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/mman.h> 19#include <linux/mman.h>
20#include <linux/pagemap.h>
20#include <asm/tlb.h> 21#include <asm/tlb.h>
21#include <asm/pgalloc.h> 22#include <asm/pgalloc.h>
22#include "internal.h" 23#include "internal.h"
@@ -102,10 +103,7 @@ static int set_recommended_min_free_kbytes(void)
102 unsigned long recommended_min; 103 unsigned long recommended_min;
103 extern int min_free_kbytes; 104 extern int min_free_kbytes;
104 105
105 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, 106 if (!khugepaged_enabled())
106 &transparent_hugepage_flags) &&
107 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
108 &transparent_hugepage_flags))
109 return 0; 107 return 0;
110 108
111 for_each_populated_zone(zone) 109 for_each_populated_zone(zone)
@@ -139,12 +137,6 @@ static int start_khugepaged(void)
139{ 137{
140 int err = 0; 138 int err = 0;
141 if (khugepaged_enabled()) { 139 if (khugepaged_enabled()) {
142 int wakeup;
143 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
144 err = -ENOMEM;
145 goto out;
146 }
147 mutex_lock(&khugepaged_mutex);
148 if (!khugepaged_thread) 140 if (!khugepaged_thread)
149 khugepaged_thread = kthread_run(khugepaged, NULL, 141 khugepaged_thread = kthread_run(khugepaged, NULL,
150 "khugepaged"); 142 "khugepaged");
@@ -154,16 +146,16 @@ static int start_khugepaged(void)
154 err = PTR_ERR(khugepaged_thread); 146 err = PTR_ERR(khugepaged_thread);
155 khugepaged_thread = NULL; 147 khugepaged_thread = NULL;
156 } 148 }
157 wakeup = !list_empty(&khugepaged_scan.mm_head); 149
158 mutex_unlock(&khugepaged_mutex); 150 if (!list_empty(&khugepaged_scan.mm_head))
159 if (wakeup)
160 wake_up_interruptible(&khugepaged_wait); 151 wake_up_interruptible(&khugepaged_wait);
161 152
162 set_recommended_min_free_kbytes(); 153 set_recommended_min_free_kbytes();
163 } else 154 } else if (khugepaged_thread) {
164 /* wakeup to exit */ 155 kthread_stop(khugepaged_thread);
165 wake_up_interruptible(&khugepaged_wait); 156 khugepaged_thread = NULL;
166out: 157 }
158
167 return err; 159 return err;
168} 160}
169 161
@@ -224,18 +216,16 @@ static ssize_t enabled_store(struct kobject *kobj,
224 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 216 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
225 217
226 if (ret > 0) { 218 if (ret > 0) {
227 int err = start_khugepaged(); 219 int err;
220
221 mutex_lock(&khugepaged_mutex);
222 err = start_khugepaged();
223 mutex_unlock(&khugepaged_mutex);
224
228 if (err) 225 if (err)
229 ret = err; 226 ret = err;
230 } 227 }
231 228
232 if (ret > 0 &&
233 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
234 &transparent_hugepage_flags) ||
235 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
236 &transparent_hugepage_flags)))
237 set_recommended_min_free_kbytes();
238
239 return ret; 229 return ret;
240} 230}
241static struct kobj_attribute enabled_attr = 231static struct kobj_attribute enabled_attr =
@@ -570,8 +560,6 @@ static int __init hugepage_init(void)
570 560
571 start_khugepaged(); 561 start_khugepaged();
572 562
573 set_recommended_min_free_kbytes();
574
575 return 0; 563 return 0;
576out: 564out:
577 hugepage_exit_sysfs(hugepage_kobj); 565 hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +599,6 @@ out:
611} 599}
612__setup("transparent_hugepage=", setup_transparent_hugepage); 600__setup("transparent_hugepage=", setup_transparent_hugepage);
613 601
614static void prepare_pmd_huge_pte(pgtable_t pgtable,
615 struct mm_struct *mm)
616{
617 assert_spin_locked(&mm->page_table_lock);
618
619 /* FIFO */
620 if (!mm->pmd_huge_pte)
621 INIT_LIST_HEAD(&pgtable->lru);
622 else
623 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
624 mm->pmd_huge_pte = pgtable;
625}
626
627static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 602static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
628{ 603{
629 if (likely(vma->vm_flags & VM_WRITE)) 604 if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +640,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
665 */ 640 */
666 page_add_new_anon_rmap(page, vma, haddr); 641 page_add_new_anon_rmap(page, vma, haddr);
667 set_pmd_at(mm, haddr, pmd, entry); 642 set_pmd_at(mm, haddr, pmd, entry);
668 prepare_pmd_huge_pte(pgtable, mm); 643 pgtable_trans_huge_deposit(mm, pgtable);
669 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 644 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
670 mm->nr_ptes++; 645 mm->nr_ptes++;
671 spin_unlock(&mm->page_table_lock); 646 spin_unlock(&mm->page_table_lock);
@@ -791,7 +766,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
791 pmdp_set_wrprotect(src_mm, addr, src_pmd); 766 pmdp_set_wrprotect(src_mm, addr, src_pmd);
792 pmd = pmd_mkold(pmd_wrprotect(pmd)); 767 pmd = pmd_mkold(pmd_wrprotect(pmd));
793 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 768 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
794 prepare_pmd_huge_pte(pgtable, dst_mm); 769 pgtable_trans_huge_deposit(dst_mm, pgtable);
795 dst_mm->nr_ptes++; 770 dst_mm->nr_ptes++;
796 771
797 ret = 0; 772 ret = 0;
@@ -802,25 +777,6 @@ out:
802 return ret; 777 return ret;
803} 778}
804 779
805/* no "address" argument so destroys page coloring of some arch */
806pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
807{
808 pgtable_t pgtable;
809
810 assert_spin_locked(&mm->page_table_lock);
811
812 /* FIFO */
813 pgtable = mm->pmd_huge_pte;
814 if (list_empty(&pgtable->lru))
815 mm->pmd_huge_pte = NULL;
816 else {
817 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
818 struct page, lru);
819 list_del(&pgtable->lru);
820 }
821 return pgtable;
822}
823
824static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
825 struct vm_area_struct *vma, 781 struct vm_area_struct *vma,
826 unsigned long address, 782 unsigned long address,
@@ -832,6 +788,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
832 pmd_t _pmd; 788 pmd_t _pmd;
833 int ret = 0, i; 789 int ret = 0, i;
834 struct page **pages; 790 struct page **pages;
791 unsigned long mmun_start; /* For mmu_notifiers */
792 unsigned long mmun_end; /* For mmu_notifiers */
835 793
836 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, 794 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
837 GFP_KERNEL); 795 GFP_KERNEL);
@@ -868,15 +826,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
868 cond_resched(); 826 cond_resched();
869 } 827 }
870 828
829 mmun_start = haddr;
830 mmun_end = haddr + HPAGE_PMD_SIZE;
831 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
832
871 spin_lock(&mm->page_table_lock); 833 spin_lock(&mm->page_table_lock);
872 if (unlikely(!pmd_same(*pmd, orig_pmd))) 834 if (unlikely(!pmd_same(*pmd, orig_pmd)))
873 goto out_free_pages; 835 goto out_free_pages;
874 VM_BUG_ON(!PageHead(page)); 836 VM_BUG_ON(!PageHead(page));
875 837
876 pmdp_clear_flush_notify(vma, haddr, pmd); 838 pmdp_clear_flush(vma, haddr, pmd);
877 /* leave pmd empty until pte is filled */ 839 /* leave pmd empty until pte is filled */
878 840
879 pgtable = get_pmd_huge_pte(mm); 841 pgtable = pgtable_trans_huge_withdraw(mm);
880 pmd_populate(mm, &_pmd, pgtable); 842 pmd_populate(mm, &_pmd, pgtable);
881 843
882 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 844 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +858,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
896 page_remove_rmap(page); 858 page_remove_rmap(page);
897 spin_unlock(&mm->page_table_lock); 859 spin_unlock(&mm->page_table_lock);
898 860
861 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
862
899 ret |= VM_FAULT_WRITE; 863 ret |= VM_FAULT_WRITE;
900 put_page(page); 864 put_page(page);
901 865
@@ -904,6 +868,7 @@ out:
904 868
905out_free_pages: 869out_free_pages:
906 spin_unlock(&mm->page_table_lock); 870 spin_unlock(&mm->page_table_lock);
871 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
907 mem_cgroup_uncharge_start(); 872 mem_cgroup_uncharge_start();
908 for (i = 0; i < HPAGE_PMD_NR; i++) { 873 for (i = 0; i < HPAGE_PMD_NR; i++) {
909 mem_cgroup_uncharge_page(pages[i]); 874 mem_cgroup_uncharge_page(pages[i]);
@@ -920,6 +885,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
920 int ret = 0; 885 int ret = 0;
921 struct page *page, *new_page; 886 struct page *page, *new_page;
922 unsigned long haddr; 887 unsigned long haddr;
888 unsigned long mmun_start; /* For mmu_notifiers */
889 unsigned long mmun_end; /* For mmu_notifiers */
923 890
924 VM_BUG_ON(!vma->anon_vma); 891 VM_BUG_ON(!vma->anon_vma);
925 spin_lock(&mm->page_table_lock); 892 spin_lock(&mm->page_table_lock);
@@ -934,7 +901,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
934 entry = pmd_mkyoung(orig_pmd); 901 entry = pmd_mkyoung(orig_pmd);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 902 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 903 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
937 update_mmu_cache(vma, address, entry); 904 update_mmu_cache_pmd(vma, address, pmd);
938 ret |= VM_FAULT_WRITE; 905 ret |= VM_FAULT_WRITE;
939 goto out_unlock; 906 goto out_unlock;
940 } 907 }
@@ -970,38 +937,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
970 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 937 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
971 __SetPageUptodate(new_page); 938 __SetPageUptodate(new_page);
972 939
940 mmun_start = haddr;
941 mmun_end = haddr + HPAGE_PMD_SIZE;
942 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
943
973 spin_lock(&mm->page_table_lock); 944 spin_lock(&mm->page_table_lock);
974 put_page(page); 945 put_page(page);
975 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 946 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock); 947 spin_unlock(&mm->page_table_lock);
977 mem_cgroup_uncharge_page(new_page); 948 mem_cgroup_uncharge_page(new_page);
978 put_page(new_page); 949 put_page(new_page);
979 goto out; 950 goto out_mn;
980 } else { 951 } else {
981 pmd_t entry; 952 pmd_t entry;
982 VM_BUG_ON(!PageHead(page)); 953 VM_BUG_ON(!PageHead(page));
983 entry = mk_pmd(new_page, vma->vm_page_prot); 954 entry = mk_pmd(new_page, vma->vm_page_prot);
984 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
985 entry = pmd_mkhuge(entry); 956 entry = pmd_mkhuge(entry);
986 pmdp_clear_flush_notify(vma, haddr, pmd); 957 pmdp_clear_flush(vma, haddr, pmd);
987 page_add_new_anon_rmap(new_page, vma, haddr); 958 page_add_new_anon_rmap(new_page, vma, haddr);
988 set_pmd_at(mm, haddr, pmd, entry); 959 set_pmd_at(mm, haddr, pmd, entry);
989 update_mmu_cache(vma, address, entry); 960 update_mmu_cache_pmd(vma, address, pmd);
990 page_remove_rmap(page); 961 page_remove_rmap(page);
991 put_page(page); 962 put_page(page);
992 ret |= VM_FAULT_WRITE; 963 ret |= VM_FAULT_WRITE;
993 } 964 }
994out_unlock:
995 spin_unlock(&mm->page_table_lock); 965 spin_unlock(&mm->page_table_lock);
966out_mn:
967 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
996out: 968out:
997 return ret; 969 return ret;
970out_unlock:
971 spin_unlock(&mm->page_table_lock);
972 return ret;
998} 973}
999 974
1000struct page *follow_trans_huge_pmd(struct mm_struct *mm, 975struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1001 unsigned long addr, 976 unsigned long addr,
1002 pmd_t *pmd, 977 pmd_t *pmd,
1003 unsigned int flags) 978 unsigned int flags)
1004{ 979{
980 struct mm_struct *mm = vma->vm_mm;
1005 struct page *page = NULL; 981 struct page *page = NULL;
1006 982
1007 assert_spin_locked(&mm->page_table_lock); 983 assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +1000,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
1024 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 1000 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1025 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1001 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1026 } 1002 }
1003 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1004 if (page->mapping && trylock_page(page)) {
1005 lru_add_drain();
1006 if (page->mapping)
1007 mlock_vma_page(page);
1008 unlock_page(page);
1009 }
1010 }
1027 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1011 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1028 VM_BUG_ON(!PageCompound(page)); 1012 VM_BUG_ON(!PageCompound(page));
1029 if (flags & FOLL_GET) 1013 if (flags & FOLL_GET)
@@ -1041,9 +1025,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1041 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1025 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1042 struct page *page; 1026 struct page *page;
1043 pgtable_t pgtable; 1027 pgtable_t pgtable;
1044 pgtable = get_pmd_huge_pte(tlb->mm); 1028 pmd_t orig_pmd;
1045 page = pmd_page(*pmd); 1029 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1046 pmd_clear(pmd); 1030 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1031 page = pmd_page(orig_pmd);
1047 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1032 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1048 page_remove_rmap(page); 1033 page_remove_rmap(page);
1049 VM_BUG_ON(page_mapcount(page) < 0); 1034 VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1192,11 @@ static int __split_huge_page_splitting(struct page *page,
1207 struct mm_struct *mm = vma->vm_mm; 1192 struct mm_struct *mm = vma->vm_mm;
1208 pmd_t *pmd; 1193 pmd_t *pmd;
1209 int ret = 0; 1194 int ret = 0;
1195 /* For mmu_notifiers */
1196 const unsigned long mmun_start = address;
1197 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1210 1198
1199 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1211 spin_lock(&mm->page_table_lock); 1200 spin_lock(&mm->page_table_lock);
1212 pmd = page_check_address_pmd(page, mm, address, 1201 pmd = page_check_address_pmd(page, mm, address,
1213 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1202 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1208,11 @@ static int __split_huge_page_splitting(struct page *page,
1219 * and it won't wait on the anon_vma->root->mutex to 1208 * and it won't wait on the anon_vma->root->mutex to
1220 * serialize against split_huge_page*. 1209 * serialize against split_huge_page*.
1221 */ 1210 */
1222 pmdp_splitting_flush_notify(vma, address, pmd); 1211 pmdp_splitting_flush(vma, address, pmd);
1223 ret = 1; 1212 ret = 1;
1224 } 1213 }
1225 spin_unlock(&mm->page_table_lock); 1214 spin_unlock(&mm->page_table_lock);
1215 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1226 1216
1227 return ret; 1217 return ret;
1228} 1218}
@@ -1358,11 +1348,11 @@ static int __split_huge_page_map(struct page *page,
1358 pmd = page_check_address_pmd(page, mm, address, 1348 pmd = page_check_address_pmd(page, mm, address,
1359 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1349 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1360 if (pmd) { 1350 if (pmd) {
1361 pgtable = get_pmd_huge_pte(mm); 1351 pgtable = pgtable_trans_huge_withdraw(mm);
1362 pmd_populate(mm, &_pmd, pgtable); 1352 pmd_populate(mm, &_pmd, pgtable);
1363 1353
1364 for (i = 0, haddr = address; i < HPAGE_PMD_NR; 1354 haddr = address;
1365 i++, haddr += PAGE_SIZE) { 1355 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1366 pte_t *pte, entry; 1356 pte_t *pte, entry;
1367 BUG_ON(PageCompound(page+i)); 1357 BUG_ON(PageCompound(page+i));
1368 entry = mk_pte(page + i, vma->vm_page_prot); 1358 entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1396,7 @@ static int __split_huge_page_map(struct page *page,
1406 * SMP TLB and finally we write the non-huge version 1396 * SMP TLB and finally we write the non-huge version
1407 * of the pmd entry with pmd_populate. 1397 * of the pmd entry with pmd_populate.
1408 */ 1398 */
1409 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); 1399 pmdp_invalidate(vma, address, pmd);
1410 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1411 pmd_populate(mm, pmd, pgtable); 1400 pmd_populate(mm, pmd, pgtable);
1412 ret = 1; 1401 ret = 1;
1413 } 1402 }
@@ -1421,18 +1410,17 @@ static void __split_huge_page(struct page *page,
1421 struct anon_vma *anon_vma) 1410 struct anon_vma *anon_vma)
1422{ 1411{
1423 int mapcount, mapcount2; 1412 int mapcount, mapcount2;
1413 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1424 struct anon_vma_chain *avc; 1414 struct anon_vma_chain *avc;
1425 1415
1426 BUG_ON(!PageHead(page)); 1416 BUG_ON(!PageHead(page));
1427 BUG_ON(PageTail(page)); 1417 BUG_ON(PageTail(page));
1428 1418
1429 mapcount = 0; 1419 mapcount = 0;
1430 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1420 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1431 struct vm_area_struct *vma = avc->vma; 1421 struct vm_area_struct *vma = avc->vma;
1432 unsigned long addr = vma_address(page, vma); 1422 unsigned long addr = vma_address(page, vma);
1433 BUG_ON(is_vma_temporary_stack(vma)); 1423 BUG_ON(is_vma_temporary_stack(vma));
1434 if (addr == -EFAULT)
1435 continue;
1436 mapcount += __split_huge_page_splitting(page, vma, addr); 1424 mapcount += __split_huge_page_splitting(page, vma, addr);
1437 } 1425 }
1438 /* 1426 /*
@@ -1453,12 +1441,10 @@ static void __split_huge_page(struct page *page,
1453 __split_huge_page_refcount(page); 1441 __split_huge_page_refcount(page);
1454 1442
1455 mapcount2 = 0; 1443 mapcount2 = 0;
1456 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1444 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1457 struct vm_area_struct *vma = avc->vma; 1445 struct vm_area_struct *vma = avc->vma;
1458 unsigned long addr = vma_address(page, vma); 1446 unsigned long addr = vma_address(page, vma);
1459 BUG_ON(is_vma_temporary_stack(vma)); 1447 BUG_ON(is_vma_temporary_stack(vma));
1460 if (addr == -EFAULT)
1461 continue;
1462 mapcount2 += __split_huge_page_map(page, vma, addr); 1448 mapcount2 += __split_huge_page_map(page, vma, addr);
1463 } 1449 }
1464 if (mapcount != mapcount2) 1450 if (mapcount != mapcount2)
@@ -1491,12 +1477,13 @@ out:
1491 return ret; 1477 return ret;
1492} 1478}
1493 1479
1494#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ 1480#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1495 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1496 1481
1497int hugepage_madvise(struct vm_area_struct *vma, 1482int hugepage_madvise(struct vm_area_struct *vma,
1498 unsigned long *vm_flags, int advice) 1483 unsigned long *vm_flags, int advice)
1499{ 1484{
1485 struct mm_struct *mm = vma->vm_mm;
1486
1500 switch (advice) { 1487 switch (advice) {
1501 case MADV_HUGEPAGE: 1488 case MADV_HUGEPAGE:
1502 /* 1489 /*
@@ -1504,6 +1491,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
1504 */ 1491 */
1505 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1492 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1506 return -EINVAL; 1493 return -EINVAL;
1494 if (mm->def_flags & VM_NOHUGEPAGE)
1495 return -EINVAL;
1507 *vm_flags &= ~VM_NOHUGEPAGE; 1496 *vm_flags &= ~VM_NOHUGEPAGE;
1508 *vm_flags |= VM_HUGEPAGE; 1497 *vm_flags |= VM_HUGEPAGE;
1509 /* 1498 /*
@@ -1655,11 +1644,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1655 if (vma->vm_ops) 1644 if (vma->vm_ops)
1656 /* khugepaged not yet working on file or special mappings */ 1645 /* khugepaged not yet working on file or special mappings */
1657 return 0; 1646 return 0;
1658 /* 1647 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1659 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1660 * true too, verify it here.
1661 */
1662 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1663 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1648 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1664 hend = vma->vm_end & HPAGE_PMD_MASK; 1649 hend = vma->vm_end & HPAGE_PMD_MASK;
1665 if (hstart < hend) 1650 if (hstart < hend)
@@ -1833,28 +1818,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1833 } 1818 }
1834} 1819}
1835 1820
1836static void collapse_huge_page(struct mm_struct *mm, 1821static void khugepaged_alloc_sleep(void)
1837 unsigned long address,
1838 struct page **hpage,
1839 struct vm_area_struct *vma,
1840 int node)
1841{ 1822{
1842 pgd_t *pgd; 1823 wait_event_freezable_timeout(khugepaged_wait, false,
1843 pud_t *pud; 1824 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
1844 pmd_t *pmd, _pmd; 1825}
1845 pte_t *pte;
1846 pgtable_t pgtable;
1847 struct page *new_page;
1848 spinlock_t *ptl;
1849 int isolated;
1850 unsigned long hstart, hend;
1851 1826
1852 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1827#ifdef CONFIG_NUMA
1853#ifndef CONFIG_NUMA 1828static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1854 up_read(&mm->mmap_sem); 1829{
1855 VM_BUG_ON(!*hpage); 1830 if (IS_ERR(*hpage)) {
1856 new_page = *hpage; 1831 if (!*wait)
1857#else 1832 return false;
1833
1834 *wait = false;
1835 *hpage = NULL;
1836 khugepaged_alloc_sleep();
1837 } else if (*hpage) {
1838 put_page(*hpage);
1839 *hpage = NULL;
1840 }
1841
1842 return true;
1843}
1844
1845static struct page
1846*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1847 struct vm_area_struct *vma, unsigned long address,
1848 int node)
1849{
1858 VM_BUG_ON(*hpage); 1850 VM_BUG_ON(*hpage);
1859 /* 1851 /*
1860 * Allocate the page while the vma is still valid and under 1852 * Allocate the page while the vma is still valid and under
@@ -1866,7 +1858,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1866 * mmap_sem in read mode is good idea also to allow greater 1858 * mmap_sem in read mode is good idea also to allow greater
1867 * scalability. 1859 * scalability.
1868 */ 1860 */
1869 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1861 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1870 node, __GFP_OTHER_NODE); 1862 node, __GFP_OTHER_NODE);
1871 1863
1872 /* 1864 /*
@@ -1874,20 +1866,85 @@ static void collapse_huge_page(struct mm_struct *mm,
1874 * preparation for taking it in write mode. 1866 * preparation for taking it in write mode.
1875 */ 1867 */
1876 up_read(&mm->mmap_sem); 1868 up_read(&mm->mmap_sem);
1877 if (unlikely(!new_page)) { 1869 if (unlikely(!*hpage)) {
1878 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1870 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1879 *hpage = ERR_PTR(-ENOMEM); 1871 *hpage = ERR_PTR(-ENOMEM);
1880 return; 1872 return NULL;
1881 } 1873 }
1882#endif
1883 1874
1884 count_vm_event(THP_COLLAPSE_ALLOC); 1875 count_vm_event(THP_COLLAPSE_ALLOC);
1885 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1876 return *hpage;
1886#ifdef CONFIG_NUMA 1877}
1887 put_page(new_page); 1878#else
1879static struct page *khugepaged_alloc_hugepage(bool *wait)
1880{
1881 struct page *hpage;
1882
1883 do {
1884 hpage = alloc_hugepage(khugepaged_defrag());
1885 if (!hpage) {
1886 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1887 if (!*wait)
1888 return NULL;
1889
1890 *wait = false;
1891 khugepaged_alloc_sleep();
1892 } else
1893 count_vm_event(THP_COLLAPSE_ALLOC);
1894 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
1895
1896 return hpage;
1897}
1898
1899static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1900{
1901 if (!*hpage)
1902 *hpage = khugepaged_alloc_hugepage(wait);
1903
1904 if (unlikely(!*hpage))
1905 return false;
1906
1907 return true;
1908}
1909
1910static struct page
1911*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1912 struct vm_area_struct *vma, unsigned long address,
1913 int node)
1914{
1915 up_read(&mm->mmap_sem);
1916 VM_BUG_ON(!*hpage);
1917 return *hpage;
1918}
1888#endif 1919#endif
1920
1921static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address,
1923 struct page **hpage,
1924 struct vm_area_struct *vma,
1925 int node)
1926{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd;
1930 pte_t *pte;
1931 pgtable_t pgtable;
1932 struct page *new_page;
1933 spinlock_t *ptl;
1934 int isolated;
1935 unsigned long hstart, hend;
1936 unsigned long mmun_start; /* For mmu_notifiers */
1937 unsigned long mmun_end; /* For mmu_notifiers */
1938
1939 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1940
1941 /* release the mmap_sem read lock. */
1942 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
1943 if (!new_page)
1944 return;
1945
1946 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
1889 return; 1947 return;
1890 }
1891 1948
1892 /* 1949 /*
1893 * Prevent all access to pagetables with the exception of 1950 * Prevent all access to pagetables with the exception of
@@ -1912,11 +1969,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1912 goto out; 1969 goto out;
1913 if (is_vma_temporary_stack(vma)) 1970 if (is_vma_temporary_stack(vma))
1914 goto out; 1971 goto out;
1915 /* 1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1916 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1917 * true too, verify it here.
1918 */
1919 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1920 1973
1921 pgd = pgd_offset(mm, address); 1974 pgd = pgd_offset(mm, address);
1922 if (!pgd_present(*pgd)) 1975 if (!pgd_present(*pgd))
@@ -1936,6 +1989,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1936 pte = pte_offset_map(pmd, address); 1989 pte = pte_offset_map(pmd, address);
1937 ptl = pte_lockptr(mm, pmd); 1990 ptl = pte_lockptr(mm, pmd);
1938 1991
1992 mmun_start = address;
1993 mmun_end = address + HPAGE_PMD_SIZE;
1994 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1939 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 1995 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1940 /* 1996 /*
1941 * After this gup_fast can't run anymore. This also removes 1997 * After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1999,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1943 * huge and small TLB entries for the same virtual address 1999 * huge and small TLB entries for the same virtual address
1944 * to avoid the risk of CPU bugs in that area. 2000 * to avoid the risk of CPU bugs in that area.
1945 */ 2001 */
1946 _pmd = pmdp_clear_flush_notify(vma, address, pmd); 2002 _pmd = pmdp_clear_flush(vma, address, pmd);
1947 spin_unlock(&mm->page_table_lock); 2003 spin_unlock(&mm->page_table_lock);
2004 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1948 2005
1949 spin_lock(ptl); 2006 spin_lock(ptl);
1950 isolated = __collapse_huge_page_isolate(vma, address, pte); 2007 isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2027,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1970 pte_unmap(pte); 2027 pte_unmap(pte);
1971 __SetPageUptodate(new_page); 2028 __SetPageUptodate(new_page);
1972 pgtable = pmd_pgtable(_pmd); 2029 pgtable = pmd_pgtable(_pmd);
1973 VM_BUG_ON(page_count(pgtable) != 1);
1974 VM_BUG_ON(page_mapcount(pgtable) != 0);
1975 2030
1976 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2031 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1977 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2043,12 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 BUG_ON(!pmd_none(*pmd)); 2043 BUG_ON(!pmd_none(*pmd));
1989 page_add_new_anon_rmap(new_page, vma, address); 2044 page_add_new_anon_rmap(new_page, vma, address);
1990 set_pmd_at(mm, address, pmd, _pmd); 2045 set_pmd_at(mm, address, pmd, _pmd);
1991 update_mmu_cache(vma, address, _pmd); 2046 update_mmu_cache_pmd(vma, address, pmd);
1992 prepare_pmd_huge_pte(pgtable, mm); 2047 pgtable_trans_huge_deposit(mm, pgtable);
1993 spin_unlock(&mm->page_table_lock); 2048 spin_unlock(&mm->page_table_lock);
1994 2049
1995#ifndef CONFIG_NUMA
1996 *hpage = NULL; 2050 *hpage = NULL;
1997#endif 2051
1998 khugepaged_pages_collapsed++; 2052 khugepaged_pages_collapsed++;
1999out_up_write: 2053out_up_write:
2000 up_write(&mm->mmap_sem); 2054 up_write(&mm->mmap_sem);
@@ -2002,9 +2056,6 @@ out_up_write:
2002 2056
2003out: 2057out:
2004 mem_cgroup_uncharge_page(new_page); 2058 mem_cgroup_uncharge_page(new_page);
2005#ifdef CONFIG_NUMA
2006 put_page(new_page);
2007#endif
2008 goto out_up_write; 2059 goto out_up_write;
2009} 2060}
2010 2061
@@ -2154,12 +2205,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2154 goto skip; 2205 goto skip;
2155 if (is_vma_temporary_stack(vma)) 2206 if (is_vma_temporary_stack(vma))
2156 goto skip; 2207 goto skip;
2157 /* 2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2158 * If is_pfn_mapping() is true is_learn_pfn_mapping()
2159 * must be true too, verify it here.
2160 */
2161 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2162 vma->vm_flags & VM_NO_THP);
2163 2209
2164 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2165 hend = vma->vm_end & HPAGE_PMD_MASK; 2211 hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2280,23 @@ static int khugepaged_has_work(void)
2234static int khugepaged_wait_event(void) 2280static int khugepaged_wait_event(void)
2235{ 2281{
2236 return !list_empty(&khugepaged_scan.mm_head) || 2282 return !list_empty(&khugepaged_scan.mm_head) ||
2237 !khugepaged_enabled(); 2283 kthread_should_stop();
2238} 2284}
2239 2285
2240static void khugepaged_do_scan(struct page **hpage) 2286static void khugepaged_do_scan(void)
2241{ 2287{
2288 struct page *hpage = NULL;
2242 unsigned int progress = 0, pass_through_head = 0; 2289 unsigned int progress = 0, pass_through_head = 0;
2243 unsigned int pages = khugepaged_pages_to_scan; 2290 unsigned int pages = khugepaged_pages_to_scan;
2291 bool wait = true;
2244 2292
2245 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2293 barrier(); /* write khugepaged_pages_to_scan to local stack */
2246 2294
2247 while (progress < pages) { 2295 while (progress < pages) {
2248 cond_resched(); 2296 if (!khugepaged_prealloc_page(&hpage, &wait))
2249
2250#ifndef CONFIG_NUMA
2251 if (!*hpage) {
2252 *hpage = alloc_hugepage(khugepaged_defrag());
2253 if (unlikely(!*hpage)) {
2254 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2255 break;
2256 }
2257 count_vm_event(THP_COLLAPSE_ALLOC);
2258 }
2259#else
2260 if (IS_ERR(*hpage))
2261 break; 2297 break;
2262#endif 2298
2299 cond_resched();
2263 2300
2264 if (unlikely(kthread_should_stop() || freezing(current))) 2301 if (unlikely(kthread_should_stop() || freezing(current)))
2265 break; 2302 break;
@@ -2270,73 +2307,32 @@ static void khugepaged_do_scan(struct page **hpage)
2270 if (khugepaged_has_work() && 2307 if (khugepaged_has_work() &&
2271 pass_through_head < 2) 2308 pass_through_head < 2)
2272 progress += khugepaged_scan_mm_slot(pages - progress, 2309 progress += khugepaged_scan_mm_slot(pages - progress,
2273 hpage); 2310 &hpage);
2274 else 2311 else
2275 progress = pages; 2312 progress = pages;
2276 spin_unlock(&khugepaged_mm_lock); 2313 spin_unlock(&khugepaged_mm_lock);
2277 } 2314 }
2278}
2279 2315
2280static void khugepaged_alloc_sleep(void) 2316 if (!IS_ERR_OR_NULL(hpage))
2281{ 2317 put_page(hpage);
2282 wait_event_freezable_timeout(khugepaged_wait, false,
2283 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2284} 2318}
2285 2319
2286#ifndef CONFIG_NUMA 2320static void khugepaged_wait_work(void)
2287static struct page *khugepaged_alloc_hugepage(void)
2288{ 2321{
2289 struct page *hpage; 2322 try_to_freeze();
2290
2291 do {
2292 hpage = alloc_hugepage(khugepaged_defrag());
2293 if (!hpage) {
2294 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2295 khugepaged_alloc_sleep();
2296 } else
2297 count_vm_event(THP_COLLAPSE_ALLOC);
2298 } while (unlikely(!hpage) &&
2299 likely(khugepaged_enabled()));
2300 return hpage;
2301}
2302#endif
2303 2323
2304static void khugepaged_loop(void) 2324 if (khugepaged_has_work()) {
2305{ 2325 if (!khugepaged_scan_sleep_millisecs)
2306 struct page *hpage; 2326 return;
2307 2327
2308#ifdef CONFIG_NUMA 2328 wait_event_freezable_timeout(khugepaged_wait,
2309 hpage = NULL; 2329 kthread_should_stop(),
2310#endif 2330 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2311 while (likely(khugepaged_enabled())) { 2331 return;
2312#ifndef CONFIG_NUMA
2313 hpage = khugepaged_alloc_hugepage();
2314 if (unlikely(!hpage))
2315 break;
2316#else
2317 if (IS_ERR(hpage)) {
2318 khugepaged_alloc_sleep();
2319 hpage = NULL;
2320 }
2321#endif
2322
2323 khugepaged_do_scan(&hpage);
2324#ifndef CONFIG_NUMA
2325 if (hpage)
2326 put_page(hpage);
2327#endif
2328 try_to_freeze();
2329 if (unlikely(kthread_should_stop()))
2330 break;
2331 if (khugepaged_has_work()) {
2332 if (!khugepaged_scan_sleep_millisecs)
2333 continue;
2334 wait_event_freezable_timeout(khugepaged_wait, false,
2335 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2336 } else if (khugepaged_enabled())
2337 wait_event_freezable(khugepaged_wait,
2338 khugepaged_wait_event());
2339 } 2332 }
2333
2334 if (khugepaged_enabled())
2335 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2340} 2336}
2341 2337
2342static int khugepaged(void *none) 2338static int khugepaged(void *none)
@@ -2346,20 +2342,9 @@ static int khugepaged(void *none)
2346 set_freezable(); 2342 set_freezable();
2347 set_user_nice(current, 19); 2343 set_user_nice(current, 19);
2348 2344
2349 /* serialize with start_khugepaged() */ 2345 while (!kthread_should_stop()) {
2350 mutex_lock(&khugepaged_mutex); 2346 khugepaged_do_scan();
2351 2347 khugepaged_wait_work();
2352 for (;;) {
2353 mutex_unlock(&khugepaged_mutex);
2354 VM_BUG_ON(khugepaged_thread != current);
2355 khugepaged_loop();
2356 VM_BUG_ON(khugepaged_thread != current);
2357
2358 mutex_lock(&khugepaged_mutex);
2359 if (!khugepaged_enabled())
2360 break;
2361 if (unlikely(kthread_should_stop()))
2362 break;
2363 } 2348 }
2364 2349
2365 spin_lock(&khugepaged_mm_lock); 2350 spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2353,6 @@ static int khugepaged(void *none)
2368 if (mm_slot) 2353 if (mm_slot)
2369 collect_mm_slot(mm_slot); 2354 collect_mm_slot(mm_slot);
2370 spin_unlock(&khugepaged_mm_lock); 2355 spin_unlock(&khugepaged_mm_lock);
2371
2372 khugepaged_thread = NULL;
2373 mutex_unlock(&khugepaged_mutex);
2374
2375 return 0; 2356 return 0;
2376} 2357}
2377 2358
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd44..59a0059b39e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,7 +30,6 @@
30#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h> 31#include <linux/hugetlb_cgroup.h>
32#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
34#include "internal.h" 33#include "internal.h"
35 34
36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
637 h->surplus_huge_pages--; 636 h->surplus_huge_pages--;
638 h->surplus_huge_pages_node[nid]--; 637 h->surplus_huge_pages_node[nid]--;
639 } else { 638 } else {
639 arch_clear_hugepage_flags(page);
640 enqueue_huge_page(h, page); 640 enqueue_huge_page(h, page);
641 } 641 }
642 spin_unlock(&hugetlb_lock); 642 spin_unlock(&hugetlb_lock);
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
671 } 671 }
672} 672}
673 673
674/*
675 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
676 * transparent huge pages. See the PageTransHuge() documentation for more
677 * details.
678 */
674int PageHuge(struct page *page) 679int PageHuge(struct page *page)
675{ 680{
676 compound_page_dtor *dtor; 681 compound_page_dtor *dtor;
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2355 struct page *page; 2360 struct page *page;
2356 struct hstate *h = hstate_vma(vma); 2361 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2362 unsigned long sz = huge_page_size(h);
2363 const unsigned long mmun_start = start; /* For mmu_notifiers */
2364 const unsigned long mmun_end = end; /* For mmu_notifiers */
2358 2365
2359 WARN_ON(!is_vm_hugetlb_page(vma)); 2366 WARN_ON(!is_vm_hugetlb_page(vma));
2360 BUG_ON(start & ~huge_page_mask(h)); 2367 BUG_ON(start & ~huge_page_mask(h));
2361 BUG_ON(end & ~huge_page_mask(h)); 2368 BUG_ON(end & ~huge_page_mask(h));
2362 2369
2363 tlb_start_vma(tlb, vma); 2370 tlb_start_vma(tlb, vma);
2364 mmu_notifier_invalidate_range_start(mm, start, end); 2371 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2365again: 2372again:
2366 spin_lock(&mm->page_table_lock); 2373 spin_lock(&mm->page_table_lock);
2367 for (address = start; address < end; address += sz) { 2374 for (address = start; address < end; address += sz) {
@@ -2425,7 +2432,7 @@ again:
2425 if (address < end && !ref_page) 2432 if (address < end && !ref_page)
2426 goto again; 2433 goto again;
2427 } 2434 }
2428 mmu_notifier_invalidate_range_end(mm, start, end); 2435 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2429 tlb_end_vma(tlb, vma); 2436 tlb_end_vma(tlb, vma);
2430} 2437}
2431 2438
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2473 struct hstate *h = hstate_vma(vma); 2480 struct hstate *h = hstate_vma(vma);
2474 struct vm_area_struct *iter_vma; 2481 struct vm_area_struct *iter_vma;
2475 struct address_space *mapping; 2482 struct address_space *mapping;
2476 struct prio_tree_iter iter;
2477 pgoff_t pgoff; 2483 pgoff_t pgoff;
2478 2484
2479 /* 2485 /*
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2481 * from page cache lookup which is in HPAGE_SIZE units. 2487 * from page cache lookup which is in HPAGE_SIZE units.
2482 */ 2488 */
2483 address = address & huge_page_mask(h); 2489 address = address & huge_page_mask(h);
2484 pgoff = vma_hugecache_offset(h, vma, address); 2490 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2491 vma->vm_pgoff;
2485 mapping = vma->vm_file->f_dentry->d_inode->i_mapping; 2492 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2486 2493
2487 /* 2494 /*
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2490 * __unmap_hugepage_range() is called as the lock is already held 2497 * __unmap_hugepage_range() is called as the lock is already held
2491 */ 2498 */
2492 mutex_lock(&mapping->i_mmap_mutex); 2499 mutex_lock(&mapping->i_mmap_mutex);
2493 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2500 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2494 /* Do not unmap the current VMA */ 2501 /* Do not unmap the current VMA */
2495 if (iter_vma == vma) 2502 if (iter_vma == vma)
2496 continue; 2503 continue;
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2525 struct page *old_page, *new_page; 2532 struct page *old_page, *new_page;
2526 int avoidcopy; 2533 int avoidcopy;
2527 int outside_reserve = 0; 2534 int outside_reserve = 0;
2535 unsigned long mmun_start; /* For mmu_notifiers */
2536 unsigned long mmun_end; /* For mmu_notifiers */
2528 2537
2529 old_page = pte_page(pte); 2538 old_page = pte_page(pte);
2530 2539
@@ -2611,6 +2620,9 @@ retry_avoidcopy:
2611 pages_per_huge_page(h)); 2620 pages_per_huge_page(h));
2612 __SetPageUptodate(new_page); 2621 __SetPageUptodate(new_page);
2613 2622
2623 mmun_start = address & huge_page_mask(h);
2624 mmun_end = mmun_start + huge_page_size(h);
2625 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2614 /* 2626 /*
2615 * Retake the page_table_lock to check for racing updates 2627 * Retake the page_table_lock to check for racing updates
2616 * before the page tables are altered 2628 * before the page tables are altered
@@ -2619,9 +2631,6 @@ retry_avoidcopy:
2619 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2631 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2620 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2632 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2621 /* Break COW */ 2633 /* Break COW */
2622 mmu_notifier_invalidate_range_start(mm,
2623 address & huge_page_mask(h),
2624 (address & huge_page_mask(h)) + huge_page_size(h));
2625 huge_ptep_clear_flush(vma, address, ptep); 2634 huge_ptep_clear_flush(vma, address, ptep);
2626 set_huge_pte_at(mm, address, ptep, 2635 set_huge_pte_at(mm, address, ptep,
2627 make_huge_pte(vma, new_page, 1)); 2636 make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2638,11 @@ retry_avoidcopy:
2629 hugepage_add_new_anon_rmap(new_page, vma, address); 2638 hugepage_add_new_anon_rmap(new_page, vma, address);
2630 /* Make the old page be freed below */ 2639 /* Make the old page be freed below */
2631 new_page = old_page; 2640 new_page = old_page;
2632 mmu_notifier_invalidate_range_end(mm,
2633 address & huge_page_mask(h),
2634 (address & huge_page_mask(h)) + huge_page_size(h));
2635 } 2641 }
2642 spin_unlock(&mm->page_table_lock);
2643 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2644 /* Caller expects lock to be held */
2645 spin_lock(&mm->page_table_lock);
2636 page_cache_release(new_page); 2646 page_cache_release(new_page);
2637 page_cache_release(old_page); 2647 page_cache_release(old_page);
2638 return 0; 2648 return 0;
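
The hugetlb_cow() hunk above hoists the mmu_notifier calls outside the page_table_lock and computes the invalidation window once, by rounding the faulting address down to its huge page. A quick model of that window computation, assuming a 2 MB huge page size for the demo:

#include <stdio.h>

#define HUGE_PAGE_SIZE (2UL << 20)
#define HUGE_PAGE_MASK (~(HUGE_PAGE_SIZE - 1))

int main(void)
{
        unsigned long address = 0x7f1234567890UL;
        unsigned long mmun_start = address & HUGE_PAGE_MASK;   /* round down */
        unsigned long mmun_end = mmun_start + HUGE_PAGE_SIZE;  /* one huge page */

        printf("fault at   %#lx\n", address);
        printf("invalidate [%#lx, %#lx)\n", mmun_start, mmun_end);
        return 0;
}
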
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e24..a4fa284f6bc2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,26 +118,27 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */ 121 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */ 122 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are 123 bool ignore_skip_hint; /* Scan blocks even if marked skip */
125 incremental, once free_pfn 124 bool finished_update_free; /* True when the zone cached pfns are
126 and migrate_pfn meet, we restart 125 * no longer being updated
127 from the top of the zone; 126 */
128 remember we wrapped around. */ 127 bool finished_update_migrate;
129 128
130 int order; /* order a direct compactor needs */ 129 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 130 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone; 131 struct zone *zone;
133 bool *contended; /* True if a lock was contended */ 132 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 134};
135 135
136unsigned long 136unsigned long
137isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); 137isolate_freepages_range(struct compact_control *cc,
138 unsigned long start_pfn, unsigned long end_pfn);
138unsigned long 139unsigned long
139isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 140isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
140 unsigned long low_pfn, unsigned long end_pfn); 141 unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
141 142
142#endif 143#endif
143 144
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
167} 168}
168 169
169/* 170/*
170 * Called only in fault path via page_evictable() for a new page 171 * Called only in fault path, to determine if a new page is being
171 * to determine if it's being mapped into a LOCKED vma. 172 * mapped into a LOCKED vma. If it is, mark page as mlocked.
172 * If so, mark page as mlocked.
173 */ 173 */
174static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page) 175 struct page *page)
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
180 return 0; 180 return 0;
181 181
182 if (!TestSetPageMlocked(page)) { 182 if (!TestSetPageMlocked(page)) {
183 inc_zone_page_state(page, NR_MLOCK); 183 mod_zone_page_state(page_zone(page), NR_MLOCK,
184 hpage_nr_pages(page));
184 count_vm_event(UNEVICTABLE_PGMLOCKED); 185 count_vm_event(UNEVICTABLE_PGMLOCKED);
185 } 186 }
186 return 1; 187 return 1;
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page);
201 * If called for a page that is still mapped by mlocked vmas, all we do 202 * If called for a page that is still mapped by mlocked vmas, all we do
202 * is revert to lazy LRU behaviour -- semantics are not broken. 203 * is revert to lazy LRU behaviour -- semantics are not broken.
203 */ 204 */
204extern void __clear_page_mlock(struct page *page); 205extern void clear_page_mlock(struct page *page);
205static inline void clear_page_mlock(struct page *page)
206{
207 if (unlikely(TestClearPageMlocked(page)))
208 __clear_page_mlock(page);
209}
210 206
211/* 207/*
212 * mlock_migrate_page - called only from migrate_page_copy() to 208 * mlock_migrate_page - called only from migrate_page_copy() to
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
340#define ZONE_RECLAIM_FULL -1 336#define ZONE_RECLAIM_FULL -1
341#define ZONE_RECLAIM_SOME 0 337#define ZONE_RECLAIM_SOME 0
342#define ZONE_RECLAIM_SUCCESS 1 338#define ZONE_RECLAIM_SUCCESS 1
343#endif
344 339
345extern int hwpoison_filter(struct page *p); 340extern int hwpoison_filter(struct page *p);
346 341
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
356 unsigned long, unsigned long); 351 unsigned long, unsigned long);
357 352
358extern void set_pageblock_order(void); 353extern void set_pageblock_order(void);
354unsigned long reclaim_clean_pages_from_list(struct zone *zone,
355 struct list_head *page_list);
356/* The ALLOC_WMARK bits are used as an index to zone->watermark */
357#define ALLOC_WMARK_MIN WMARK_MIN
358#define ALLOC_WMARK_LOW WMARK_LOW
359#define ALLOC_WMARK_HIGH WMARK_HIGH
360#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
361
362/* Mask to get the watermark bits */
363#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
364
365#define ALLOC_HARDER 0x10 /* try to alloc harder */
366#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
367#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
368#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
369
370#endif /* __MM_INTERNAL_H */
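
The internal.h hunk above moves the ALLOC_* flags where compaction can also see them; the low bits double as an index into the zone watermarks, and ALLOC_WMARK_MASK strips off the behaviour-modifier bits. A small sketch of how the bits compose, assuming WMARK_MIN/LOW/HIGH are 0/1/2 as in the kernel's watermark enum:

#include <stdio.h>

#define WMARK_MIN           0
#define WMARK_LOW           1
#define WMARK_HIGH          2

#define ALLOC_WMARK_MIN     WMARK_MIN
#define ALLOC_WMARK_LOW     WMARK_LOW
#define ALLOC_WMARK_HIGH    WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS - 1)   /* 0x03 */
#define ALLOC_HARDER        0x10
#define ALLOC_HIGH          0x20
#define ALLOC_CPUSET        0x40
#define ALLOC_CMA           0x80

int main(void)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_CMA;

        /* the watermark index lives in the low two bits ... */
        printf("watermark index: %d\n", alloc_flags & ALLOC_WMARK_MASK);
        /* ... and the behaviour modifiers in the higher bits */
        printf("skip watermarks: %d\n", !!(alloc_flags & ALLOC_NO_WATERMARKS));
        printf("cpuset check   : %d\n", !!(alloc_flags & ALLOC_CPUSET));
        return 0;
}
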
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000000..4a5822a586e6
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
1/*
2 * mm/interval_tree.c - interval tree for mapping->i_mmap
3 *
4 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
5 *
6 * This file is released under the GPL v2.
7 */
8
9#include <linux/mm.h>
10#include <linux/fs.h>
11#include <linux/rmap.h>
12#include <linux/interval_tree_generic.h>
13
14static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
15{
16 return v->vm_pgoff;
17}
18
19static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
20{
21 return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
22}
23
24INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
25 unsigned long, shared.linear.rb_subtree_last,
26 vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
27
28/* Insert node immediately after prev in the interval tree */
29void vma_interval_tree_insert_after(struct vm_area_struct *node,
30 struct vm_area_struct *prev,
31 struct rb_root *root)
32{
33 struct rb_node **link;
34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node);
36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
38
39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev;
41 link = &prev->shared.linear.rb.rb_right;
42 } else {
43 parent = rb_entry(prev->shared.linear.rb.rb_right,
44 struct vm_area_struct, shared.linear.rb);
45 if (parent->shared.linear.rb_subtree_last < last)
46 parent->shared.linear.rb_subtree_last = last;
47 while (parent->shared.linear.rb.rb_left) {
48 parent = rb_entry(parent->shared.linear.rb.rb_left,
49 struct vm_area_struct, shared.linear.rb);
50 if (parent->shared.linear.rb_subtree_last < last)
51 parent->shared.linear.rb_subtree_last = last;
52 }
53 link = &parent->shared.linear.rb.rb_left;
54 }
55
56 node->shared.linear.rb_subtree_last = last;
57 rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
58 rb_insert_augmented(&node->shared.linear.rb, root,
59 &vma_interval_tree_augment);
60}
61
62static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
63{
64 return vma_start_pgoff(avc->vma);
65}
66
67static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
68{
69 return vma_last_pgoff(avc->vma);
70}
71
72INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
73 avc_start_pgoff, avc_last_pgoff,
74 static inline, __anon_vma_interval_tree)
75
76void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
77 struct rb_root *root)
78{
79#ifdef CONFIG_DEBUG_VM_RB
80 node->cached_vma_start = avc_start_pgoff(node);
81 node->cached_vma_last = avc_last_pgoff(node);
82#endif
83 __anon_vma_interval_tree_insert(node, root);
84}
85
86void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
87 struct rb_root *root)
88{
89 __anon_vma_interval_tree_remove(node, root);
90}
91
92struct anon_vma_chain *
93anon_vma_interval_tree_iter_first(struct rb_root *root,
94 unsigned long first, unsigned long last)
95{
96 return __anon_vma_interval_tree_iter_first(root, first, last);
97}
98
99struct anon_vma_chain *
100anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
101 unsigned long first, unsigned long last)
102{
103 return __anon_vma_interval_tree_iter_next(node, first, last);
104}
105
106#ifdef CONFIG_DEBUG_VM_RB
107void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
108{
109 WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
110 WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
111}
112#endif
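
The new interval tree keys each mapping by the closed page-offset range [vm_pgoff, vm_pgoff + npages - 1], as vma_start_pgoff()/vma_last_pgoff() above compute. A stand-alone model of those keys and of the overlap question the tree answers, with 4 KB pages assumed for the demo and no rbtree involved:

#include <stdio.h>

#define PAGE_SHIFT 12

struct demo_vma {
        unsigned long vm_start, vm_end;         /* byte addresses */
        unsigned long vm_pgoff;                 /* file offset in pages */
};

static unsigned long start_pgoff(const struct demo_vma *v)
{
        return v->vm_pgoff;
}

static unsigned long last_pgoff(const struct demo_vma *v)
{
        return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* the query vma_interval_tree_foreach() answers, minus the tree itself */
static int overlaps(const struct demo_vma *v,
                    unsigned long first, unsigned long last)
{
        return start_pgoff(v) <= last && last_pgoff(v) >= first;
}

int main(void)
{
        /* 16 pages of the file, mapped starting at file page 100 */
        struct demo_vma v = { 0x400000, 0x410000, 100 };

        printf("keys: [%lu, %lu]\n", start_pgoff(&v), last_pgoff(&v));
        printf("overlaps page 115? %d\n", overlaps(&v, 115, 115));
        printf("overlaps page 116? %d\n", overlaps(&v, 116, 116));
        return 0;
}
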
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf38..a217cc544060 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
29 * - kmemleak_lock (rwlock): protects the object_list modifications and 29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list 30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory 31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up 32 * blocks. The object_tree_root is a red black tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The 33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and 34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the 35 * object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
71#include <linux/delay.h> 71#include <linux/delay.h>
72#include <linux/export.h> 72#include <linux/export.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/rbtree.h>
75#include <linux/fs.h> 75#include <linux/fs.h>
76#include <linux/debugfs.h> 76#include <linux/debugfs.h>
77#include <linux/seq_file.h> 77#include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
132 * Structure holding the metadata for each allocated memory block. 132 * Structure holding the metadata for each allocated memory block.
133 * Modifications to such objects should be made while holding the 133 * Modifications to such objects should be made while holding the
134 * object->lock. Insertions or deletions from object_list, gray_list or 134 * object->lock. Insertions or deletions from object_list, gray_list or
135 * tree_node are already protected by the corresponding locks or mutex (see 135 * rb_node are already protected by the corresponding locks or mutex (see
136 * the notes on locking above). These objects are reference-counted 136 * the notes on locking above). These objects are reference-counted
137 * (use_count) and freed using the RCU mechanism. 137 * (use_count) and freed using the RCU mechanism.
138 */ 138 */
@@ -141,7 +141,7 @@ struct kmemleak_object {
141 unsigned long flags; /* object status flags */ 141 unsigned long flags; /* object status flags */
142 struct list_head object_list; 142 struct list_head object_list;
143 struct list_head gray_list; 143 struct list_head gray_list;
144 struct prio_tree_node tree_node; 144 struct rb_node rb_node;
145 struct rcu_head rcu; /* object_list lockless traversal */ 145 struct rcu_head rcu; /* object_list lockless traversal */
146 /* object usage count; object freed when use_count == 0 */ 146 /* object usage count; object freed when use_count == 0 */
147 atomic_t use_count; 147 atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
182static LIST_HEAD(object_list); 182static LIST_HEAD(object_list);
183/* the list of gray-colored objects (see color_gray comment below) */ 183/* the list of gray-colored objects (see color_gray comment below) */
184static LIST_HEAD(gray_list); 184static LIST_HEAD(gray_list);
185/* prio search tree for object boundaries */ 185/* search tree for object boundaries */
186static struct prio_tree_root object_tree_root; 186static struct rb_root object_tree_root = RB_ROOT;
187/* rw_lock protecting the access to object_list and prio_tree_root */ 187/* rw_lock protecting the access to object_list and object_tree_root */
188static DEFINE_RWLOCK(kmemleak_lock); 188static DEFINE_RWLOCK(kmemleak_lock);
189 189
190/* allocation caches for kmemleak internal data */ 190/* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
380 trace.entries = object->trace; 380 trace.entries = object->trace;
381 381
382 pr_notice("Object 0x%08lx (size %zu):\n", 382 pr_notice("Object 0x%08lx (size %zu):\n",
383 object->tree_node.start, object->size); 383 object->pointer, object->size);
384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", 384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
385 object->comm, object->pid, object->jiffies); 385 object->comm, object->pid, object->jiffies);
386 pr_notice(" min_count = %d\n", object->min_count); 386 pr_notice(" min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
392} 392}
393 393
394/* 394/*
395 * Look-up a memory block metadata (kmemleak_object) in the priority search 395 * Look-up a memory block metadata (kmemleak_object) in the object search
396 * tree based on a pointer value. If alias is 0, only values pointing to the 396 * tree based on a pointer value. If alias is 0, only values pointing to the
397 * beginning of the memory block are allowed. The kmemleak_lock must be held 397 * beginning of the memory block are allowed. The kmemleak_lock must be held
398 * when calling this function. 398 * when calling this function.
399 */ 399 */
400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) 400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
401{ 401{
402 struct prio_tree_node *node; 402 struct rb_node *rb = object_tree_root.rb_node;
403 struct prio_tree_iter iter; 403
404 struct kmemleak_object *object; 404 while (rb) {
405 405 struct kmemleak_object *object =
406 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); 406 rb_entry(rb, struct kmemleak_object, rb_node);
407 node = prio_tree_next(&iter); 407 if (ptr < object->pointer)
408 if (node) { 408 rb = object->rb_node.rb_left;
409 object = prio_tree_entry(node, struct kmemleak_object, 409 else if (object->pointer + object->size <= ptr)
410 tree_node); 410 rb = object->rb_node.rb_right;
411 if (!alias && object->pointer != ptr) { 411 else if (object->pointer == ptr || alias)
412 return object;
413 else {
412 kmemleak_warn("Found object by alias at 0x%08lx\n", 414 kmemleak_warn("Found object by alias at 0x%08lx\n",
413 ptr); 415 ptr);
414 dump_object_info(object); 416 dump_object_info(object);
415 object = NULL; 417 break;
416 } 418 }
417 } else 419 }
418 object = NULL; 420 return NULL;
419
420 return object;
421} 421}
422 422
423/* 423/*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
471} 471}
472 472
473/* 473/*
474 * Look up an object in the prio search tree and increase its use_count. 474 * Look up an object in the object search tree and increase its use_count.
475 */ 475 */
476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) 476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
477{ 477{
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
516 int min_count, gfp_t gfp) 516 int min_count, gfp_t gfp)
517{ 517{
518 unsigned long flags; 518 unsigned long flags;
519 struct kmemleak_object *object; 519 struct kmemleak_object *object, *parent;
520 struct prio_tree_node *node; 520 struct rb_node **link, *rb_parent;
521 521
522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
523 if (!object) { 523 if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
560 /* kernel backtrace */ 560 /* kernel backtrace */
561 object->trace_len = __save_stack_trace(object->trace); 561 object->trace_len = __save_stack_trace(object->trace);
562 562
563 INIT_PRIO_TREE_NODE(&object->tree_node);
564 object->tree_node.start = ptr;
565 object->tree_node.last = ptr + size - 1;
566
567 write_lock_irqsave(&kmemleak_lock, flags); 563 write_lock_irqsave(&kmemleak_lock, flags);
568 564
569 min_addr = min(min_addr, ptr); 565 min_addr = min(min_addr, ptr);
570 max_addr = max(max_addr, ptr + size); 566 max_addr = max(max_addr, ptr + size);
571 node = prio_tree_insert(&object_tree_root, &object->tree_node); 567 link = &object_tree_root.rb_node;
572 /* 568 rb_parent = NULL;
573 * The code calling the kernel does not yet have the pointer to the 569 while (*link) {
574 * memory block to be able to free it. However, we still hold the 570 rb_parent = *link;
575 * kmemleak_lock here in case parts of the kernel started freeing 571 parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
576 * random memory blocks. 572 if (ptr + size <= parent->pointer)
577 */ 573 link = &parent->rb_node.rb_left;
578 if (node != &object->tree_node) { 574 else if (parent->pointer + parent->size <= ptr)
579 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 575 link = &parent->rb_node.rb_right;
580 "(already existing)\n", ptr); 576 else {
581 object = lookup_object(ptr, 1); 577 kmemleak_stop("Cannot insert 0x%lx into the object "
582 spin_lock(&object->lock); 578 "search tree (overlaps existing)\n",
583 dump_object_info(object); 579 ptr);
584 spin_unlock(&object->lock); 580 kmem_cache_free(object_cache, object);
585 581 object = parent;
586 goto out; 582 spin_lock(&object->lock);
583 dump_object_info(object);
584 spin_unlock(&object->lock);
585 goto out;
586 }
587 } 587 }
588 rb_link_node(&object->rb_node, rb_parent, link);
589 rb_insert_color(&object->rb_node, &object_tree_root);
590
588 list_add_tail_rcu(&object->object_list, &object_list); 591 list_add_tail_rcu(&object->object_list, &object_list);
589out: 592out:
590 write_unlock_irqrestore(&kmemleak_lock, flags); 593 write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
600 unsigned long flags; 603 unsigned long flags;
601 604
602 write_lock_irqsave(&kmemleak_lock, flags); 605 write_lock_irqsave(&kmemleak_lock, flags);
603 prio_tree_remove(&object_tree_root, &object->tree_node); 606 rb_erase(&object->rb_node, &object_tree_root);
604 list_del_rcu(&object->object_list); 607 list_del_rcu(&object->object_list);
605 write_unlock_irqrestore(&kmemleak_lock, flags); 608 write_unlock_irqrestore(&kmemleak_lock, flags);
606 609
@@ -1483,13 +1486,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1483{ 1486{
1484 struct kmemleak_object *prev_obj = v; 1487 struct kmemleak_object *prev_obj = v;
1485 struct kmemleak_object *next_obj = NULL; 1488 struct kmemleak_object *next_obj = NULL;
1486 struct list_head *n = &prev_obj->object_list; 1489 struct kmemleak_object *obj = prev_obj;
1487 1490
1488 ++(*pos); 1491 ++(*pos);
1489 1492
1490 list_for_each_continue_rcu(n, &object_list) { 1493 list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
1491 struct kmemleak_object *obj =
1492 list_entry(n, struct kmemleak_object, object_list);
1493 if (get_object(obj)) { 1494 if (get_object(obj)) {
1494 next_obj = obj; 1495 next_obj = obj;
1495 break; 1496 break;
@@ -1768,7 +1769,6 @@ void __init kmemleak_init(void)
1768 1769
1769 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1770 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1770 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1771 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1771 INIT_PRIO_TREE_ROOT(&object_tree_root);
1772 1772
1773 if (crt_early_log >= ARRAY_SIZE(early_log)) 1773 if (crt_early_log >= ARRAY_SIZE(early_log))
1774 pr_warning("Early log buffer exceeded (%d), please increase " 1774 pr_warning("Early log buffer exceeded (%d), please increase "
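
With the prio tree gone, kmemleak's lookup_object() is a plain binary-search-tree descent over non-overlapping [pointer, pointer + size) ranges: go left when the address is below a node's start, right when it is at or past its end, otherwise that node covers it. A minimal userspace sketch of the same descent, using an invented range_node type in place of struct rb_node / struct kmemleak_object:

#include <stddef.h>
#include <stdio.h>

/* Invented stand-in for a kmemleak_object: a node covering
 * [pointer, pointer + size) in a plain binary search tree. */
struct range_node {
        unsigned long pointer;
        size_t size;
        struct range_node *left, *right;
};

/* Same descent as the new lookup_object(): ranges never overlap, so at
 * each node ptr is either below the range (go left), at or past its end
 * (go right), or inside it (found). */
static struct range_node *lookup(struct range_node *root, unsigned long ptr)
{
        while (root) {
                if (ptr < root->pointer)
                        root = root->left;
                else if (root->pointer + root->size <= ptr)
                        root = root->right;
                else
                        return root;
        }
        return NULL;
}

int main(void)
{
        struct range_node a = { 0x1000, 0x100, NULL, NULL };
        struct range_node c = { 0x3000, 0x200, NULL, NULL };
        struct range_node b = { 0x2000, 0x080, &a, &c };        /* root */
        struct range_node *hit = lookup(&b, 0x3080);

        if (hit)
                printf("0x3080 falls inside [0x%lx, 0x%lx)\n",
                       hit->pointer, hit->pointer + hit->size);
        return 0;
}
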
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c885368890..ae539f0b8aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
709 spinlock_t *ptl; 709 spinlock_t *ptl;
710 int swapped; 710 int swapped;
711 int err = -EFAULT; 711 int err = -EFAULT;
712 unsigned long mmun_start; /* For mmu_notifiers */
713 unsigned long mmun_end; /* For mmu_notifiers */
712 714
713 addr = page_address_in_vma(page, vma); 715 addr = page_address_in_vma(page, vma);
714 if (addr == -EFAULT) 716 if (addr == -EFAULT)
715 goto out; 717 goto out;
716 718
717 BUG_ON(PageTransCompound(page)); 719 BUG_ON(PageTransCompound(page));
720
721 mmun_start = addr;
722 mmun_end = addr + PAGE_SIZE;
723 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
724
718 ptep = page_check_address(page, mm, addr, &ptl, 0); 725 ptep = page_check_address(page, mm, addr, &ptl, 0);
719 if (!ptep) 726 if (!ptep)
720 goto out; 727 goto out_mn;
721 728
722 if (pte_write(*ptep) || pte_dirty(*ptep)) { 729 if (pte_write(*ptep) || pte_dirty(*ptep)) {
723 pte_t entry; 730 pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
752 759
753out_unlock: 760out_unlock:
754 pte_unmap_unlock(ptep, ptl); 761 pte_unmap_unlock(ptep, ptl);
762out_mn:
763 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
755out: 764out:
756 return err; 765 return err;
757} 766}
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
776 spinlock_t *ptl; 785 spinlock_t *ptl;
777 unsigned long addr; 786 unsigned long addr;
778 int err = -EFAULT; 787 int err = -EFAULT;
788 unsigned long mmun_start; /* For mmu_notifiers */
789 unsigned long mmun_end; /* For mmu_notifiers */
779 790
780 addr = page_address_in_vma(page, vma); 791 addr = page_address_in_vma(page, vma);
781 if (addr == -EFAULT) 792 if (addr == -EFAULT)
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
794 if (!pmd_present(*pmd)) 805 if (!pmd_present(*pmd))
795 goto out; 806 goto out;
796 807
808 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE;
810 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
811
797 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 812 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
798 if (!pte_same(*ptep, orig_pte)) { 813 if (!pte_same(*ptep, orig_pte)) {
799 pte_unmap_unlock(ptep, ptl); 814 pte_unmap_unlock(ptep, ptl);
800 goto out; 815 goto out_mn;
801 } 816 }
802 817
803 get_page(kpage); 818 get_page(kpage);
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
814 829
815 pte_unmap_unlock(ptep, ptl); 830 pte_unmap_unlock(ptep, ptl);
816 err = 0; 831 err = 0;
832out_mn:
833 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
817out: 834out:
818 return err; 835 return err;
819} 836}
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1469 */ 1486 */
1470 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1487 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1471 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1488 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1472 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1489 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
1473 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1474 return 0; /* just ignore the advice */ 1490 return 0; /* just ignore the advice */
1475 1491
1492#ifdef VM_SAO
1493 if (*vm_flags & VM_SAO)
1494 return 0;
1495#endif
1496
1476 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1497 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1477 err = __ksm_enter(mm); 1498 err = __ksm_enter(mm);
1478 if (err) 1499 if (err)
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
1582 SetPageSwapBacked(new_page); 1603 SetPageSwapBacked(new_page);
1583 __set_page_locked(new_page); 1604 __set_page_locked(new_page);
1584 1605
1585 if (page_evictable(new_page, vma)) 1606 if (!mlocked_vma_newpage(vma, new_page))
1586 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); 1607 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1587 else 1608 else
1588 add_page_to_unevictable_list(new_page); 1609 add_page_to_unevictable_list(new_page);
@@ -1614,7 +1635,8 @@ again:
1614 struct vm_area_struct *vma; 1635 struct vm_area_struct *vma;
1615 1636
1616 anon_vma_lock(anon_vma); 1637 anon_vma_lock(anon_vma);
1617 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) {
1618 vma = vmac->vma; 1640 vma = vmac->vma;
1619 if (rmap_item->address < vma->vm_start || 1641 if (rmap_item->address < vma->vm_start ||
1620 rmap_item->address >= vma->vm_end) 1642 rmap_item->address >= vma->vm_end)
@@ -1667,7 +1689,8 @@ again:
1667 struct vm_area_struct *vma; 1689 struct vm_area_struct *vma;
1668 1690
1669 anon_vma_lock(anon_vma); 1691 anon_vma_lock(anon_vma);
1670 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) {
1671 vma = vmac->vma; 1694 vma = vmac->vma;
1672 if (rmap_item->address < vma->vm_start || 1695 if (rmap_item->address < vma->vm_start ||
1673 rmap_item->address >= vma->vm_end) 1696 rmap_item->address >= vma->vm_end)
@@ -1719,7 +1742,8 @@ again:
1719 struct vm_area_struct *vma; 1742 struct vm_area_struct *vma;
1720 1743
1721 anon_vma_lock(anon_vma); 1744 anon_vma_lock(anon_vma);
1722 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) {
1723 vma = vmac->vma; 1747 vma = vmac->vma;
1724 if (rmap_item->address < vma->vm_start || 1748 if (rmap_item->address < vma->vm_start ||
1725 rmap_item->address >= vma->vm_end) 1749 rmap_item->address >= vma->vm_end)
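
The ksm hunks above bracket the PTE updates in write_protect_page() and replace_page() with an mmu_notifier invalidate_range_start/_end pair, and reroute the early-exit paths through out_mn so the end call is never skipped. A small standalone sketch of that bracketing discipline, with stub functions standing in for the real mmu_notifier calls and an invented update_pte() helper:

#include <stdio.h>

/* Stubs standing in for mmu_notifier_invalidate_range_{start,end}();
 * the real calls also take the mm_struct being invalidated. */
static void range_start(unsigned long start, unsigned long end)
{
        printf("invalidate start [%#lx, %#lx)\n", start, end);
}

static void range_end(unsigned long start, unsigned long end)
{
        printf("invalidate end   [%#lx, %#lx)\n", start, end);
}

/* The pattern the patch introduces: once range_start() has run, every
 * exit path - including the failure path - must reach range_end(). */
static int update_pte(unsigned long addr, int pte_present)
{
        int err = -1;
        unsigned long mmun_start = addr;
        unsigned long mmun_end = addr + 4096;   /* one PAGE_SIZE range */

        range_start(mmun_start, mmun_end);

        if (!pte_present)
                goto out_mn;    /* mirrors "goto out_mn" instead of "goto out" */

        /* ... the PTE would be modified under its lock here ... */
        err = 0;

out_mn:
        range_end(mmun_start, mmun_end);
        return err;
}

int main(void)
{
        update_pte(0x7f0000000000UL, 1);
        update_pte(0x7f0000001000UL, 0);        /* failure path still ends the range */
        return 0;
}
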
diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d17..03dfa5c7adb3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
69 new_flags &= ~VM_DONTCOPY; 69 new_flags &= ~VM_DONTCOPY;
70 break; 70 break;
71 case MADV_DONTDUMP: 71 case MADV_DONTDUMP:
72 new_flags |= VM_NODUMP; 72 new_flags |= VM_DONTDUMP;
73 break; 73 break;
74 case MADV_DODUMP: 74 case MADV_DODUMP:
75 new_flags &= ~VM_NODUMP; 75 if (new_flags & VM_SPECIAL) {
76 error = -EINVAL;
77 goto out;
78 }
79 new_flags &= ~VM_DONTDUMP;
76 break; 80 break;
77 case MADV_MERGEABLE: 81 case MADV_MERGEABLE:
78 case MADV_UNMERGEABLE: 82 case MADV_UNMERGEABLE:
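
The madvise change renames VM_NODUMP to VM_DONTDUMP and makes MADV_DODUMP fail on special mappings instead of silently clearing the bit. A toy sketch of that set/clear-with-guard logic, using invented flag values rather than the real VM_* definitions:

#include <stdio.h>

/* Invented flag values for the illustration; the real VM_* flags live in
 * include/linux/mm.h and VM_SPECIAL is a mask of several of them. */
#define X_DONTDUMP      0x1UL
#define X_SPECIAL       0x2UL

/* Mirrors the new MADV_DONTDUMP/MADV_DODUMP handling: DONTDUMP just sets
 * the bit, DODUMP clears it but is refused on "special" vmas. */
static int set_dump_policy(unsigned long *flags, int dodump)
{
        if (!dodump) {
                *flags |= X_DONTDUMP;
                return 0;
        }
        if (*flags & X_SPECIAL)
                return -1;              /* -EINVAL in the kernel */
        *flags &= ~X_DONTDUMP;
        return 0;
}

int main(void)
{
        unsigned long normal = 0, special = X_SPECIAL | X_DONTDUMP;

        set_dump_policy(&normal, 0);
        printf("normal vma flags:  %#lx\n", normal);
        printf("dodump on special: %d\n", set_dump_policy(&special, 1));
        return 0;
}
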
diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7a..625905523c2a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0; 41static int memblock_reserved_in_slab __initdata_memblock = 0;
42 42
43/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
44static inline const char *memblock_type_name(struct memblock_type *type) 44static __init_memblock const char *
45memblock_type_name(struct memblock_type *type)
45{ 46{
46 if (type == &memblock.memory) 47 if (type == &memblock.memory)
47 return "memory"; 48 return "memory";
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
756 return ret; 757 return ret;
757 758
758 for (i = start_rgn; i < end_rgn; i++) 759 for (i = start_rgn; i < end_rgn; i++)
759 type->regions[i].nid = nid; 760 memblock_set_region_node(&type->regions[i], nid);
760 761
761 memblock_merge_regions(type); 762 memblock_merge_regions(type);
762 return 0; 763 return 0;
@@ -929,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
929 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 930 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
930} 931}
931 932
933void __init_memblock memblock_trim_memory(phys_addr_t align)
934{
935 int i;
936 phys_addr_t start, end, orig_start, orig_end;
937 struct memblock_type *mem = &memblock.memory;
938
939 for (i = 0; i < mem->cnt; i++) {
940 orig_start = mem->regions[i].base;
941 orig_end = mem->regions[i].base + mem->regions[i].size;
942 start = round_up(orig_start, align);
943 end = round_down(orig_end, align);
944
945 if (start == orig_start && end == orig_end)
946 continue;
947
948 if (start < end) {
949 mem->regions[i].base = start;
950 mem->regions[i].size = end - start;
951 } else {
952 memblock_remove_region(mem, i);
953 i--;
954 }
955 }
956}
932 957
933void __init_memblock memblock_set_current_limit(phys_addr_t limit) 958void __init_memblock memblock_set_current_limit(phys_addr_t limit)
934{ 959{
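
memblock_trim_memory() shrinks every memory region to the requested alignment: the base is rounded up, the end rounded down, and regions that collapse to nothing are removed. The same trimming over a plain array, with local round_up/round_down helpers standing in for the kernel macros (alignment assumed to be a power of two):

#include <stdio.h>

struct region {
        unsigned long base;
        unsigned long size;
};

static unsigned long round_up_to(unsigned long x, unsigned long a)
{
        return (x + a - 1) & ~(a - 1);  /* a must be a power of two */
}

static unsigned long round_down_to(unsigned long x, unsigned long a)
{
        return x & ~(a - 1);
}

/* Same shape as memblock_trim_memory(): shrink each region to aligned
 * boundaries, drop it (by compacting the array) if nothing is left. */
static int trim(struct region *r, int cnt, unsigned long align)
{
        int i, j = 0;

        for (i = 0; i < cnt; i++) {
                unsigned long start = round_up_to(r[i].base, align);
                unsigned long end = round_down_to(r[i].base + r[i].size, align);

                if (start < end) {
                        r[j].base = start;
                        r[j].size = end - start;
                        j++;
                }
                /* else: region smaller than one aligned chunk, dropped */
        }
        return j;       /* new region count */
}

int main(void)
{
        struct region r[] = { { 0x0fff, 0x3000 }, { 0x8100, 0x0800 } };
        int i, cnt = trim(r, 2, 0x1000);

        for (i = 0; i < cnt; i++)
                printf("region %d: [%#lx, %#lx)\n", i, r[i].base,
                       r[i].base + r[i].size);
        return 0;
}
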
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 795e525afaba..7acf43bf04a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
51#include <linux/oom.h> 51#include <linux/oom.h>
52#include "internal.h" 52#include "internal.h"
53#include <net/sock.h> 53#include <net/sock.h>
54#include <net/ip.h>
54#include <net/tcp_memcontrol.h> 55#include <net/tcp_memcontrol.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -326,7 +327,7 @@ struct mem_cgroup {
326 struct mem_cgroup_stat_cpu nocpu_base; 327 struct mem_cgroup_stat_cpu nocpu_base;
327 spinlock_t pcp_counter_lock; 328 spinlock_t pcp_counter_lock;
328 329
329#ifdef CONFIG_INET 330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
330 struct tcp_memcontrol tcp_mem; 331 struct tcp_memcontrol tcp_mem;
331#endif 332#endif
332}; 333};
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
411 return container_of(s, struct mem_cgroup, css); 412 return container_of(s, struct mem_cgroup, css);
412} 413}
413 414
415static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
416{
417 return (memcg == root_mem_cgroup);
418}
419
414/* Writing them here to avoid exposing memcg's inner layout */ 420/* Writing them here to avoid exposing memcg's inner layout */
415#ifdef CONFIG_MEMCG_KMEM 421#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
416#include <net/sock.h>
417#include <net/ip.h>
418 422
419static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
420void sock_update_memcg(struct sock *sk) 423void sock_update_memcg(struct sock *sk)
421{ 424{
422 if (mem_cgroup_sockets_enabled) { 425 if (mem_cgroup_sockets_enabled) {
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk)
461 } 464 }
462} 465}
463 466
464#ifdef CONFIG_INET
465struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 467struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
466{ 468{
467 if (!memcg || mem_cgroup_is_root(memcg)) 469 if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
470 return &memcg->tcp_mem.cg_proto; 472 return &memcg->tcp_mem.cg_proto;
471} 473}
472EXPORT_SYMBOL(tcp_proto_cgroup); 474EXPORT_SYMBOL(tcp_proto_cgroup);
473#endif /* CONFIG_INET */
474#endif /* CONFIG_MEMCG_KMEM */
475 475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg) 476static void disarm_sock_keys(struct mem_cgroup *memcg)
478{ 477{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 478 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1016 iter != NULL; \ 1015 iter != NULL; \
1017 iter = mem_cgroup_iter(NULL, iter, NULL)) 1016 iter = mem_cgroup_iter(NULL, iter, NULL))
1018 1017
1019static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
1020{
1021 return (memcg == root_mem_cgroup);
1022}
1023
1024void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1025{ 1019{
1026 struct mem_cgroup *memcg; 1020 struct mem_cgroup *memcg;
@@ -4973,6 +4967,13 @@ mem_cgroup_create(struct cgroup *cont)
4973 } else { 4967 } else {
4974 res_counter_init(&memcg->res, NULL); 4968 res_counter_init(&memcg->res, NULL);
4975 res_counter_init(&memcg->memsw, NULL); 4969 res_counter_init(&memcg->memsw, NULL);
4970 /*
 4971 * Deeper hierarchy with use_hierarchy == false doesn't make
4972 * much sense so let cgroup subsystem know about this
4973 * unfortunate state in our controller.
4974 */
4975 if (parent && parent != root_mem_cgroup)
4976 mem_cgroup_subsys.broken_hierarchy = true;
4976 } 4977 }
4977 memcg->last_scanned_node = MAX_NUMNODES; 4978 memcg->last_scanned_node = MAX_NUMNODES;
4978 INIT_LIST_HEAD(&memcg->oom_notify); 4979 INIT_LIST_HEAD(&memcg->oom_notify);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a6610..6c5899b9034a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
400 struct vm_area_struct *vma; 400 struct vm_area_struct *vma;
401 struct task_struct *tsk; 401 struct task_struct *tsk;
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff;
403 404
404 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma(page);
405 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
406 return; 407 return;
407 408
409 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
408 read_lock(&tasklist_lock); 410 read_lock(&tasklist_lock);
409 for_each_process (tsk) { 411 for_each_process (tsk) {
410 struct anon_vma_chain *vmac; 412 struct anon_vma_chain *vmac;
411 413
412 if (!task_early_kill(tsk)) 414 if (!task_early_kill(tsk))
413 continue; 415 continue;
414 list_for_each_entry(vmac, &av->head, same_anon_vma) { 416 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
417 pgoff, pgoff) {
415 vma = vmac->vma; 418 vma = vmac->vma;
416 if (!page_mapped_in_vma(page, vma)) 419 if (!page_mapped_in_vma(page, vma))
417 continue; 420 continue;
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
431{ 434{
432 struct vm_area_struct *vma; 435 struct vm_area_struct *vma;
433 struct task_struct *tsk; 436 struct task_struct *tsk;
434 struct prio_tree_iter iter;
435 struct address_space *mapping = page->mapping; 437 struct address_space *mapping = page->mapping;
436 438
437 mutex_lock(&mapping->i_mmap_mutex); 439 mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
442 if (!task_early_kill(tsk)) 444 if (!task_early_kill(tsk))
443 continue; 445 continue;
444 446
445 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, 447 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
446 pgoff) { 448 pgoff) {
447 /* 449 /*
448 * Send early kill signal to tasks where a vma covers 450 * Send early kill signal to tasks where a vma covers
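
collect_procs_anon() now derives the page's offset (pgoff) and walks only the anon_vma interval-tree entries whose [vm_pgoff, vm_pgoff + pages) range contains it, rather than every vma on the old same_anon_vma list. The containment test behind that query, shown over a plain array with an invented mini_vma type:

#include <stdio.h>

#define PAGE_SHIFT      12

/* Invented mini-vma carrying just the fields the test needs. */
struct mini_vma {
        unsigned long vm_start, vm_end; /* virtual addresses */
        unsigned long vm_pgoff;         /* page offset of vm_start */
};

/* A vma maps page offset pgoff iff pgoff lies in
 * [vm_pgoff, vm_pgoff + number_of_pages - 1]; this is the interval that
 * an anon_vma_interval_tree_foreach(..., pgoff, pgoff) query matches. */
static int vma_covers_pgoff(const struct mini_vma *vma, unsigned long pgoff)
{
        unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

        return pgoff >= vma->vm_pgoff && pgoff < vma->vm_pgoff + pages;
}

int main(void)
{
        struct mini_vma vmas[] = {
                { 0x400000, 0x404000, 0 },      /* pgoffs 0..3 */
                { 0x600000, 0x602000, 16 },     /* pgoffs 16..17 */
        };
        unsigned long pgoff = 17;
        size_t i;

        for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
                printf("vma %zu covers pgoff %lu: %d\n", i, pgoff,
                       vma_covers_pgoff(&vmas[i], pgoff));
        return 0;
}
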
diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a5..fb135ba4aba9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
712 add_taint(TAINT_BAD_PAGE); 712 add_taint(TAINT_BAD_PAGE);
713} 713}
714 714
715static inline int is_cow_mapping(vm_flags_t flags) 715static inline bool is_cow_mapping(vm_flags_t flags)
716{ 716{
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 718}
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1039 unsigned long next; 1039 unsigned long next;
1040 unsigned long addr = vma->vm_start; 1040 unsigned long addr = vma->vm_start;
1041 unsigned long end = vma->vm_end; 1041 unsigned long end = vma->vm_end;
1042 unsigned long mmun_start; /* For mmu_notifiers */
1043 unsigned long mmun_end; /* For mmu_notifiers */
1044 bool is_cow;
1042 int ret; 1045 int ret;
1043 1046
1044 /* 1047 /*
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1047 * readonly mappings. The tradeoff is that copy_page_range is more 1050 * readonly mappings. The tradeoff is that copy_page_range is more
1048 * efficient than faulting. 1051 * efficient than faulting.
1049 */ 1052 */
1050 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 1053 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1054 VM_PFNMAP | VM_MIXEDMAP))) {
1051 if (!vma->anon_vma) 1055 if (!vma->anon_vma)
1052 return 0; 1056 return 0;
1053 } 1057 }
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1055 if (is_vm_hugetlb_page(vma)) 1059 if (is_vm_hugetlb_page(vma))
1056 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1060 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1057 1061
1058 if (unlikely(is_pfn_mapping(vma))) { 1062 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1059 /* 1063 /*
1060 * We do not free on error cases below as remove_vma 1064 * We do not free on error cases below as remove_vma
1061 * gets called on error from higher level routine 1065 * gets called on error from higher level routine
1062 */ 1066 */
1063 ret = track_pfn_vma_copy(vma); 1067 ret = track_pfn_copy(vma);
1064 if (ret) 1068 if (ret)
1065 return ret; 1069 return ret;
1066 } 1070 }
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1071 * parent mm. And a permission downgrade will only happen if 1075 * parent mm. And a permission downgrade will only happen if
1072 * is_cow_mapping() returns true. 1076 * is_cow_mapping() returns true.
1073 */ 1077 */
1074 if (is_cow_mapping(vma->vm_flags)) 1078 is_cow = is_cow_mapping(vma->vm_flags);
1075 mmu_notifier_invalidate_range_start(src_mm, addr, end); 1079 mmun_start = addr;
1080 mmun_end = end;
1081 if (is_cow)
1082 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1083 mmun_end);
1076 1084
1077 ret = 0; 1085 ret = 0;
1078 dst_pgd = pgd_offset(dst_mm, addr); 1086 dst_pgd = pgd_offset(dst_mm, addr);
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1088 } 1096 }
1089 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1097 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1090 1098
1091 if (is_cow_mapping(vma->vm_flags)) 1099 if (is_cow)
1092 mmu_notifier_invalidate_range_end(src_mm, 1100 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1093 vma->vm_start, end);
1094 return ret; 1101 return ret;
1095} 1102}
1096 1103
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1327 if (vma->vm_file) 1334 if (vma->vm_file)
1328 uprobe_munmap(vma, start, end); 1335 uprobe_munmap(vma, start, end);
1329 1336
1330 if (unlikely(is_pfn_mapping(vma))) 1337 if (unlikely(vma->vm_flags & VM_PFNMAP))
1331 untrack_pfn_vma(vma, 0, 0); 1338 untrack_pfn(vma, 0, 0);
1332 1339
1333 if (start != end) { 1340 if (start != end) {
1334 if (unlikely(is_vm_hugetlb_page(vma))) { 1341 if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1521 spin_unlock(&mm->page_table_lock); 1528 spin_unlock(&mm->page_table_lock);
1522 wait_split_huge_page(vma->anon_vma, pmd); 1529 wait_split_huge_page(vma->anon_vma, pmd);
1523 } else { 1530 } else {
1524 page = follow_trans_huge_pmd(mm, address, 1531 page = follow_trans_huge_pmd(vma, address,
1525 pmd, flags); 1532 pmd, flags);
1526 spin_unlock(&mm->page_table_lock); 1533 spin_unlock(&mm->page_table_lock);
1527 goto out; 1534 goto out;
@@ -1576,12 +1583,12 @@ split_fallthrough:
1576 if (page->mapping && trylock_page(page)) { 1583 if (page->mapping && trylock_page(page)) {
1577 lru_add_drain(); /* push cached pages to LRU */ 1584 lru_add_drain(); /* push cached pages to LRU */
1578 /* 1585 /*
1579 * Because we lock page here and migration is 1586 * Because we lock page here, and migration is
1580 * blocked by the pte's page reference, we need 1587 * blocked by the pte's page reference, and we
1581 * only check for file-cache page truncation. 1588 * know the page is still mapped, we don't even
1589 * need to check for file-cache page truncation.
1582 */ 1590 */
1583 if (page->mapping) 1591 mlock_vma_page(page);
1584 mlock_vma_page(page);
1585 unlock_page(page); 1592 unlock_page(page);
1586 } 1593 }
1587 } 1594 }
@@ -2085,6 +2092,11 @@ out:
2085 * ask for a shared writable mapping! 2092 * ask for a shared writable mapping!
2086 * 2093 *
2087 * The page does not need to be reserved. 2094 * The page does not need to be reserved.
2095 *
2096 * Usually this function is called from f_op->mmap() handler
2097 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
2098 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2099 * function from other places, for example from page-fault handler.
2088 */ 2100 */
2089int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 2101int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2090 struct page *page) 2102 struct page *page)
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2093 return -EFAULT; 2105 return -EFAULT;
2094 if (!page_count(page)) 2106 if (!page_count(page))
2095 return -EINVAL; 2107 return -EINVAL;
2096 vma->vm_flags |= VM_INSERTPAGE; 2108 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2109 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2110 BUG_ON(vma->vm_flags & VM_PFNMAP);
2111 vma->vm_flags |= VM_MIXEDMAP;
2112 }
2097 return insert_page(vma, addr, page, vma->vm_page_prot); 2113 return insert_page(vma, addr, page, vma->vm_page_prot);
2098} 2114}
2099EXPORT_SYMBOL(vm_insert_page); 2115EXPORT_SYMBOL(vm_insert_page);
@@ -2132,7 +2148,7 @@ out:
2132 * @addr: target user address of this page 2148 * @addr: target user address of this page
2133 * @pfn: source kernel pfn 2149 * @pfn: source kernel pfn
2134 * 2150 *
2135 * Similar to vm_inert_page, this allows drivers to insert individual pages 2151 * Similar to vm_insert_page, this allows drivers to insert individual pages
2136 * they've allocated into a user vma. Same comments apply. 2152 * they've allocated into a user vma. Same comments apply.
2137 * 2153 *
2138 * This function should only be called from a vm_ops->fault handler, and 2154 * This function should only be called from a vm_ops->fault handler, and
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2162 2178
2163 if (addr < vma->vm_start || addr >= vma->vm_end) 2179 if (addr < vma->vm_start || addr >= vma->vm_end)
2164 return -EFAULT; 2180 return -EFAULT;
2165 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) 2181 if (track_pfn_insert(vma, &pgprot, pfn))
2166 return -EINVAL; 2182 return -EINVAL;
2167 2183
2168 ret = insert_pfn(vma, addr, pfn, pgprot); 2184 ret = insert_pfn(vma, addr, pfn, pgprot);
2169 2185
2170 if (ret)
2171 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2172
2173 return ret; 2186 return ret;
2174} 2187}
2175EXPORT_SYMBOL(vm_insert_pfn); 2188EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2290 * rest of the world about it: 2303 * rest of the world about it:
2291 * VM_IO tells people not to look at these pages 2304 * VM_IO tells people not to look at these pages
2292 * (accesses can have side effects). 2305 * (accesses can have side effects).
2293 * VM_RESERVED is specified all over the place, because
2294 * in 2.4 it kept swapout's vma scan off this vma; but
2295 * in 2.6 the LRU scan won't even find its pages, so this
2296 * flag means no more than count its pages in reserved_vm,
2297 * and omit it from core dump, even when VM_IO turned off.
2298 * VM_PFNMAP tells the core MM that the base pages are just 2306 * VM_PFNMAP tells the core MM that the base pages are just
2299 * raw PFN mappings, and do not have a "struct page" associated 2307 * raw PFN mappings, and do not have a "struct page" associated
2300 * with them. 2308 * with them.
2309 * VM_DONTEXPAND
2310 * Disable vma merging and expanding with mremap().
2311 * VM_DONTDUMP
2312 * Omit vma from core dump, even when VM_IO turned off.
2301 * 2313 *
2302 * There's a horrible special case to handle copy-on-write 2314 * There's a horrible special case to handle copy-on-write
2303 * behaviour that some programs depend on. We mark the "original" 2315 * behaviour that some programs depend on. We mark the "original"
2304 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2316 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2317 * See vm_normal_page() for details.
2305 */ 2318 */
2306 if (addr == vma->vm_start && end == vma->vm_end) { 2319 if (is_cow_mapping(vma->vm_flags)) {
2320 if (addr != vma->vm_start || end != vma->vm_end)
2321 return -EINVAL;
2307 vma->vm_pgoff = pfn; 2322 vma->vm_pgoff = pfn;
2308 vma->vm_flags |= VM_PFN_AT_MMAP; 2323 }
2309 } else if (is_cow_mapping(vma->vm_flags))
2310 return -EINVAL;
2311
2312 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2313 2324
2314 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); 2325 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2315 if (err) { 2326 if (err)
2316 /*
2317 * To indicate that track_pfn related cleanup is not
2318 * needed from higher level routine calling unmap_vmas
2319 */
2320 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2321 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2322 return -EINVAL; 2327 return -EINVAL;
2323 } 2328
2329 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2324 2330
2325 BUG_ON(addr >= end); 2331 BUG_ON(addr >= end);
2326 pfn -= addr >> PAGE_SHIFT; 2332 pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2335 } while (pgd++, addr = next, addr != end); 2341 } while (pgd++, addr = next, addr != end);
2336 2342
2337 if (err) 2343 if (err)
2338 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); 2344 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2339 2345
2340 return err; 2346 return err;
2341} 2347}
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2516 spinlock_t *ptl, pte_t orig_pte) 2522 spinlock_t *ptl, pte_t orig_pte)
2517 __releases(ptl) 2523 __releases(ptl)
2518{ 2524{
2519 struct page *old_page, *new_page; 2525 struct page *old_page, *new_page = NULL;
2520 pte_t entry; 2526 pte_t entry;
2521 int ret = 0; 2527 int ret = 0;
2522 int page_mkwrite = 0; 2528 int page_mkwrite = 0;
2523 struct page *dirty_page = NULL; 2529 struct page *dirty_page = NULL;
2530 unsigned long mmun_start; /* For mmu_notifiers */
2531 unsigned long mmun_end; /* For mmu_notifiers */
2532 bool mmun_called = false; /* For mmu_notifiers */
2524 2533
2525 old_page = vm_normal_page(vma, address, orig_pte); 2534 old_page = vm_normal_page(vma, address, orig_pte);
2526 if (!old_page) { 2535 if (!old_page) {
@@ -2698,6 +2707,11 @@ gotten:
2698 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2707 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2699 goto oom_free_new; 2708 goto oom_free_new;
2700 2709
2710 mmun_start = address & PAGE_MASK;
2711 mmun_end = (address & PAGE_MASK) + PAGE_SIZE;
2712 mmun_called = true;
2713 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2714
2701 /* 2715 /*
2702 * Re-check the pte - we dropped the lock 2716 * Re-check the pte - we dropped the lock
2703 */ 2717 */
@@ -2764,6 +2778,8 @@ gotten:
2764 page_cache_release(new_page); 2778 page_cache_release(new_page);
2765unlock: 2779unlock:
2766 pte_unmap_unlock(page_table, ptl); 2780 pte_unmap_unlock(page_table, ptl);
2781 if (mmun_called)
2782 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2767 if (old_page) { 2783 if (old_page) {
2768 /* 2784 /*
2769 * Don't let another task, with possibly unlocked vma, 2785 * Don't let another task, with possibly unlocked vma,
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2801 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2817 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2802} 2818}
2803 2819
2804static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2820static inline void unmap_mapping_range_tree(struct rb_root *root,
2805 struct zap_details *details) 2821 struct zap_details *details)
2806{ 2822{
2807 struct vm_area_struct *vma; 2823 struct vm_area_struct *vma;
2808 struct prio_tree_iter iter;
2809 pgoff_t vba, vea, zba, zea; 2824 pgoff_t vba, vea, zba, zea;
2810 2825
2811 vma_prio_tree_foreach(vma, &iter, root, 2826 vma_interval_tree_foreach(vma, root,
2812 details->first_index, details->last_index) { 2827 details->first_index, details->last_index) {
2813 2828
2814 vba = vma->vm_pgoff; 2829 vba = vma->vm_pgoff;
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2839 * across *all* the pages in each nonlinear VMA, not just the pages 2854 * across *all* the pages in each nonlinear VMA, not just the pages
2840 * whose virtual address lies outside the file truncation point. 2855 * whose virtual address lies outside the file truncation point.
2841 */ 2856 */
2842 list_for_each_entry(vma, head, shared.vm_set.list) { 2857 list_for_each_entry(vma, head, shared.nonlinear) {
2843 details->nonlinear_vma = vma; 2858 details->nonlinear_vma = vma;
2844 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2859 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2845 } 2860 }
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping,
2883 2898
2884 2899
2885 mutex_lock(&mapping->i_mmap_mutex); 2900 mutex_lock(&mapping->i_mmap_mutex);
2886 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2901 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2887 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2902 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2888 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2903 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2889 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2904 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
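
Several of the memory.c hunks hinge on is_cow_mapping(): a mapping is copy-on-write when it may be written (VM_MAYWRITE) but is not shared (no VM_SHARED), which is why remap_pfn_range() now applies its vm_pgoff special case only to such vmas. A tiny sketch of that bit test with invented flag values:

#include <stdbool.h>
#include <stdio.h>

/* Invented values; the real VM_SHARED/VM_MAYWRITE are defined in
 * include/linux/mm.h.  Only the relationship between the bits matters. */
#define F_SHARED        0x1UL
#define F_MAYWRITE      0x2UL

/* Same test as is_cow_mapping(): a private mapping that may be written
 * is copy-on-write; a shared one is not, even if writable. */
static bool is_cow(unsigned long flags)
{
        return (flags & (F_SHARED | F_MAYWRITE)) == F_MAYWRITE;
}

int main(void)
{
        printf("private writable: %d\n", is_cow(F_MAYWRITE));              /* 1 */
        printf("shared writable:  %d\n", is_cow(F_SHARED | F_MAYWRITE));   /* 0 */
        printf("private readonly: %d\n", is_cow(0));                       /* 0 */
        return 0;
}
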
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd7..56b758ae57d2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 struct zone *zone;
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page)
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
118 __free_pages_bootmem(page, 0); 119 __free_pages_bootmem(page, 0);
120
121 zone = page_zone(page);
122 zone_span_writelock(zone);
123 zone->present_pages++;
124 zone_span_writeunlock(zone);
125 totalram_pages++;
119 } 126 }
120 127
121} 128}
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
362 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 369 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
363 BUG_ON(nr_pages % PAGES_PER_SECTION); 370 BUG_ON(nr_pages % PAGES_PER_SECTION);
364 371
372 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
373
365 sections_to_remove = nr_pages / PAGES_PER_SECTION; 374 sections_to_remove = nr_pages / PAGES_PER_SECTION;
366 for (i = 0; i < sections_to_remove; i++) { 375 for (i = 0; i < sections_to_remove; i++) {
367 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 376 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
368 release_mem_region(pfn << PAGE_SHIFT,
369 PAGES_PER_SECTION << PAGE_SHIFT);
370 ret = __remove_section(zone, __pfn_to_section(pfn)); 377 ret = __remove_section(zone, __pfn_to_section(pfn));
371 if (ret) 378 if (ret)
372 break; 379 break;
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
756 return 0; 763 return 0;
757} 764}
758 765
759static struct page *
760hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
761{
762 /* This should be improooooved!! */
763 return alloc_page(GFP_HIGHUSER_MOVABLE);
764}
765
766#define NR_OFFLINE_AT_ONCE_PAGES (256) 766#define NR_OFFLINE_AT_ONCE_PAGES (256)
767static int 767static int
768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
813 putback_lru_pages(&source); 813 putback_lru_pages(&source);
814 goto out; 814 goto out;
815 } 815 }
816 /* this function returns # of failed pages */ 816
817 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 817 /*
818 * alloc_migrate_target should be improooooved!!
819 * migrate_pages returns # of failed pages.
820 */
821 ret = migrate_pages(&source, alloc_migrate_target, 0,
818 true, MIGRATE_SYNC); 822 true, MIGRATE_SYNC);
819 if (ret) 823 if (ret)
820 putback_lru_pages(&source); 824 putback_lru_pages(&source);
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
870 return offlined; 874 return offlined;
871} 875}
872 876
873static int __ref offline_pages(unsigned long start_pfn, 877static int __ref __offline_pages(unsigned long start_pfn,
874 unsigned long end_pfn, unsigned long timeout) 878 unsigned long end_pfn, unsigned long timeout)
875{ 879{
876 unsigned long pfn, nr_pages, expire; 880 unsigned long pfn, nr_pages, expire;
@@ -970,8 +974,13 @@ repeat:
970 974
971 init_per_zone_wmark_min(); 975 init_per_zone_wmark_min();
972 976
973 if (!populated_zone(zone)) 977 if (!populated_zone(zone)) {
974 zone_pcp_reset(zone); 978 zone_pcp_reset(zone);
979 mutex_lock(&zonelists_mutex);
980 build_all_zonelists(NULL, NULL);
981 mutex_unlock(&zonelists_mutex);
982 } else
983 zone_pcp_update(zone);
975 984
976 if (!node_present_pages(node)) { 985 if (!node_present_pages(node)) {
977 node_clear_state(node, N_HIGH_MEMORY); 986 node_clear_state(node, N_HIGH_MEMORY);
@@ -998,15 +1007,55 @@ out:
998 return ret; 1007 return ret;
999} 1008}
1000 1009
1010int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1011{
1012 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1013}
1014
1001int remove_memory(u64 start, u64 size) 1015int remove_memory(u64 start, u64 size)
1002{ 1016{
1017 struct memory_block *mem = NULL;
1018 struct mem_section *section;
1003 unsigned long start_pfn, end_pfn; 1019 unsigned long start_pfn, end_pfn;
1020 unsigned long pfn, section_nr;
1021 int ret;
1004 1022
1005 start_pfn = PFN_DOWN(start); 1023 start_pfn = PFN_DOWN(start);
1006 end_pfn = start_pfn + PFN_DOWN(size); 1024 end_pfn = start_pfn + PFN_DOWN(size);
1007 return offline_pages(start_pfn, end_pfn, 120 * HZ); 1025
1026 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1027 section_nr = pfn_to_section_nr(pfn);
1028 if (!present_section_nr(section_nr))
1029 continue;
1030
1031 section = __nr_to_section(section_nr);
1032 /* same memblock? */
1033 if (mem)
1034 if ((section_nr >= mem->start_section_nr) &&
1035 (section_nr <= mem->end_section_nr))
1036 continue;
1037
1038 mem = find_memory_block_hinted(section, mem);
1039 if (!mem)
1040 continue;
1041
1042 ret = offline_memory_block(mem);
1043 if (ret) {
1044 kobject_put(&mem->dev.kobj);
1045 return ret;
1046 }
1047 }
1048
1049 if (mem)
1050 kobject_put(&mem->dev.kobj);
1051
1052 return 0;
1008} 1053}
1009#else 1054#else
1055int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1056{
1057 return -EINVAL;
1058}
1010int remove_memory(u64 start, u64 size) 1059int remove_memory(u64 start, u64 size)
1011{ 1060{
1012 return -EINVAL; 1061 return -EINVAL;
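
The new remove_memory() walks the range one memory section at a time but offlines each memory block only once, skipping sections that fall inside the block it has already handled. A standalone sketch of that walk, with an invented block type and an illustrative PAGES_PER_SECTION value:

#include <stdio.h>

#define PAGES_PER_SECTION       (1UL << 15)     /* illustrative value only */

/* Invented stand-in for a memory_block spanning a run of sections. */
struct block {
        unsigned long start_section_nr, end_section_nr;
};

int main(void)
{
        /* Two blocks of four sections each, covering sections 0..7. */
        struct block blocks[] = { { 0, 3 }, { 4, 7 } };
        struct block *mem = NULL;
        unsigned long start_pfn = 0, end_pfn = 8 * PAGES_PER_SECTION, pfn;

        /* Same walk as the new remove_memory(): advance one section at a
         * time, but act only once per memory block. */
        for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
                unsigned long section_nr = pfn / PAGES_PER_SECTION;
                struct block *found = NULL;
                size_t i;

                /* still inside the block handled on a previous iteration */
                if (mem && section_nr >= mem->start_section_nr &&
                    section_nr <= mem->end_section_nr)
                        continue;

                for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
                        if (section_nr >= blocks[i].start_section_nr &&
                            section_nr <= blocks[i].end_section_nr)
                                found = &blocks[i];

                if (!found)
                        continue;       /* hole: no block backs this section */

                mem = found;
                printf("offlining block covering sections %lu..%lu\n",
                       mem->start_section_nr, mem->end_section_nr);
        }
        return 0;
}
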
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e252..d04a8a54c294 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/*
611 * Apply policy to a single VMA
612 * This must be called with the mmap_sem held for writing.
613 */
614static int vma_replace_policy(struct vm_area_struct *vma,
615 struct mempolicy *pol)
616{
617 int err;
618 struct mempolicy *old;
619 struct mempolicy *new;
620
621 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
622 vma->vm_start, vma->vm_end, vma->vm_pgoff,
623 vma->vm_ops, vma->vm_file,
624 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
625
626 new = mpol_dup(pol);
627 if (IS_ERR(new))
628 return PTR_ERR(new);
629
630 if (vma->vm_ops && vma->vm_ops->set_policy) {
631 err = vma->vm_ops->set_policy(vma, new);
632 if (err)
633 goto err_out;
634 }
635
636 old = vma->vm_policy;
637 vma->vm_policy = new; /* protected by mmap_sem */
638 mpol_put(old);
639
640 return 0;
641 err_out:
642 mpol_put(new);
643 return err;
644}
645
610/* Step 2: apply policy to a range and do splits. */ 646/* Step 2: apply policy to a range and do splits. */
611static int mbind_range(struct mm_struct *mm, unsigned long start, 647static int mbind_range(struct mm_struct *mm, unsigned long start,
612 unsigned long end, struct mempolicy *new_pol) 648 unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
655 if (err) 691 if (err)
656 goto out; 692 goto out;
657 } 693 }
658 694 err = vma_replace_policy(vma, new_pol);
659 /* 695 if (err)
660 * Apply policy to a single VMA. The reference counting of 696 goto out;
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
675 } 697 }
676 698
677 out: 699 out:
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 946 nodemask_t nmask;
925 LIST_HEAD(pagelist); 947 LIST_HEAD(pagelist);
926 int err = 0; 948 int err = 0;
927 struct vm_area_struct *vma;
928 949
929 nodes_clear(nmask); 950 nodes_clear(nmask);
930 node_set(source, nmask); 951 node_set(source, nmask);
931 952
932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 953 /*
954 * This does not "check" the range but isolates all pages that
955 * need migration. Between passing in the full user address
956 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
957 */
958 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
959 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
933 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 960 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
936 961
937 if (!list_empty(&pagelist)) { 962 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 963 err = migrate_pages(&pagelist, new_node_page, dest,
@@ -1511,9 +1536,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1511 * 1536 *
1512 * Returns effective policy for a VMA at specified address. 1537 * Returns effective policy for a VMA at specified address.
1513 * Falls back to @task or system default policy, as necessary. 1538 * Falls back to @task or system default policy, as necessary.
1514 * Current or other task's task mempolicy and non-shared vma policies 1539 * Current or other task's task mempolicy and non-shared vma policies must be
1515 * are protected by the task's mmap_sem, which must be held for read by 1540 * protected by task_lock(task) by the caller.
1516 * the caller.
1517 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1541 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1518 * count--added by the get_policy() vm_op, as appropriate--to protect against 1542 * count--added by the get_policy() vm_op, as appropriate--to protect against
1519 * freeing by another task. It is the caller's responsibility to free the 1543 * freeing by another task. It is the caller's responsibility to free the
@@ -1530,8 +1554,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1530 addr); 1554 addr);
1531 if (vpol) 1555 if (vpol)
1532 pol = vpol; 1556 pol = vpol;
1533 } else if (vma->vm_policy) 1557 } else if (vma->vm_policy) {
1534 pol = vma->vm_policy; 1558 pol = vma->vm_policy;
1559
1560 /*
1561 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1562 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1563 * count on these policies which will be dropped by
1564 * mpol_cond_put() later
1565 */
1566 if (mpol_needs_cond_ref(pol))
1567 mpol_get(pol);
1568 }
1535 } 1569 }
1536 if (!pol) 1570 if (!pol)
1537 pol = &default_policy; 1571 pol = &default_policy;
@@ -2061,7 +2095,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2061 */ 2095 */
2062 2096
2063/* lookup first element intersecting start-end */ 2097/* lookup first element intersecting start-end */
2064/* Caller holds sp->lock */ 2098/* Caller holds sp->mutex */
2065static struct sp_node * 2099static struct sp_node *
2066sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2100sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2067{ 2101{
@@ -2125,36 +2159,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2125 2159
2126 if (!sp->root.rb_node) 2160 if (!sp->root.rb_node)
2127 return NULL; 2161 return NULL;
2128 spin_lock(&sp->lock); 2162 mutex_lock(&sp->mutex);
2129 sn = sp_lookup(sp, idx, idx+1); 2163 sn = sp_lookup(sp, idx, idx+1);
2130 if (sn) { 2164 if (sn) {
2131 mpol_get(sn->policy); 2165 mpol_get(sn->policy);
2132 pol = sn->policy; 2166 pol = sn->policy;
2133 } 2167 }
2134 spin_unlock(&sp->lock); 2168 mutex_unlock(&sp->mutex);
2135 return pol; 2169 return pol;
2136} 2170}
2137 2171
2172static void sp_free(struct sp_node *n)
2173{
2174 mpol_put(n->policy);
2175 kmem_cache_free(sn_cache, n);
2176}
2177
2138static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2178static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2139{ 2179{
2140 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2180 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2141 rb_erase(&n->nd, &sp->root); 2181 rb_erase(&n->nd, &sp->root);
2142 mpol_put(n->policy); 2182 sp_free(n);
2143 kmem_cache_free(sn_cache, n);
2144} 2183}
2145 2184
2146static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2185static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2147 struct mempolicy *pol) 2186 struct mempolicy *pol)
2148{ 2187{
2149 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2188 struct sp_node *n;
2189 struct mempolicy *newpol;
2150 2190
2191 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2151 if (!n) 2192 if (!n)
2152 return NULL; 2193 return NULL;
2194
2195 newpol = mpol_dup(pol);
2196 if (IS_ERR(newpol)) {
2197 kmem_cache_free(sn_cache, n);
2198 return NULL;
2199 }
2200 newpol->flags |= MPOL_F_SHARED;
2201
2153 n->start = start; 2202 n->start = start;
2154 n->end = end; 2203 n->end = end;
2155 mpol_get(pol); 2204 n->policy = newpol;
2156 pol->flags |= MPOL_F_SHARED; /* for unref */ 2205
2157 n->policy = pol;
2158 return n; 2206 return n;
2159} 2207}
2160 2208
@@ -2162,10 +2210,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2162static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2210static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2163 unsigned long end, struct sp_node *new) 2211 unsigned long end, struct sp_node *new)
2164{ 2212{
2165 struct sp_node *n, *new2 = NULL; 2213 struct sp_node *n;
2214 int ret = 0;
2166 2215
2167restart: 2216 mutex_lock(&sp->mutex);
2168 spin_lock(&sp->lock);
2169 n = sp_lookup(sp, start, end); 2217 n = sp_lookup(sp, start, end);
2170 /* Take care of old policies in the same range. */ 2218 /* Take care of old policies in the same range. */
2171 while (n && n->start < end) { 2219 while (n && n->start < end) {
@@ -2178,16 +2226,14 @@ restart:
2178 } else { 2226 } else {
2179 /* Old policy spanning whole new range. */ 2227 /* Old policy spanning whole new range. */
2180 if (n->end > end) { 2228 if (n->end > end) {
2229 struct sp_node *new2;
2230 new2 = sp_alloc(end, n->end, n->policy);
2181 if (!new2) { 2231 if (!new2) {
2182 spin_unlock(&sp->lock); 2232 ret = -ENOMEM;
2183 new2 = sp_alloc(end, n->end, n->policy); 2233 goto out;
2184 if (!new2)
2185 return -ENOMEM;
2186 goto restart;
2187 } 2234 }
2188 n->end = start; 2235 n->end = start;
2189 sp_insert(sp, new2); 2236 sp_insert(sp, new2);
2190 new2 = NULL;
2191 break; 2237 break;
2192 } else 2238 } else
2193 n->end = start; 2239 n->end = start;
@@ -2198,12 +2244,9 @@ restart:
2198 } 2244 }
2199 if (new) 2245 if (new)
2200 sp_insert(sp, new); 2246 sp_insert(sp, new);
2201 spin_unlock(&sp->lock); 2247out:
2202 if (new2) { 2248 mutex_unlock(&sp->mutex);
2203 mpol_put(new2->policy); 2249 return ret;
2204 kmem_cache_free(sn_cache, new2);
2205 }
2206 return 0;
2207} 2250}
2208 2251
2209/** 2252/**
@@ -2221,7 +2264,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2221 int ret; 2264 int ret;
2222 2265
2223 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2266 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2224 spin_lock_init(&sp->lock); 2267 mutex_init(&sp->mutex);
2225 2268
2226 if (mpol) { 2269 if (mpol) {
2227 struct vm_area_struct pvma; 2270 struct vm_area_struct pvma;
@@ -2275,7 +2318,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2275 } 2318 }
2276 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2319 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2277 if (err && new) 2320 if (err && new)
2278 kmem_cache_free(sn_cache, new); 2321 sp_free(new);
2279 return err; 2322 return err;
2280} 2323}
2281 2324
@@ -2287,16 +2330,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
2287 2330
2288 if (!p->root.rb_node) 2331 if (!p->root.rb_node)
2289 return; 2332 return;
2290 spin_lock(&p->lock); 2333 mutex_lock(&p->mutex);
2291 next = rb_first(&p->root); 2334 next = rb_first(&p->root);
2292 while (next) { 2335 while (next) {
2293 n = rb_entry(next, struct sp_node, nd); 2336 n = rb_entry(next, struct sp_node, nd);
2294 next = rb_next(&n->nd); 2337 next = rb_next(&n->nd);
2295 rb_erase(&n->nd, &p->root); 2338 sp_delete(p, n);
2296 mpol_put(n->policy);
2297 kmem_cache_free(sn_cache, n);
2298 } 2339 }
2299 spin_unlock(&p->lock); 2340 mutex_unlock(&p->mutex);
2300} 2341}
2301 2342
2302/* assumes fs == KERNEL_DS */ 2343/* assumes fs == KERNEL_DS */
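Reassembled from the hunks above, with the diff markup stripped and nothing added beyond what the patch shows: sp_alloc() now gives every sp_node its own mpol_dup() copy of the policy, and because sp->lock became sp->mutex, shared_policy_replace() may call this sleeping GFP_KERNEL allocator while holding the lock, which is what removes the old unlock/allocate/goto-restart loop.

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *n;
        struct mempolicy *newpol;

        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n)
                return NULL;

        /* each node owns a private, shared-marked copy of the policy */
        newpol = mpol_dup(pol);
        if (IS_ERR(newpol)) {
                kmem_cache_free(sn_cache, n);
                return NULL;
        }
        newpol->flags |= MPOL_F_SHARED;

        n->start = start;
        n->end = end;
        n->policy = newpol;

        return n;
}

On the -ENOMEM path shared_policy_replace() now simply sets ret and unwinds through the single mutex_unlock() at the out: label.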
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..f0b9ce572fc7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock);
51/* 51/*
52 * LRU accounting for clear_page_mlock() 52 * LRU accounting for clear_page_mlock()
53 */ 53 */
54void __clear_page_mlock(struct page *page) 54void clear_page_mlock(struct page *page)
55{ 55{
56 VM_BUG_ON(!PageLocked(page)); 56 if (!TestClearPageMlocked(page))
57
58 if (!page->mapping) { /* truncated ? */
59 return; 57 return;
60 }
61 58
62 dec_zone_page_state(page, NR_MLOCK); 59 mod_zone_page_state(page_zone(page), NR_MLOCK,
60 -hpage_nr_pages(page));
63 count_vm_event(UNEVICTABLE_PGCLEARED); 61 count_vm_event(UNEVICTABLE_PGCLEARED);
64 if (!isolate_lru_page(page)) { 62 if (!isolate_lru_page(page)) {
65 putback_lru_page(page); 63 putback_lru_page(page);
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page)
81 BUG_ON(!PageLocked(page)); 79 BUG_ON(!PageLocked(page));
82 80
83 if (!TestSetPageMlocked(page)) { 81 if (!TestSetPageMlocked(page)) {
84 inc_zone_page_state(page, NR_MLOCK); 82 mod_zone_page_state(page_zone(page), NR_MLOCK,
83 hpage_nr_pages(page));
85 count_vm_event(UNEVICTABLE_PGMLOCKED); 84 count_vm_event(UNEVICTABLE_PGMLOCKED);
86 if (!isolate_lru_page(page)) 85 if (!isolate_lru_page(page))
87 putback_lru_page(page); 86 putback_lru_page(page);
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page)
108 BUG_ON(!PageLocked(page)); 107 BUG_ON(!PageLocked(page));
109 108
110 if (TestClearPageMlocked(page)) { 109 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 110 mod_zone_page_state(page_zone(page), NR_MLOCK,
111 -hpage_nr_pages(page));
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = SWAP_AGAIN; 113 int ret = SWAP_AGAIN;
114 114
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock; 228 goto no_mlock;
229 229
230 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) || 231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) { 232 vma == get_gate_vma(current->mm))) {
233 233
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
291 if (page && !IS_ERR(page)) { 291 if (page && !IS_ERR(page)) {
292 lock_page(page); 292 lock_page(page);
293 /* 293 munlock_vma_page(page);
294 * Like in __mlock_vma_pages_range(),
295 * because we lock page here and migration is
296 * blocked by the elevated reference, we need
297 * only check for file-cache page truncation.
298 */
299 if (page->mapping)
300 munlock_vma_page(page);
301 unlock_page(page); 294 unlock_page(page);
302 put_page(page); 295 put_page(page);
303 } 296 }
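Putting the mlock.c hunks back together: the truncation check in the old __clear_page_mlock() is folded into a single TestClearPageMlocked() test, and the NR_MLOCK counter is adjusted by hpage_nr_pages(), so a transparent huge page accounts for HPAGE_PMD_NR base pages rather than one. Sketch of the resulting clear_page_mlock(), with the unchanged LRU tail elided:

void clear_page_mlock(struct page *page)
{
        if (!TestClearPageMlocked(page))
                return;

        /* -1 for a base page, -HPAGE_PMD_NR for a transparent huge page */
        mod_zone_page_state(page_zone(page), NR_MLOCK,
                            -hpage_nr_pages(page));
        count_vm_event(UNEVICTABLE_PGCLEARED);

        /* ... isolate_lru_page()/putback_lru_page() as in the hunk above ... */
}

mlock_vma_page() and munlock_vma_page() follow the same pattern with +hpage_nr_pages() and -hpage_nr_pages() respectively.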
diff --git a/mm/mmap.c b/mm/mmap.c
index ae18a48e7e4e..2d942353d681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
51 struct vm_area_struct *vma, struct vm_area_struct *prev, 51 struct vm_area_struct *vma, struct vm_area_struct *prev,
52 unsigned long start, unsigned long end); 52 unsigned long start, unsigned long end);
53 53
54/*
55 * WARNING: the debugging will use recursive algorithms so never enable this
56 * unless you know what you are doing.
57 */
58#undef DEBUG_MM_RB
59
60/* description of effects of mapping type and prot in current implementation. 54/* description of effects of mapping type and prot in current implementation.
61 * this is due to the limited x86 page protection hardware. The expected 55 * this is due to the limited x86 page protection hardware. The expected
62 * behavior is in parens: 56 * behavior is in parens:
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
199 193
200 flush_dcache_mmap_lock(mapping); 194 flush_dcache_mmap_lock(mapping);
201 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 195 if (unlikely(vma->vm_flags & VM_NONLINEAR))
202 list_del_init(&vma->shared.vm_set.list); 196 list_del_init(&vma->shared.nonlinear);
203 else 197 else
204 vma_prio_tree_remove(vma, &mapping->i_mmap); 198 vma_interval_tree_remove(vma, &mapping->i_mmap);
205 flush_dcache_mmap_unlock(mapping); 199 flush_dcache_mmap_unlock(mapping);
206} 200}
207 201
208/* 202/*
209 * Unlink a file-based vm structure from its prio_tree, to hide 203 * Unlink a file-based vm structure from its interval tree, to hide
210 * vma from rmap and vmtruncate before freeing its page tables. 204 * vma from rmap and vmtruncate before freeing its page tables.
211 */ 205 */
212void unlink_file_vma(struct vm_area_struct *vma) 206void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
231 might_sleep(); 225 might_sleep();
232 if (vma->vm_ops && vma->vm_ops->close) 226 if (vma->vm_ops && vma->vm_ops->close)
233 vma->vm_ops->close(vma); 227 vma->vm_ops->close(vma);
234 if (vma->vm_file) { 228 if (vma->vm_file)
235 fput(vma->vm_file); 229 fput(vma->vm_file);
236 if (vma->vm_flags & VM_EXECUTABLE)
237 removed_exe_file_vma(vma->vm_mm);
238 }
239 mpol_put(vma_policy(vma)); 230 mpol_put(vma_policy(vma));
240 kmem_cache_free(vm_area_cachep, vma); 231 kmem_cache_free(vm_area_cachep, vma);
241 return next; 232 return next;
@@ -306,7 +297,7 @@ out:
306 return retval; 297 return retval;
307} 298}
308 299
309#ifdef DEBUG_MM_RB 300#ifdef CONFIG_DEBUG_VM_RB
310static int browse_rb(struct rb_root *root) 301static int browse_rb(struct rb_root *root)
311{ 302{
312 int i = 0, j; 303 int i = 0, j;
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
340{ 331{
341 int bug = 0; 332 int bug = 0;
342 int i = 0; 333 int i = 0;
343 struct vm_area_struct *tmp = mm->mmap; 334 struct vm_area_struct *vma = mm->mmap;
344 while (tmp) { 335 while (vma) {
345 tmp = tmp->vm_next; 336 struct anon_vma_chain *avc;
337 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
338 anon_vma_interval_tree_verify(avc);
339 vma = vma->vm_next;
346 i++; 340 i++;
347 } 341 }
348 if (i != mm->map_count) 342 if (i != mm->map_count)
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm)
356#define validate_mm(mm) do { } while (0) 350#define validate_mm(mm) do { } while (0)
357#endif 351#endif
358 352
359static struct vm_area_struct * 353/*
360find_vma_prepare(struct mm_struct *mm, unsigned long addr, 354 * vma has some anon_vma assigned, and is already inserted on that
361 struct vm_area_struct **pprev, struct rb_node ***rb_link, 355 * anon_vma's interval trees.
362 struct rb_node ** rb_parent) 356 *
357 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
358 * vma must be removed from the anon_vma's interval trees using
359 * anon_vma_interval_tree_pre_update_vma().
360 *
361 * After the update, the vma will be reinserted using
362 * anon_vma_interval_tree_post_update_vma().
363 *
364 * The entire update must be protected by exclusive mmap_sem and by
365 * the root anon_vma's mutex.
366 */
367static inline void
368anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
363{ 369{
364 struct vm_area_struct * vma; 370 struct anon_vma_chain *avc;
365 struct rb_node ** __rb_link, * __rb_parent, * rb_prev; 371
372 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
373 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
374}
375
376static inline void
377anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
378{
379 struct anon_vma_chain *avc;
380
381 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
382 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
383}
384
385static int find_vma_links(struct mm_struct *mm, unsigned long addr,
386 unsigned long end, struct vm_area_struct **pprev,
387 struct rb_node ***rb_link, struct rb_node **rb_parent)
388{
389 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
366 390
367 __rb_link = &mm->mm_rb.rb_node; 391 __rb_link = &mm->mm_rb.rb_node;
368 rb_prev = __rb_parent = NULL; 392 rb_prev = __rb_parent = NULL;
369 vma = NULL;
370 393
371 while (*__rb_link) { 394 while (*__rb_link) {
372 struct vm_area_struct *vma_tmp; 395 struct vm_area_struct *vma_tmp;
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
375 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 398 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
376 399
377 if (vma_tmp->vm_end > addr) { 400 if (vma_tmp->vm_end > addr) {
378 vma = vma_tmp; 401 /* Fail if an existing vma overlaps the area */
379 if (vma_tmp->vm_start <= addr) 402 if (vma_tmp->vm_start < end)
380 break; 403 return -ENOMEM;
381 __rb_link = &__rb_parent->rb_left; 404 __rb_link = &__rb_parent->rb_left;
382 } else { 405 } else {
383 rb_prev = __rb_parent; 406 rb_prev = __rb_parent;
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
390 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 413 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
391 *rb_link = __rb_link; 414 *rb_link = __rb_link;
392 *rb_parent = __rb_parent; 415 *rb_parent = __rb_parent;
393 return vma; 416 return 0;
394} 417}
395 418
396void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 419void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
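The comment above amounts to a bracket discipline, and the later vma_adjust() and stack-growth hunks in this file follow it: while holding the anon_vma lock (anon_vma_lock() in vma_adjust(), vma_lock_anon_vma() in expand_upwards()/expand_downwards()), pull the vma out of the interval trees, edit its range, then reinsert it. Condensed sketch, with error paths and accounting elided and new_start/new_end as placeholder values:

        /* exclusive mmap_sem is already held */
        anon_vma_lock(vma->anon_vma);

        anon_vma_interval_tree_pre_update_vma(vma);     /* remove from the trees */
        vma->vm_start = new_start;                      /* placeholder bounds */
        vma->vm_end   = new_end;
        anon_vma_interval_tree_post_update_vma(vma);    /* reinsert with new range */

        anon_vma_unlock(vma->anon_vma);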
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
417 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 440 if (unlikely(vma->vm_flags & VM_NONLINEAR))
418 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 441 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
419 else 442 else
420 vma_prio_tree_insert(vma, &mapping->i_mmap); 443 vma_interval_tree_insert(vma, &mapping->i_mmap);
421 flush_dcache_mmap_unlock(mapping); 444 flush_dcache_mmap_unlock(mapping);
422 } 445 }
423} 446}
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
455 478
456/* 479/*
457 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 480 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
458 * mm's list and rbtree. It has already been inserted into the prio_tree. 481 * mm's list and rbtree. It has already been inserted into the interval tree.
459 */ 482 */
460static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 483static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
461{ 484{
462 struct vm_area_struct *__vma, *prev; 485 struct vm_area_struct *prev;
463 struct rb_node **rb_link, *rb_parent; 486 struct rb_node **rb_link, *rb_parent;
464 487
465 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 488 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
466 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 489 &prev, &rb_link, &rb_parent))
490 BUG();
467 __vma_link(mm, vma, prev, rb_link, rb_parent); 491 __vma_link(mm, vma, prev, rb_link, rb_parent);
468 mm->map_count++; 492 mm->map_count++;
469} 493}
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
496 struct vm_area_struct *next = vma->vm_next; 520 struct vm_area_struct *next = vma->vm_next;
497 struct vm_area_struct *importer = NULL; 521 struct vm_area_struct *importer = NULL;
498 struct address_space *mapping = NULL; 522 struct address_space *mapping = NULL;
499 struct prio_tree_root *root = NULL; 523 struct rb_root *root = NULL;
500 struct anon_vma *anon_vma = NULL; 524 struct anon_vma *anon_vma = NULL;
501 struct file *file = vma->vm_file; 525 struct file *file = vma->vm_file;
502 long adjust_next = 0; 526 long adjust_next = 0;
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end);
559 mutex_lock(&mapping->i_mmap_mutex); 583 mutex_lock(&mapping->i_mmap_mutex);
560 if (insert) { 584 if (insert) {
561 /* 585 /*
562 * Put into prio_tree now, so instantiated pages 586 * Put into interval tree now, so instantiated pages
563 * are visible to arm/parisc __flush_dcache_page 587 * are visible to arm/parisc __flush_dcache_page
564 * throughout; but we cannot insert into address 588 * throughout; but we cannot insert into address
565 * space until vma start or end is updated. 589 * space until vma start or end is updated.
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end);
570 594
571 vma_adjust_trans_huge(vma, start, end, adjust_next); 595 vma_adjust_trans_huge(vma, start, end, adjust_next);
572 596
573 /* 597 anon_vma = vma->anon_vma;
574 * When changing only vma->vm_end, we don't really need anon_vma 598 if (!anon_vma && adjust_next)
575 * lock. This is a fairly rare case by itself, but the anon_vma 599 anon_vma = next->anon_vma;
576 * lock may be shared between many sibling processes. Skipping 600 if (anon_vma) {
577 * the lock for brk adjustments makes a difference sometimes. 601 VM_BUG_ON(adjust_next && next->anon_vma &&
578 */ 602 anon_vma != next->anon_vma);
579 if (vma->anon_vma && (importer || start != vma->vm_start)) {
580 anon_vma = vma->anon_vma;
581 anon_vma_lock(anon_vma); 603 anon_vma_lock(anon_vma);
604 anon_vma_interval_tree_pre_update_vma(vma);
605 if (adjust_next)
606 anon_vma_interval_tree_pre_update_vma(next);
582 } 607 }
583 608
584 if (root) { 609 if (root) {
585 flush_dcache_mmap_lock(mapping); 610 flush_dcache_mmap_lock(mapping);
586 vma_prio_tree_remove(vma, root); 611 vma_interval_tree_remove(vma, root);
587 if (adjust_next) 612 if (adjust_next)
588 vma_prio_tree_remove(next, root); 613 vma_interval_tree_remove(next, root);
589 } 614 }
590 615
591 vma->vm_start = start; 616 vma->vm_start = start;
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end);
598 623
599 if (root) { 624 if (root) {
600 if (adjust_next) 625 if (adjust_next)
601 vma_prio_tree_insert(next, root); 626 vma_interval_tree_insert(next, root);
602 vma_prio_tree_insert(vma, root); 627 vma_interval_tree_insert(vma, root);
603 flush_dcache_mmap_unlock(mapping); 628 flush_dcache_mmap_unlock(mapping);
604 } 629 }
605 630
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end);
620 __insert_vm_struct(mm, insert); 645 __insert_vm_struct(mm, insert);
621 } 646 }
622 647
623 if (anon_vma) 648 if (anon_vma) {
649 anon_vma_interval_tree_post_update_vma(vma);
650 if (adjust_next)
651 anon_vma_interval_tree_post_update_vma(next);
624 anon_vma_unlock(anon_vma); 652 anon_vma_unlock(anon_vma);
653 }
625 if (mapping) 654 if (mapping)
626 mutex_unlock(&mapping->i_mmap_mutex); 655 mutex_unlock(&mapping->i_mmap_mutex);
627 656
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end);
636 if (file) { 665 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end); 666 uprobe_munmap(next, next->vm_start, next->vm_end);
638 fput(file); 667 fput(file);
639 if (next->vm_flags & VM_EXECUTABLE)
640 removed_exe_file_vma(mm);
641 } 668 }
642 if (next->anon_vma) 669 if (next->anon_vma)
643 anon_vma_merge(vma, next); 670 anon_vma_merge(vma, next);
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end);
669static inline int is_mergeable_vma(struct vm_area_struct *vma, 696static inline int is_mergeable_vma(struct vm_area_struct *vma,
670 struct file *file, unsigned long vm_flags) 697 struct file *file, unsigned long vm_flags)
671{ 698{
672 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ 699 if (vma->vm_flags ^ vm_flags)
673 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
674 return 0; 700 return 0;
675 if (vma->vm_file != file) 701 if (vma->vm_file != file)
676 return 0; 702 return 0;
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
951 mm->exec_vm += pages; 977 mm->exec_vm += pages;
952 } else if (flags & stack_flags) 978 } else if (flags & stack_flags)
953 mm->stack_vm += pages; 979 mm->stack_vm += pages;
954 if (flags & (VM_RESERVED|VM_IO))
955 mm->reserved_vm += pages;
956} 980}
957#endif /* CONFIG_PROC_FS */ 981#endif /* CONFIG_PROC_FS */
958 982
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1190 return 0; 1214 return 0;
1191 1215
1192 /* Specialty mapping? */ 1216 /* Specialty mapping? */
1193 if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) 1217 if (vm_flags & VM_PFNMAP)
1194 return 0; 1218 return 0;
1195 1219
1196 /* Can the mapping track the dirty pages? */ 1220 /* Can the mapping track the dirty pages? */
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1229 /* Clear old maps */ 1253 /* Clear old maps */
1230 error = -ENOMEM; 1254 error = -ENOMEM;
1231munmap_back: 1255munmap_back:
1232 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 1256 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1233 if (vma && vma->vm_start < addr + len) {
1234 if (do_munmap(mm, addr, len)) 1257 if (do_munmap(mm, addr, len))
1235 return -ENOMEM; 1258 return -ENOMEM;
1236 goto munmap_back; 1259 goto munmap_back;
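find_vma_links() folds the old "did find_vma_prepare() hand back an overlapping vma?" check into its return value, so mmap_region() above and do_brk()/insert_vm_struct() further down all reduce to the same shape:

munmap_back:
        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
                /* [addr, addr + len) overlaps an existing vma: clear it, retry */
                if (do_munmap(mm, addr, len))
                        return -ENOMEM;
                goto munmap_back;
        }
        /* prev/rb_link/rb_parent now describe a guaranteed-free slot */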
@@ -1301,13 +1324,10 @@ munmap_back:
1301 goto free_vma; 1324 goto free_vma;
1302 correct_wcount = 1; 1325 correct_wcount = 1;
1303 } 1326 }
1304 vma->vm_file = file; 1327 vma->vm_file = get_file(file);
1305 get_file(file);
1306 error = file->f_op->mmap(file, vma); 1328 error = file->f_op->mmap(file, vma);
1307 if (error) 1329 if (error)
1308 goto unmap_and_free_vma; 1330 goto unmap_and_free_vma;
1309 if (vm_flags & VM_EXECUTABLE)
1310 added_exe_file_vma(mm);
1311 1331
1312 /* Can addr have changed?? 1332 /* Can addr have changed??
1313 * 1333 *
@@ -1758,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1758 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 1778 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1759 error = acct_stack_growth(vma, size, grow); 1779 error = acct_stack_growth(vma, size, grow);
1760 if (!error) { 1780 if (!error) {
1781 anon_vma_interval_tree_pre_update_vma(vma);
1761 vma->vm_end = address; 1782 vma->vm_end = address;
1783 anon_vma_interval_tree_post_update_vma(vma);
1762 perf_event_mmap(vma); 1784 perf_event_mmap(vma);
1763 } 1785 }
1764 } 1786 }
1765 } 1787 }
1766 vma_unlock_anon_vma(vma); 1788 vma_unlock_anon_vma(vma);
1767 khugepaged_enter_vma_merge(vma); 1789 khugepaged_enter_vma_merge(vma);
1790 validate_mm(vma->vm_mm);
1768 return error; 1791 return error;
1769} 1792}
1770#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1793#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma,
1808 if (grow <= vma->vm_pgoff) { 1831 if (grow <= vma->vm_pgoff) {
1809 error = acct_stack_growth(vma, size, grow); 1832 error = acct_stack_growth(vma, size, grow);
1810 if (!error) { 1833 if (!error) {
1834 anon_vma_interval_tree_pre_update_vma(vma);
1811 vma->vm_start = address; 1835 vma->vm_start = address;
1812 vma->vm_pgoff -= grow; 1836 vma->vm_pgoff -= grow;
1837 anon_vma_interval_tree_post_update_vma(vma);
1813 perf_event_mmap(vma); 1838 perf_event_mmap(vma);
1814 } 1839 }
1815 } 1840 }
1816 } 1841 }
1817 vma_unlock_anon_vma(vma); 1842 vma_unlock_anon_vma(vma);
1818 khugepaged_enter_vma_merge(vma); 1843 khugepaged_enter_vma_merge(vma);
1844 validate_mm(vma->vm_mm);
1819 return error; 1845 return error;
1820} 1846}
1821 1847
@@ -1989,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1989 if (anon_vma_clone(new, vma)) 2015 if (anon_vma_clone(new, vma))
1990 goto out_free_mpol; 2016 goto out_free_mpol;
1991 2017
1992 if (new->vm_file) { 2018 if (new->vm_file)
1993 get_file(new->vm_file); 2019 get_file(new->vm_file);
1994 if (vma->vm_flags & VM_EXECUTABLE)
1995 added_exe_file_vma(mm);
1996 }
1997 2020
1998 if (new->vm_ops && new->vm_ops->open) 2021 if (new->vm_ops && new->vm_ops->open)
1999 new->vm_ops->open(new); 2022 new->vm_ops->open(new);
@@ -2011,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2011 /* Clean everything up if vma_adjust failed. */ 2034 /* Clean everything up if vma_adjust failed. */
2012 if (new->vm_ops && new->vm_ops->close) 2035 if (new->vm_ops && new->vm_ops->close)
2013 new->vm_ops->close(new); 2036 new->vm_ops->close(new);
2014 if (new->vm_file) { 2037 if (new->vm_file)
2015 if (vma->vm_flags & VM_EXECUTABLE)
2016 removed_exe_file_vma(mm);
2017 fput(new->vm_file); 2038 fput(new->vm_file);
2018 }
2019 unlink_anon_vmas(new); 2039 unlink_anon_vmas(new);
2020 out_free_mpol: 2040 out_free_mpol:
2021 mpol_put(pol); 2041 mpol_put(pol);
@@ -2200,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2200 * Clear old maps. this also does some error checking for us 2220 * Clear old maps. this also does some error checking for us
2201 */ 2221 */
2202 munmap_back: 2222 munmap_back:
2203 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2223 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2204 if (vma && vma->vm_start < addr + len) {
2205 if (do_munmap(mm, addr, len)) 2224 if (do_munmap(mm, addr, len))
2206 return -ENOMEM; 2225 return -ENOMEM;
2207 goto munmap_back; 2226 goto munmap_back;
@@ -2315,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm)
2315 * and into the inode's i_mmap tree. If vm_file is non-NULL 2334 * and into the inode's i_mmap tree. If vm_file is non-NULL
2316 * then i_mmap_mutex is taken here. 2335 * then i_mmap_mutex is taken here.
2317 */ 2336 */
2318int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2337int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2319{ 2338{
2320 struct vm_area_struct * __vma, * prev; 2339 struct vm_area_struct *prev;
2321 struct rb_node ** rb_link, * rb_parent; 2340 struct rb_node **rb_link, *rb_parent;
2322 2341
2323 /* 2342 /*
2324 * The vm_pgoff of a purely anonymous vma should be irrelevant 2343 * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2336,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2336 BUG_ON(vma->anon_vma); 2355 BUG_ON(vma->anon_vma);
2337 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2356 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2338 } 2357 }
2339 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); 2358 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2340 if (__vma && __vma->vm_start < vma->vm_end) 2359 &prev, &rb_link, &rb_parent))
2341 return -ENOMEM; 2360 return -ENOMEM;
2342 if ((vma->vm_flags & VM_ACCOUNT) && 2361 if ((vma->vm_flags & VM_ACCOUNT) &&
2343 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2362 security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2352,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2352 * prior to moving page table entries, to effect an mremap move. 2371 * prior to moving page table entries, to effect an mremap move.
2353 */ 2372 */
2354struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2373struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2355 unsigned long addr, unsigned long len, pgoff_t pgoff) 2374 unsigned long addr, unsigned long len, pgoff_t pgoff,
2375 bool *need_rmap_locks)
2356{ 2376{
2357 struct vm_area_struct *vma = *vmap; 2377 struct vm_area_struct *vma = *vmap;
2358 unsigned long vma_start = vma->vm_start; 2378 unsigned long vma_start = vma->vm_start;
@@ -2371,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2371 faulted_in_anon_vma = false; 2391 faulted_in_anon_vma = false;
2372 } 2392 }
2373 2393
2374 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2394 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2395 return NULL; /* should never get here */
2375 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2396 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2376 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2397 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2377 if (new_vma) { 2398 if (new_vma) {
@@ -2393,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2393 * linear if there are no pages mapped yet. 2414 * linear if there are no pages mapped yet.
2394 */ 2415 */
2395 VM_BUG_ON(faulted_in_anon_vma); 2416 VM_BUG_ON(faulted_in_anon_vma);
2396 *vmap = new_vma; 2417 *vmap = vma = new_vma;
2397 } else 2418 }
2398 anon_vma_moveto_tail(new_vma); 2419 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2399 } else { 2420 } else {
2400 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2421 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2401 if (new_vma) { 2422 if (new_vma) {
2402 *new_vma = *vma; 2423 *new_vma = *vma;
2424 new_vma->vm_start = addr;
2425 new_vma->vm_end = addr + len;
2426 new_vma->vm_pgoff = pgoff;
2403 pol = mpol_dup(vma_policy(vma)); 2427 pol = mpol_dup(vma_policy(vma));
2404 if (IS_ERR(pol)) 2428 if (IS_ERR(pol))
2405 goto out_free_vma; 2429 goto out_free_vma;
2430 vma_set_policy(new_vma, pol);
2406 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2431 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2407 if (anon_vma_clone(new_vma, vma)) 2432 if (anon_vma_clone(new_vma, vma))
2408 goto out_free_mempol; 2433 goto out_free_mempol;
2409 vma_set_policy(new_vma, pol); 2434 if (new_vma->vm_file)
2410 new_vma->vm_start = addr;
2411 new_vma->vm_end = addr + len;
2412 new_vma->vm_pgoff = pgoff;
2413 if (new_vma->vm_file) {
2414 get_file(new_vma->vm_file); 2435 get_file(new_vma->vm_file);
2415
2416 if (vma->vm_flags & VM_EXECUTABLE)
2417 added_exe_file_vma(mm);
2418 }
2419 if (new_vma->vm_ops && new_vma->vm_ops->open) 2436 if (new_vma->vm_ops && new_vma->vm_ops->open)
2420 new_vma->vm_ops->open(new_vma); 2437 new_vma->vm_ops->open(new_vma);
2421 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2438 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2439 *need_rmap_locks = false;
2422 } 2440 }
2423 } 2441 }
2424 return new_vma; 2442 return new_vma;
@@ -2536,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2536 2554
2537static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2555static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2538{ 2556{
2539 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2557 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2540 /* 2558 /*
2541 * The LSB of head.next can't change from under us 2559 * The LSB of head.next can't change from under us
2542 * because we hold the mm_all_locks_mutex. 2560 * because we hold the mm_all_locks_mutex.
@@ -2552,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2552 * anon_vma->root->mutex. 2570 * anon_vma->root->mutex.
2553 */ 2571 */
2554 if (__test_and_set_bit(0, (unsigned long *) 2572 if (__test_and_set_bit(0, (unsigned long *)
2555 &anon_vma->root->head.next)) 2573 &anon_vma->root->rb_root.rb_node))
2556 BUG(); 2574 BUG();
2557 } 2575 }
2558} 2576}
@@ -2593,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2593 * A single task can't take more than one mm_take_all_locks() in a row 2611 * A single task can't take more than one mm_take_all_locks() in a row
2594 * or it would deadlock. 2612 * or it would deadlock.
2595 * 2613 *
2596 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in 2614 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2597 * mapping->flags avoid to take the same lock twice, if more than one 2615 * mapping->flags avoid to take the same lock twice, if more than one
2598 * vma in this mm is backed by the same anon_vma or address_space. 2616 * vma in this mm is backed by the same anon_vma or address_space.
2599 * 2617 *
@@ -2640,13 +2658,13 @@ out_unlock:
2640 2658
2641static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2659static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2642{ 2660{
2643 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2661 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2644 /* 2662 /*
2645 * The LSB of head.next can't change to 0 from under 2663 * The LSB of head.next can't change to 0 from under
2646 * us because we hold the mm_all_locks_mutex. 2664 * us because we hold the mm_all_locks_mutex.
2647 * 2665 *
2648 * We must however clear the bitflag before unlocking 2666 * We must however clear the bitflag before unlocking
2649 * the vma so the users using the anon_vma->head will 2667 * the vma so the users using the anon_vma->rb_root will
2650 * never see our bitflag. 2668 * never see our bitflag.
2651 * 2669 *
2652 * No need of atomic instructions here, head.next 2670 * No need of atomic instructions here, head.next
@@ -2654,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2654 * anon_vma->root->mutex. 2672 * anon_vma->root->mutex.
2655 */ 2673 */
2656 if (!__test_and_clear_bit(0, (unsigned long *) 2674 if (!__test_and_clear_bit(0, (unsigned long *)
2657 &anon_vma->root->head.next)) 2675 &anon_vma->root->rb_root.rb_node))
2658 BUG(); 2676 BUG();
2659 anon_vma_unlock(anon_vma); 2677 anon_vma_unlock(anon_vma);
2660 } 2678 }
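These last mmap.c hunks only move the mm_take_all_locks() marker bit: with the anon_vma list replaced by an rb-tree, the LSB lives in anon_vma->root->rb_root.rb_node instead of anon_vma->root->head.next. The marking logic itself is unchanged; condensed, with the actual mutex acquisition (not shown in the hunk) left as a comment:

static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                /* ... take anon_vma->root->mutex under mm->mmap_sem ... */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_node))
                        BUG();  /* mm_all_locks_mutex rules out any race here */
        }
}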
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..8a5ac8c686b0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/srcu.h>
17#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20 21
22/* global SRCU for all MMs */
23static struct srcu_struct srcu;
24
21/* 25/*
22 * This function can't run concurrently against mmu_notifier_register 26 * This function can't run concurrently against mmu_notifier_register
23 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
25 * in parallel despite there being no task using this mm any more, 29 * in parallel despite there being no task using this mm any more,
26 * through the vmas outside of the exit_mmap context, such as with 30 * through the vmas outside of the exit_mmap context, such as with
27 * vmtruncate. This serializes against mmu_notifier_unregister with 31 * vmtruncate. This serializes against mmu_notifier_unregister with
28 * the mmu_notifier_mm->lock in addition to RCU and it serializes 32 * the mmu_notifier_mm->lock in addition to SRCU and it serializes
29 * against the other mmu notifiers with RCU. struct mmu_notifier_mm 33 * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
30 * can't go away from under us as exit_mmap holds an mm_count pin 34 * can't go away from under us as exit_mmap holds an mm_count pin
31 * itself. 35 * itself.
32 */ 36 */
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
34{ 38{
35 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
36 struct hlist_node *n; 40 struct hlist_node *n;
41 int id;
37 42
38 /* 43 /*
39 * RCU here will block mmu_notifier_unregister until 44 * SRCU here will block mmu_notifier_unregister until
40 * ->release returns. 45 * ->release returns.
41 */ 46 */
42 rcu_read_lock(); 47 id = srcu_read_lock(&srcu);
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) 48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /* 49 /*
45 * if ->release runs before mmu_notifier_unregister it 50 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
50 */ 55 */
51 if (mn->ops->release) 56 if (mn->ops->release)
52 mn->ops->release(mn, mm); 57 mn->ops->release(mn, mm);
53 rcu_read_unlock(); 58 srcu_read_unlock(&srcu, id);
54 59
55 spin_lock(&mm->mmu_notifier_mm->lock); 60 spin_lock(&mm->mmu_notifier_mm->lock);
56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
68 spin_unlock(&mm->mmu_notifier_mm->lock); 73 spin_unlock(&mm->mmu_notifier_mm->lock);
69 74
70 /* 75 /*
71 * synchronize_rcu here prevents mmu_notifier_release to 76 * synchronize_srcu here prevents mmu_notifier_release to
72 * return to exit_mmap (which would proceed freeing all pages 77 * return to exit_mmap (which would proceed freeing all pages
73 * in the mm) until the ->release method returns, if it was 78 * in the mm) until the ->release method returns, if it was
74 * invoked by mmu_notifier_unregister. 79 * invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
76 * The mmu_notifier_mm can't go away from under us because one 81 * The mmu_notifier_mm can't go away from under us because one
77 * mm_count is hold by exit_mmap. 82 * mm_count is hold by exit_mmap.
78 */ 83 */
79 synchronize_rcu(); 84 synchronize_srcu(&srcu);
80} 85}
81 86
82/* 87/*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
89{ 94{
90 struct mmu_notifier *mn; 95 struct mmu_notifier *mn;
91 struct hlist_node *n; 96 struct hlist_node *n;
92 int young = 0; 97 int young = 0, id;
93 98
94 rcu_read_lock(); 99 id = srcu_read_lock(&srcu);
95 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
96 if (mn->ops->clear_flush_young) 101 if (mn->ops->clear_flush_young)
97 young |= mn->ops->clear_flush_young(mn, mm, address); 102 young |= mn->ops->clear_flush_young(mn, mm, address);
98 } 103 }
99 rcu_read_unlock(); 104 srcu_read_unlock(&srcu, id);
100 105
101 return young; 106 return young;
102} 107}
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
106{ 111{
107 struct mmu_notifier *mn; 112 struct mmu_notifier *mn;
108 struct hlist_node *n; 113 struct hlist_node *n;
109 int young = 0; 114 int young = 0, id;
110 115
111 rcu_read_lock(); 116 id = srcu_read_lock(&srcu);
112 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
113 if (mn->ops->test_young) { 118 if (mn->ops->test_young) {
114 young = mn->ops->test_young(mn, mm, address); 119 young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
116 break; 121 break;
117 } 122 }
118 } 123 }
119 rcu_read_unlock(); 124 srcu_read_unlock(&srcu, id);
120 125
121 return young; 126 return young;
122} 127}
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
126{ 131{
127 struct mmu_notifier *mn; 132 struct mmu_notifier *mn;
128 struct hlist_node *n; 133 struct hlist_node *n;
134 int id;
129 135
130 rcu_read_lock(); 136 id = srcu_read_lock(&srcu);
131 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
132 if (mn->ops->change_pte) 138 if (mn->ops->change_pte)
133 mn->ops->change_pte(mn, mm, address, pte); 139 mn->ops->change_pte(mn, mm, address, pte);
134 /*
135 * Some drivers don't have change_pte,
136 * so we must call invalidate_page in that case.
137 */
138 else if (mn->ops->invalidate_page)
139 mn->ops->invalidate_page(mn, mm, address);
140 } 140 }
141 rcu_read_unlock(); 141 srcu_read_unlock(&srcu, id);
142} 142}
143 143
144void __mmu_notifier_invalidate_page(struct mm_struct *mm, 144void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
146{ 146{
147 struct mmu_notifier *mn; 147 struct mmu_notifier *mn;
148 struct hlist_node *n; 148 struct hlist_node *n;
149 int id;
149 150
150 rcu_read_lock(); 151 id = srcu_read_lock(&srcu);
151 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 152 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
152 if (mn->ops->invalidate_page) 153 if (mn->ops->invalidate_page)
153 mn->ops->invalidate_page(mn, mm, address); 154 mn->ops->invalidate_page(mn, mm, address);
154 } 155 }
155 rcu_read_unlock(); 156 srcu_read_unlock(&srcu, id);
156} 157}
157 158
158void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 159void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
160{ 161{
161 struct mmu_notifier *mn; 162 struct mmu_notifier *mn;
162 struct hlist_node *n; 163 struct hlist_node *n;
164 int id;
163 165
164 rcu_read_lock(); 166 id = srcu_read_lock(&srcu);
165 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 167 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
166 if (mn->ops->invalidate_range_start) 168 if (mn->ops->invalidate_range_start)
167 mn->ops->invalidate_range_start(mn, mm, start, end); 169 mn->ops->invalidate_range_start(mn, mm, start, end);
168 } 170 }
169 rcu_read_unlock(); 171 srcu_read_unlock(&srcu, id);
170} 172}
171 173
172void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
174{ 176{
175 struct mmu_notifier *mn; 177 struct mmu_notifier *mn;
176 struct hlist_node *n; 178 struct hlist_node *n;
179 int id;
177 180
178 rcu_read_lock(); 181 id = srcu_read_lock(&srcu);
179 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 182 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
180 if (mn->ops->invalidate_range_end) 183 if (mn->ops->invalidate_range_end)
181 mn->ops->invalidate_range_end(mn, mm, start, end); 184 mn->ops->invalidate_range_end(mn, mm, start, end);
182 } 185 }
183 rcu_read_unlock(); 186 srcu_read_unlock(&srcu, id);
184} 187}
185 188
186static int do_mmu_notifier_register(struct mmu_notifier *mn, 189static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +195,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
192 195
193 BUG_ON(atomic_read(&mm->mm_users) <= 0); 196 BUG_ON(atomic_read(&mm->mm_users) <= 0);
194 197
198 /*
199 * Verify that mmu_notifier_init() already run and the global srcu is
200 * initialized.
201 */
202 BUG_ON(!srcu.per_cpu_ref);
203
195 ret = -ENOMEM; 204 ret = -ENOMEM;
196 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 205 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
197 if (unlikely(!mmu_notifier_mm)) 206 if (unlikely(!mmu_notifier_mm))
@@ -201,11 +210,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
201 down_write(&mm->mmap_sem); 210 down_write(&mm->mmap_sem);
202 ret = mm_take_all_locks(mm); 211 ret = mm_take_all_locks(mm);
203 if (unlikely(ret)) 212 if (unlikely(ret))
204 goto out_cleanup; 213 goto out_clean;
205 214
206 if (!mm_has_notifiers(mm)) { 215 if (!mm_has_notifiers(mm)) {
207 INIT_HLIST_HEAD(&mmu_notifier_mm->list); 216 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
208 spin_lock_init(&mmu_notifier_mm->lock); 217 spin_lock_init(&mmu_notifier_mm->lock);
218
209 mm->mmu_notifier_mm = mmu_notifier_mm; 219 mm->mmu_notifier_mm = mmu_notifier_mm;
210 mmu_notifier_mm = NULL; 220 mmu_notifier_mm = NULL;
211 } 221 }
@@ -224,10 +234,9 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
224 spin_unlock(&mm->mmu_notifier_mm->lock); 234 spin_unlock(&mm->mmu_notifier_mm->lock);
225 235
226 mm_drop_all_locks(mm); 236 mm_drop_all_locks(mm);
227out_cleanup: 237out_clean:
228 if (take_mmap_sem) 238 if (take_mmap_sem)
229 up_write(&mm->mmap_sem); 239 up_write(&mm->mmap_sem);
230 /* kfree() does nothing if mmu_notifier_mm is NULL */
231 kfree(mmu_notifier_mm); 240 kfree(mmu_notifier_mm);
232out: 241out:
233 BUG_ON(atomic_read(&mm->mm_users) <= 0); 242 BUG_ON(atomic_read(&mm->mm_users) <= 0);
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
274/* 283/*
275 * This releases the mm_count pin automatically and frees the mm 284 * This releases the mm_count pin automatically and frees the mm
276 * structure if it was the last user of it. It serializes against 285 * structure if it was the last user of it. It serializes against
277 * running mmu notifiers with RCU and against mmu_notifier_unregister 286 * running mmu notifiers with SRCU and against mmu_notifier_unregister
278 * with the unregister lock + RCU. All sptes must be dropped before 287 * with the unregister lock + SRCU. All sptes must be dropped before
279 * calling mmu_notifier_unregister. ->release or any other notifier 288 * calling mmu_notifier_unregister. ->release or any other notifier
280 * method may be invoked concurrently with mmu_notifier_unregister, 289 * method may be invoked concurrently with mmu_notifier_unregister,
281 * and only after mmu_notifier_unregister returned we're guaranteed 290 * and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
287 296
288 if (!hlist_unhashed(&mn->hlist)) { 297 if (!hlist_unhashed(&mn->hlist)) {
289 /* 298 /*
290 * RCU here will force exit_mmap to wait ->release to finish 299 * SRCU here will force exit_mmap to wait ->release to finish
291 * before freeing the pages. 300 * before freeing the pages.
292 */ 301 */
293 rcu_read_lock(); 302 int id;
294 303
304 id = srcu_read_lock(&srcu);
295 /* 305 /*
296 * exit_mmap will block in mmu_notifier_release to 306 * exit_mmap will block in mmu_notifier_release to
297 * guarantee ->release is called before freeing the 307 * guarantee ->release is called before freeing the
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
299 */ 309 */
300 if (mn->ops->release) 310 if (mn->ops->release)
301 mn->ops->release(mn, mm); 311 mn->ops->release(mn, mm);
302 rcu_read_unlock(); 312 srcu_read_unlock(&srcu, id);
303 313
304 spin_lock(&mm->mmu_notifier_mm->lock); 314 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist); 315 hlist_del_rcu(&mn->hlist);
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
310 * Wait any running method to finish, of course including 320 * Wait any running method to finish, of course including
311 * ->release if it was run by mmu_notifier_relase instead of us. 321 * ->release if it was run by mmu_notifier_relase instead of us.
312 */ 322 */
313 synchronize_rcu(); 323 synchronize_srcu(&srcu);
314 324
315 BUG_ON(atomic_read(&mm->mm_count) <= 0); 325 BUG_ON(atomic_read(&mm->mm_count) <= 0);
316 326
317 mmdrop(mm); 327 mmdrop(mm);
318} 328}
319EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 329EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
330
331static int __init mmu_notifier_init(void)
332{
333 return init_srcu_struct(&srcu);
334}
335
336module_init(mmu_notifier_init);
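For context on the conversion above: SRCU provides the same publish/retire semantics as RCU but allows the read side to sleep, which is what lets ->release() and the other notifier callbacks run inside the protected section. A minimal self-contained sketch of the pattern, using hypothetical names (my_srcu, my_reader, my_retire) rather than the notifier code itself:

#include <linux/module.h>
#include <linux/srcu.h>

static struct srcu_struct my_srcu;

static void my_reader(void)
{
        int idx;

        idx = srcu_read_lock(&my_srcu);         /* read side may sleep */
        /* ... walk an hlist_for_each_entry_rcu() list, invoke callbacks ... */
        srcu_read_unlock(&my_srcu, idx);
}

static void my_retire(void)
{
        /* after hlist_del_rcu() (or similar) unpublishes the element: */
        synchronize_srcu(&my_srcu);             /* wait out current readers */
        /* the element can now be freed safely */
}

static int __init my_init(void)
{
        return init_srcu_struct(&my_srcu);      /* mirrors mmu_notifier_init() */
}
module_init(my_init);

static void __exit my_exit(void)
{
        cleanup_srcu_struct(&my_srcu);
}
module_exit(my_exit);

MODULE_LICENSE("GPL");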
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..1b61c2d3307a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, 71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
72 unsigned long old_addr, unsigned long old_end, 72 unsigned long old_addr, unsigned long old_end,
73 struct vm_area_struct *new_vma, pmd_t *new_pmd, 73 struct vm_area_struct *new_vma, pmd_t *new_pmd,
74 unsigned long new_addr) 74 unsigned long new_addr, bool need_rmap_locks)
75{ 75{
76 struct address_space *mapping = NULL; 76 struct address_space *mapping = NULL;
77 struct anon_vma *anon_vma = NULL;
77 struct mm_struct *mm = vma->vm_mm; 78 struct mm_struct *mm = vma->vm_mm;
78 pte_t *old_pte, *new_pte, pte; 79 pte_t *old_pte, *new_pte, pte;
79 spinlock_t *old_ptl, *new_ptl; 80 spinlock_t *old_ptl, *new_ptl;
80 81
81 if (vma->vm_file) { 82 /*
82 /* 83 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
83 * Subtle point from Rajesh Venkatasubramanian: before 84 * locks to ensure that rmap will always observe either the old or the
84 * moving file-based ptes, we must lock truncate_pagecache 85 * new ptes. This is the easiest way to avoid races with
85 * out, since it might clean the dst vma before the src vma, 86 * truncate_pagecache(), page migration, etc...
86 * and we propagate stale pages into the dst afterward. 87 *
87 */ 88 * When need_rmap_locks is false, we use other ways to avoid
88 mapping = vma->vm_file->f_mapping; 89 * such races:
89 mutex_lock(&mapping->i_mmap_mutex); 90 *
91 * - During exec() shift_arg_pages(), we use a specially tagged vma
92 * which rmap call sites look for using is_vma_temporary_stack().
93 *
94 * - During mremap(), new_vma is often known to be placed after vma
95 * in rmap traversal order. This ensures rmap will always observe
96 * either the old pte, or the new pte, or both (the page table locks
97 * serialize access to individual ptes, but only rmap traversal
98 * order guarantees that we won't miss both the old and new ptes).
99 */
100 if (need_rmap_locks) {
101 if (vma->vm_file) {
102 mapping = vma->vm_file->f_mapping;
103 mutex_lock(&mapping->i_mmap_mutex);
104 }
105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma);
108 }
90 } 109 }
91 110
92 /* 111 /*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
114 spin_unlock(new_ptl); 133 spin_unlock(new_ptl);
115 pte_unmap(new_pte - 1); 134 pte_unmap(new_pte - 1);
116 pte_unmap_unlock(old_pte - 1, old_ptl); 135 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma)
137 anon_vma_unlock(anon_vma);
117 if (mapping) 138 if (mapping)
118 mutex_unlock(&mapping->i_mmap_mutex); 139 mutex_unlock(&mapping->i_mmap_mutex);
119} 140}
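The new comment boils down to: take the rmap locks only when the caller cannot otherwise guarantee that an rmap walk will observe either the old or the new ptes. Stripped to its locking skeleton, the reworked move_ptes() does:

        struct address_space *mapping = NULL;
        struct anon_vma *anon_vma = NULL;

        if (need_rmap_locks) {
                if (vma->vm_file) {
                        mapping = vma->vm_file->f_mapping;
                        mutex_lock(&mapping->i_mmap_mutex);
                }
                if (vma->anon_vma) {
                        anon_vma = vma->anon_vma;
                        anon_vma_lock(anon_vma);
                }
        }

        /* ... copy the ptes under the old/new page table locks ... */

        if (anon_vma)
                anon_vma_unlock(anon_vma);
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);

copy_vma() decides the flag (true when new_vma->vm_pgoff <= vma->vm_pgoff, i.e. when rmap traversal order alone is not enough; false for a freshly allocated vma), and move_vma() passes true on its error-rollback call.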
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
122 143
123unsigned long move_page_tables(struct vm_area_struct *vma, 144unsigned long move_page_tables(struct vm_area_struct *vma,
124 unsigned long old_addr, struct vm_area_struct *new_vma, 145 unsigned long old_addr, struct vm_area_struct *new_vma,
125 unsigned long new_addr, unsigned long len) 146 unsigned long new_addr, unsigned long len,
147 bool need_rmap_locks)
126{ 148{
127 unsigned long extent, next, old_end; 149 unsigned long extent, next, old_end;
128 pmd_t *old_pmd, *new_pmd; 150 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false; 151 bool need_flush = false;
152 unsigned long mmun_start; /* For mmu_notifiers */
153 unsigned long mmun_end; /* For mmu_notifiers */
130 154
131 old_end = old_addr + len; 155 old_end = old_addr + len;
132 flush_cache_range(vma, old_addr, old_end); 156 flush_cache_range(vma, old_addr, old_end);
133 157
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); 158 mmun_start = old_addr;
159 mmun_end = old_end;
160 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
135 161
136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 162 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
137 cond_resched(); 163 cond_resched();
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
169 if (extent > LATENCY_LIMIT) 195 if (extent > LATENCY_LIMIT)
170 extent = LATENCY_LIMIT; 196 extent = LATENCY_LIMIT;
171 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 197 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
172 new_vma, new_pmd, new_addr); 198 new_vma, new_pmd, new_addr, need_rmap_locks);
173 need_flush = true; 199 need_flush = true;
174 } 200 }
175 if (likely(need_flush)) 201 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr); 202 flush_tlb_range(vma, old_end-len, old_addr);
177 203
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); 204 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
179 205
180 return len + old_addr - old_end; /* how much done */ 206 return len + old_addr - old_end; /* how much done */
181} 207}
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
193 unsigned long hiwater_vm; 219 unsigned long hiwater_vm;
194 int split = 0; 220 int split = 0;
195 int err; 221 int err;
222 bool need_rmap_locks;
196 223
197 /* 224 /*
198 * We'd prefer to avoid failure later on in do_munmap: 225 * We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
214 return err; 241 return err;
215 242
216 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 243 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
217 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 244 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
245 &need_rmap_locks);
218 if (!new_vma) 246 if (!new_vma)
219 return -ENOMEM; 247 return -ENOMEM;
220 248
221 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); 249 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
250 need_rmap_locks);
222 if (moved_len < old_len) { 251 if (moved_len < old_len) {
223 /* 252 /*
224 * Before moving the page tables from the new vma to
225 * the old vma, we need to be sure the old vma is
226 * queued after new vma in the same_anon_vma list to
227 * prevent SMP races with rmap_walk (that could lead
228 * rmap_walk to miss some page table).
229 */
230 anon_vma_moveto_tail(vma);
231
232 /*
233 * On error, move entries back from new area to old, 253 * On error, move entries back from new area to old,
234 * which will succeed since page tables still there, 254 * which will succeed since page tables still there,
235 * and then proceed to unmap new area instead of old. 255 * and then proceed to unmap new area instead of old.
236 */ 256 */
237 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); 257 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
258 true);
238 vma = new_vma; 259 vma = new_vma;
239 old_len = new_len; 260 old_len = new_len;
240 old_addr = new_addr; 261 old_addr = new_addr;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f99..714d5d650470 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
116 return 0; 116 return 0;
117 117
118 __free_pages_memory(start_pfn, end_pfn); 118 __free_pages_memory(start_pfn, end_pfn);
119 fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
120 start_pfn, end_pfn);
119 121
120 return end_pfn - start_pfn; 122 return end_pfn - start_pfn;
121} 123}
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
126 phys_addr_t start, end, size; 128 phys_addr_t start, end, size;
127 u64 i; 129 u64 i;
128 130
131 reset_zone_present_pages();
129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 132 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
130 count += __free_memory_core(start, end); 133 count += __free_memory_core(start, end);
131 134
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void)
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 165 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 166 * because in some case like Node0 doesn't have RAM installed
164 * low ram will be on Node1 167 * low ram will be on Node1
165 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
166 * will be used instead of only Node0 related
167 */ 168 */
168 return free_low_memory_core_early(MAX_NUMNODES); 169 return free_low_memory_core_early(MAX_NUMNODES);
169} 170}
diff --git a/mm/nommu.c b/mm/nommu.c
index d4b0c10872de..45131b41bcdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
698 698
699 mutex_lock(&mapping->i_mmap_mutex); 699 mutex_lock(&mapping->i_mmap_mutex);
700 flush_dcache_mmap_lock(mapping); 700 flush_dcache_mmap_lock(mapping);
701 vma_prio_tree_insert(vma, &mapping->i_mmap); 701 vma_interval_tree_insert(vma, &mapping->i_mmap);
702 flush_dcache_mmap_unlock(mapping); 702 flush_dcache_mmap_unlock(mapping);
703 mutex_unlock(&mapping->i_mmap_mutex); 703 mutex_unlock(&mapping->i_mmap_mutex);
704 } 704 }
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
764 764
765 mutex_lock(&mapping->i_mmap_mutex); 765 mutex_lock(&mapping->i_mmap_mutex);
766 flush_dcache_mmap_lock(mapping); 766 flush_dcache_mmap_lock(mapping);
767 vma_prio_tree_remove(vma, &mapping->i_mmap); 767 vma_interval_tree_remove(vma, &mapping->i_mmap);
768 flush_dcache_mmap_unlock(mapping); 768 flush_dcache_mmap_unlock(mapping);
769 mutex_unlock(&mapping->i_mmap_mutex); 769 mutex_unlock(&mapping->i_mmap_mutex);
770 } 770 }
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
789 kenter("%p", vma); 789 kenter("%p", vma);
790 if (vma->vm_ops && vma->vm_ops->close) 790 if (vma->vm_ops && vma->vm_ops->close)
791 vma->vm_ops->close(vma); 791 vma->vm_ops->close(vma);
792 if (vma->vm_file) { 792 if (vma->vm_file)
793 fput(vma->vm_file); 793 fput(vma->vm_file);
794 if (vma->vm_flags & VM_EXECUTABLE)
795 removed_exe_file_vma(mm);
796 }
797 put_nommu_region(vma->vm_region); 794 put_nommu_region(vma->vm_region);
798 kmem_cache_free(vm_area_cachep, vma); 795 kmem_cache_free(vm_area_cachep, vma);
799} 796}
@@ -1282,14 +1279,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1282 vma->vm_pgoff = pgoff; 1279 vma->vm_pgoff = pgoff;
1283 1280
1284 if (file) { 1281 if (file) {
1285 region->vm_file = file; 1282 region->vm_file = get_file(file);
1286 get_file(file); 1283 vma->vm_file = get_file(file);
1287 vma->vm_file = file;
1288 get_file(file);
1289 if (vm_flags & VM_EXECUTABLE) {
1290 added_exe_file_vma(current->mm);
1291 vma->vm_mm = current->mm;
1292 }
1293 } 1284 }
1294 1285
1295 down_write(&nommu_region_sem); 1286 down_write(&nommu_region_sem);
@@ -1442,8 +1433,6 @@ error:
1442 kmem_cache_free(vm_region_jar, region); 1433 kmem_cache_free(vm_region_jar, region);
1443 if (vma->vm_file) 1434 if (vma->vm_file)
1444 fput(vma->vm_file); 1435 fput(vma->vm_file);
1445 if (vma->vm_flags & VM_EXECUTABLE)
1446 removed_exe_file_vma(vma->vm_mm);
1447 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1448 kleave(" = %d", ret); 1437 kleave(" = %d", ret);
1449 return ret; 1438 return ret;
@@ -1822,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1822 if (addr != (pfn << PAGE_SHIFT)) 1811 if (addr != (pfn << PAGE_SHIFT))
1823 return -EINVAL; 1812 return -EINVAL;
1824 1813
1825 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1814 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1826 return 0; 1815 return 0;
1827} 1816}
1828EXPORT_SYMBOL(remap_pfn_range); 1817EXPORT_SYMBOL(remap_pfn_range);
@@ -1963,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1963} 1952}
1964EXPORT_SYMBOL(filemap_fault); 1953EXPORT_SYMBOL(filemap_fault);
1965 1954
1955int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
1956 unsigned long size, pgoff_t pgoff)
1957{
1958 BUG();
1959 return 0;
1960}
1961EXPORT_SYMBOL(generic_file_remap_pages);
1962
1966static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 1963static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1967 unsigned long addr, void *buf, int len, int write) 1964 unsigned long addr, void *buf, int len, int write)
1968{ 1965{
@@ -2047,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2047 size_t newsize) 2044 size_t newsize)
2048{ 2045{
2049 struct vm_area_struct *vma; 2046 struct vm_area_struct *vma;
2050 struct prio_tree_iter iter;
2051 struct vm_region *region; 2047 struct vm_region *region;
2052 pgoff_t low, high; 2048 pgoff_t low, high;
2053 size_t r_size, r_top; 2049 size_t r_size, r_top;
@@ -2059,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2059 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2055 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2060 2056
2061 /* search for VMAs that fall within the dead zone */ 2057 /* search for VMAs that fall within the dead zone */
2062 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2058 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2063 low, high) {
2064 /* found one - only interested if it's shared out of the page 2059 /* found one - only interested if it's shared out of the page
2065 * cache */ 2060 * cache */
2066 if (vma->vm_flags & VM_SHARED) { 2061 if (vma->vm_flags & VM_SHARED) {
@@ -2076,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2076 * we don't check for any regions that start beyond the EOF as there 2071 * we don't check for any regions that start beyond the EOF as there
2077 * shouldn't be any 2072 * shouldn't be any
2078 */ 2073 */
2079 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2074 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
2080 0, ULONG_MAX) { 2075 0, ULONG_MAX) {
2081 if (!(vma->vm_flags & VM_SHARED)) 2076 if (!(vma->vm_flags & VM_SHARED))
2082 continue; 2077 continue;
2083 2078
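For illustration only, the file-reference pattern in the nommu.c hunks above (get_file() takes a reference and returns the file, so the assignment and the get collapse into one expression) can be sketched in plain userspace C. Everything below is an invented analogue, not kernel code.

#include <stdio.h>

struct my_file {
    int refcount;
    const char *name;
};

/* Analogue of get_file(): take a reference and hand the object back, so a
 * caller can write "region->file = my_get(f);" in a single expression. */
static struct my_file *my_get(struct my_file *f)
{
    f->refcount++;
    return f;
}

static void my_put(struct my_file *f)
{
    f->refcount--;
}

int main(void)
{
    struct my_file f = { .refcount = 1, .name = "example" };
    struct my_file *region_file = my_get(&f);   /* reference held by the region */
    struct my_file *vma_file = my_get(&f);      /* reference held by the vma */

    printf("%s refcount = %d\n", f.name, f.refcount);   /* prints "example refcount = 3" */

    my_put(vma_file);
    my_put(region_file);
    my_put(&f);
    return 0;
}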
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 198600861638..79e0f3e24831 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 428{
429 task_lock(current); 429 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_adj=%d, oom_score_adj=%d\n", 431 "oom_score_adj=%d\n",
432 current->comm, gfp_mask, order, current->signal->oom_adj, 432 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 433 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 434 cpuset_print_task_mems_allowed(current);
435 task_unlock(current); 435 task_unlock(current);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5ad5ce23c1e0..830893b2b3c7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1602,10 +1602,18 @@ void writeback_set_ratelimit(void)
1602} 1602}
1603 1603
1604static int __cpuinit 1604static int __cpuinit
1605ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) 1605ratelimit_handler(struct notifier_block *self, unsigned long action,
1606 void *hcpu)
1606{ 1607{
1607 writeback_set_ratelimit(); 1608
1608 return NOTIFY_DONE; 1609 switch (action & ~CPU_TASKS_FROZEN) {
1610 case CPU_ONLINE:
1611 case CPU_DEAD:
1612 writeback_set_ratelimit();
1613 return NOTIFY_OK;
1614 default:
1615 return NOTIFY_DONE;
1616 }
1609} 1617}
1610 1618
1611static struct notifier_block __cpuinitdata ratelimit_nb = { 1619static struct notifier_block __cpuinitdata ratelimit_nb = {
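The new ratelimit_handler() above recomputes the writeback ratelimit only on CPU online/dead events, stripping the CPU_TASKS_FROZEN modifier so the suspend/resume variants are treated the same way. A minimal standalone sketch of that filtering follows; the constants are stand-ins rather than the kernel's headers.

#include <stdio.h>

#define CPU_ONLINE        0x0002
#define CPU_DEAD          0x0007
#define CPU_TASKS_FROZEN  0x0010

#define NOTIFY_DONE 0
#define NOTIFY_OK   1

/* Strip the "tasks frozen" modifier, then react only to online/dead. */
static int ratelimit_handler(unsigned long action)
{
    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_ONLINE:
    case CPU_DEAD:
        printf("recomputing writeback ratelimit (action 0x%lx)\n", action);
        return NOTIFY_OK;
    default:
        return NOTIFY_DONE;     /* ignore CPU_UP_PREPARE and friends */
    }
}

int main(void)
{
    ratelimit_handler(CPU_ONLINE);                    /* handled */
    ratelimit_handler(CPU_DEAD | CPU_TASKS_FROZEN);   /* handled, frozen variant */
    ratelimit_handler(0x0003);                        /* some other action: ignored */
    return 0;
}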
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea7538891..5b74de6702e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page,
558 if (page_is_guard(buddy)) { 558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy); 559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0); 560 set_page_private(page, 0);
561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 561 __mod_zone_freepage_state(zone, 1 << order,
562 migratetype);
562 } else { 563 } else {
563 list_del(&buddy->lru); 564 list_del(&buddy->lru);
564 zone->free_area[order].nr_free--; 565 zone->free_area[order].nr_free--;
@@ -597,17 +598,6 @@ out:
597 zone->free_area[order].nr_free++; 598 zone->free_area[order].nr_free++;
598} 599}
599 600
600/*
601 * free_page_mlock() -- clean up attempts to free and mlocked() page.
602 * Page should not be on lru, so no need to fix that up.
603 * free_pages_check() will verify...
604 */
605static inline void free_page_mlock(struct page *page)
606{
607 __dec_zone_page_state(page, NR_MLOCK);
608 __count_vm_event(UNEVICTABLE_MLOCKFREED);
609}
610
611static inline int free_pages_check(struct page *page) 601static inline int free_pages_check(struct page *page)
612{ 602{
613 if (unlikely(page_mapcount(page) | 603 if (unlikely(page_mapcount(page) |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count,
668 batch_free = to_free; 658 batch_free = to_free;
669 659
670 do { 660 do {
661 int mt; /* migratetype of the to-be-freed page */
662
671 page = list_entry(list->prev, struct page, lru); 663 page = list_entry(list->prev, struct page, lru);
672 /* must delete as __free_one_page list manipulates */ 664 /* must delete as __free_one_page list manipulates */
673 list_del(&page->lru); 665 list_del(&page->lru);
666 mt = get_freepage_migratetype(page);
674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
675 __free_one_page(page, zone, 0, page_private(page)); 668 __free_one_page(page, zone, 0, mt);
676 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
677 } while (--to_free && --batch_free && !list_empty(list)); 672 } while (--to_free && --batch_free && !list_empty(list));
678 } 673 }
679 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
688 zone->pages_scanned = 0; 683 zone->pages_scanned = 0;
689 684
690 __free_one_page(page, zone, order, migratetype); 685 __free_one_page(page, zone, order, migratetype);
691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 686 if (unlikely(migratetype != MIGRATE_ISOLATE))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype);
692 spin_unlock(&zone->lock); 688 spin_unlock(&zone->lock);
693} 689}
694 690
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721static void __free_pages_ok(struct page *page, unsigned int order) 717static void __free_pages_ok(struct page *page, unsigned int order)
722{ 718{
723 unsigned long flags; 719 unsigned long flags;
724 int wasMlocked = __TestClearPageMlocked(page); 720 int migratetype;
725 721
726 if (!free_pages_prepare(page, order)) 722 if (!free_pages_prepare(page, order))
727 return; 723 return;
728 724
729 local_irq_save(flags); 725 local_irq_save(flags);
730 if (unlikely(wasMlocked))
731 free_page_mlock(page);
732 __count_vm_events(PGFREE, 1 << order); 726 __count_vm_events(PGFREE, 1 << order);
733 free_one_page(page_zone(page), page, order, 727 migratetype = get_pageblock_migratetype(page);
734 get_pageblock_migratetype(page)); 728 set_freepage_migratetype(page, migratetype);
729 free_one_page(page_zone(page), page, order, migratetype);
735 local_irq_restore(flags); 730 local_irq_restore(flags);
736} 731}
737 732
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page,
811 set_page_guard_flag(&page[size]); 806 set_page_guard_flag(&page[size]);
812 set_page_private(&page[size], high); 807 set_page_private(&page[size], high);
813 /* Guard pages are not available for any usage */ 808 /* Guard pages are not available for any usage */
814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 809 __mod_zone_freepage_state(zone, -(1 << high),
810 migratetype);
815 continue; 811 continue;
816 } 812 }
817#endif 813#endif
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
915 * Note that start_page and end_pages are not aligned on a pageblock 911 * Note that start_page and end_pages are not aligned on a pageblock
916 * boundary. If alignment is required, use move_freepages_block() 912 * boundary. If alignment is required, use move_freepages_block()
917 */ 913 */
918static int move_freepages(struct zone *zone, 914int move_freepages(struct zone *zone,
919 struct page *start_page, struct page *end_page, 915 struct page *start_page, struct page *end_page,
920 int migratetype) 916 int migratetype)
921{ 917{
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone,
951 order = page_order(page); 947 order = page_order(page);
952 list_move(&page->lru, 948 list_move(&page->lru,
953 &zone->free_area[order].free_list[migratetype]); 949 &zone->free_area[order].free_list[migratetype]);
950 set_freepage_migratetype(page, migratetype);
954 page += 1 << order; 951 page += 1 << order;
955 pages_moved += 1 << order; 952 pages_moved += 1 << order;
956 } 953 }
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1132 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype; 1133 mt = migratetype;
1137 } 1134 }
1138 set_page_private(page, mt); 1135 set_freepage_migratetype(page, mt);
1139 list = &page->lru; 1136 list = &page->lru;
1137 if (is_migrate_cma(mt))
1138 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1139 -(1 << order));
1140 } 1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock); 1142 spin_unlock(&zone->lock);
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold)
1296 struct per_cpu_pages *pcp; 1296 struct per_cpu_pages *pcp;
1297 unsigned long flags; 1297 unsigned long flags;
1298 int migratetype; 1298 int migratetype;
1299 int wasMlocked = __TestClearPageMlocked(page);
1300 1299
1301 if (!free_pages_prepare(page, 0)) 1300 if (!free_pages_prepare(page, 0))
1302 return; 1301 return;
1303 1302
1304 migratetype = get_pageblock_migratetype(page); 1303 migratetype = get_pageblock_migratetype(page);
1305 set_page_private(page, migratetype); 1304 set_freepage_migratetype(page, migratetype);
1306 local_irq_save(flags); 1305 local_irq_save(flags);
1307 if (unlikely(wasMlocked))
1308 free_page_mlock(page);
1309 __count_vm_event(PGFREE); 1306 __count_vm_event(PGFREE);
1310 1307
1311 /* 1308 /*
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order)
1380} 1377}
1381 1378
1382/* 1379/*
1383 * Similar to split_page except the page is already free. As this is only 1380 * Similar to the split_page family of functions except that the page is
1384 * being used for migration, the migratetype of the block also changes. 1381 * required at the given order and is being isolated now to prevent races
1385 * As this is called with interrupts disabled, the caller is responsible 1382 * with parallel allocators.
1386 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1387 * are enabled.
1388 *
1389 * Note: this is probably too low level an operation for use in drivers.
1390 * Please consult with lkml before using this in your driver.
1391 */ 1383 */
1392int split_free_page(struct page *page) 1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1393{ 1385{
1394 unsigned int order; 1386 unsigned int order;
1395 unsigned long watermark; 1387 unsigned long watermark;
1396 struct zone *zone; 1388 struct zone *zone;
1389 int mt;
1397 1390
1398 BUG_ON(!PageBuddy(page)); 1391 BUG_ON(!PageBuddy(page));
1399 1392
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page)
1409 list_del(&page->lru); 1402 list_del(&page->lru);
1410 zone->free_area[order].nr_free--; 1403 zone->free_area[order].nr_free--;
1411 rmv_page_order(page); 1404 rmv_page_order(page);
1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1413 1405
1414 /* Split into individual pages */ 1406 mt = get_pageblock_migratetype(page);
1415 set_page_refcounted(page); 1407 if (unlikely(mt != MIGRATE_ISOLATE))
1416 split_page(page, order); 1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1417 1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1418 if (order >= pageblock_order - 1) { 1415 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1416 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1417 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page)
1425 } 1422 }
1426 } 1423 }
1427 1424
1428 return 1 << order; 1425 return 1UL << order;
1426}
1427
1428/*
1429 * Similar to split_page except the page is already free. As this is only
1430 * being used for migration, the migratetype of the block also changes.
1431 * As this is called with interrupts disabled, the caller is responsible
1432 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1433 * are enabled.
1434 *
1435 * Note: this is probably too low level an operation for use in drivers.
1436 * Please consult with lkml before using this in your driver.
1437 */
1438int split_free_page(struct page *page)
1439{
1440 unsigned int order;
1441 int nr_pages;
1442
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page);
1445
1446 nr_pages = capture_free_page(page, order, 0);
1447 if (!nr_pages)
1448 return 0;
1449
1450 /* Split into individual pages */
1451 set_page_refcounted(page);
1452 split_page(page, order);
1453 return nr_pages;
1429} 1454}
1430 1455
1431/* 1456/*
@@ -1484,7 +1509,8 @@ again:
1484 spin_unlock(&zone->lock); 1509 spin_unlock(&zone->lock);
1485 if (!page) 1510 if (!page)
1486 goto failed; 1511 goto failed;
1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1512 __mod_zone_freepage_state(zone, -(1 << order),
1513 get_pageblock_migratetype(page));
1488 } 1514 }
1489 1515
1490 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1516 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1501,19 +1527,6 @@ failed:
1501 return NULL; 1527 return NULL;
1502} 1528}
1503 1529
1504/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1505#define ALLOC_WMARK_MIN WMARK_MIN
1506#define ALLOC_WMARK_LOW WMARK_LOW
1507#define ALLOC_WMARK_HIGH WMARK_HIGH
1508#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1509
1510/* Mask to get the watermark bits */
1511#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1512
1513#define ALLOC_HARDER 0x10 /* try to alloc harder */
1514#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1515#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1516
1517#ifdef CONFIG_FAIL_PAGE_ALLOC 1530#ifdef CONFIG_FAIL_PAGE_ALLOC
1518 1531
1519static struct { 1532static struct {
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1608 min -= min / 2; 1621 min -= min / 2;
1609 if (alloc_flags & ALLOC_HARDER) 1622 if (alloc_flags & ALLOC_HARDER)
1610 min -= min / 4; 1623 min -= min / 4;
1611 1624#ifdef CONFIG_CMA
1625 /* If allocation can't use CMA areas don't use free CMA pages */
1626 if (!(alloc_flags & ALLOC_CMA))
1627 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1628#endif
1612 if (free_pages <= min + lowmem_reserve) 1629 if (free_pages <= min + lowmem_reserve)
1613 return false; 1630 return false;
1614 for (o = 0; o < order; o++) { 1631 for (o = 0; o < order; o++) {
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1799 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1783} 1800}
1784 1801
1802static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1803{
1804 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1805}
1806
1807static void __paginginit init_zone_allows_reclaim(int nid)
1808{
1809 int i;
1810
1811 for_each_online_node(i)
1812 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1813 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1814 else
1815 zone_reclaim_mode = 1;
1816}
1817
1785#else /* CONFIG_NUMA */ 1818#else /* CONFIG_NUMA */
1786 1819
1787static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1820static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1802static void zlc_clear_zones_full(struct zonelist *zonelist) 1835static void zlc_clear_zones_full(struct zonelist *zonelist)
1803{ 1836{
1804} 1837}
1838
1839static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1840{
1841 return true;
1842}
1843
1844static inline void init_zone_allows_reclaim(int nid)
1845{
1846}
1805#endif /* CONFIG_NUMA */ 1847#endif /* CONFIG_NUMA */
1806 1848
1807/* 1849/*
@@ -1886,7 +1928,8 @@ zonelist_scan:
1886 did_zlc_setup = 1; 1928 did_zlc_setup = 1;
1887 } 1929 }
1888 1930
1889 if (zone_reclaim_mode == 0) 1931 if (zone_reclaim_mode == 0 ||
1932 !zone_allows_reclaim(preferred_zone, zone))
1890 goto this_zone_full; 1933 goto this_zone_full;
1891 1934
1892 /* 1935 /*
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2105 bool *contended_compaction, bool *deferred_compaction, 2148 bool *contended_compaction, bool *deferred_compaction,
2106 unsigned long *did_some_progress) 2149 unsigned long *did_some_progress)
2107{ 2150{
2108 struct page *page; 2151 struct page *page = NULL;
2109 2152
2110 if (!order) 2153 if (!order)
2111 return NULL; 2154 return NULL;
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2118 current->flags |= PF_MEMALLOC; 2161 current->flags |= PF_MEMALLOC;
2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2120 nodemask, sync_migration, 2163 nodemask, sync_migration,
2121 contended_compaction); 2164 contended_compaction, &page);
2122 current->flags &= ~PF_MEMALLOC; 2165 current->flags &= ~PF_MEMALLOC;
2123 if (*did_some_progress != COMPACT_SKIPPED) {
2124 2166
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) {
2125 /* Page migration frees to the PCP lists but we want merging */ 2174 /* Page migration frees to the PCP lists but we want merging */
2126 drain_pages(get_cpu()); 2175 drain_pages(get_cpu());
2127 put_cpu(); 2176 put_cpu();
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2131 alloc_flags & ~ALLOC_NO_WATERMARKS, 2180 alloc_flags & ~ALLOC_NO_WATERMARKS,
2132 preferred_zone, migratetype); 2181 preferred_zone, migratetype);
2133 if (page) { 2182 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false;
2134 preferred_zone->compact_considered = 0; 2185 preferred_zone->compact_considered = 0;
2135 preferred_zone->compact_defer_shift = 0; 2186 preferred_zone->compact_defer_shift = 0;
2136 if (order >= preferred_zone->compact_order_failed) 2187 if (order >= preferred_zone->compact_order_failed)
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2315 unlikely(test_thread_flag(TIF_MEMDIE)))) 2366 unlikely(test_thread_flag(TIF_MEMDIE))))
2316 alloc_flags |= ALLOC_NO_WATERMARKS; 2367 alloc_flags |= ALLOC_NO_WATERMARKS;
2317 } 2368 }
2318 2369#ifdef CONFIG_CMA
2370 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2371 alloc_flags |= ALLOC_CMA;
2372#endif
2319 return alloc_flags; 2373 return alloc_flags;
2320} 2374}
2321 2375
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2362 goto nopage; 2416 goto nopage;
2363 2417
2364restart: 2418restart:
2365 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2419 wake_all_kswapd(order, zonelist, high_zoneidx,
2366 wake_all_kswapd(order, zonelist, high_zoneidx, 2420 zone_idx(preferred_zone));
2367 zone_idx(preferred_zone));
2368 2421
2369 /* 2422 /*
2370 * OK, we're below the kswapd watermark and have kicked background 2423 * OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2494,7 @@ rebalance:
2441 * system then fail the allocation instead of entering direct reclaim. 2494 * system then fail the allocation instead of entering direct reclaim.
2442 */ 2495 */
2443 if ((deferred_compaction || contended_compaction) && 2496 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD)) 2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
2445 goto nopage; 2498 goto nopage;
2446 2499
2447 /* Try direct reclaim and then allocating */ 2500 /* Try direct reclaim and then allocating */
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2541 struct page *page = NULL; 2594 struct page *page = NULL;
2542 int migratetype = allocflags_to_migratetype(gfp_mask); 2595 int migratetype = allocflags_to_migratetype(gfp_mask);
2543 unsigned int cpuset_mems_cookie; 2596 unsigned int cpuset_mems_cookie;
2597 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2544 2598
2545 gfp_mask &= gfp_allowed_mask; 2599 gfp_mask &= gfp_allowed_mask;
2546 2600
@@ -2569,9 +2623,13 @@ retry_cpuset:
2569 if (!preferred_zone) 2623 if (!preferred_zone)
2570 goto out; 2624 goto out;
2571 2625
2626#ifdef CONFIG_CMA
2627 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2628 alloc_flags |= ALLOC_CMA;
2629#endif
2572 /* First allocation attempt */ 2630 /* First allocation attempt */
2573 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2574 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2632 zonelist, high_zoneidx, alloc_flags,
2575 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2576 if (unlikely(!page)) 2634 if (unlikely(!page))
2577 page = __alloc_pages_slowpath(gfp_mask, order, 2635 page = __alloc_pages_slowpath(gfp_mask, order,
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter)
2852 " unevictable:%lu" 2910 " unevictable:%lu"
2853 " dirty:%lu writeback:%lu unstable:%lu\n" 2911 " dirty:%lu writeback:%lu unstable:%lu\n"
2854 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2912 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2855 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2913 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2914 " free_cma:%lu\n",
2856 global_page_state(NR_ACTIVE_ANON), 2915 global_page_state(NR_ACTIVE_ANON),
2857 global_page_state(NR_INACTIVE_ANON), 2916 global_page_state(NR_INACTIVE_ANON),
2858 global_page_state(NR_ISOLATED_ANON), 2917 global_page_state(NR_ISOLATED_ANON),
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter)
2869 global_page_state(NR_FILE_MAPPED), 2928 global_page_state(NR_FILE_MAPPED),
2870 global_page_state(NR_SHMEM), 2929 global_page_state(NR_SHMEM),
2871 global_page_state(NR_PAGETABLE), 2930 global_page_state(NR_PAGETABLE),
2872 global_page_state(NR_BOUNCE)); 2931 global_page_state(NR_BOUNCE),
2932 global_page_state(NR_FREE_CMA_PAGES));
2873 2933
2874 for_each_populated_zone(zone) { 2934 for_each_populated_zone(zone) {
2875 int i; 2935 int i;
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter)
2901 " pagetables:%lukB" 2961 " pagetables:%lukB"
2902 " unstable:%lukB" 2962 " unstable:%lukB"
2903 " bounce:%lukB" 2963 " bounce:%lukB"
2964 " free_cma:%lukB"
2904 " writeback_tmp:%lukB" 2965 " writeback_tmp:%lukB"
2905 " pages_scanned:%lu" 2966 " pages_scanned:%lu"
2906 " all_unreclaimable? %s" 2967 " all_unreclaimable? %s"
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter)
2930 K(zone_page_state(zone, NR_PAGETABLE)), 2991 K(zone_page_state(zone, NR_PAGETABLE)),
2931 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2992 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2932 K(zone_page_state(zone, NR_BOUNCE)), 2993 K(zone_page_state(zone, NR_BOUNCE)),
2994 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
2933 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2995 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2934 zone->pages_scanned, 2996 zone->pages_scanned,
2935 (zone->all_unreclaimable ? "yes" : "no") 2997 (zone->all_unreclaimable ? "yes" : "no")
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat)
3328 j = 0; 3390 j = 0;
3329 3391
3330 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3392 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3331 int distance = node_distance(local_node, node);
3332
3333 /*
3334 * If another node is sufficiently far away then it is better
3335 * to reclaim pages in a zone before going off node.
3336 */
3337 if (distance > RECLAIM_DISTANCE)
3338 zone_reclaim_mode = 1;
3339
3340 /* 3393 /*
3341 * We don't want to pressure a particular node. 3394 * We don't want to pressure a particular node.
3342 * So adding penalty to the first node in same 3395 * So adding penalty to the first node in same
3343 * distance group to make it round-robin. 3396 * distance group to make it round-robin.
3344 */ 3397 */
3345 if (distance != node_distance(local_node, prev_node)) 3398 if (node_distance(local_node, node) !=
3399 node_distance(local_node, prev_node))
3346 node_load[node] = load; 3400 node_load[node] = load;
3347 3401
3348 prev_node = node; 3402 prev_node = node;
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4438 4492
4439 zone->spanned_pages = size; 4493 zone->spanned_pages = size;
4440 zone->present_pages = realsize; 4494 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4446#ifdef CONFIG_NUMA 4495#ifdef CONFIG_NUMA
4447 zone->node = nid; 4496 zone->node = nid;
4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4497 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4521 4570
4522 pgdat->node_id = nid; 4571 pgdat->node_id = nid;
4523 pgdat->node_start_pfn = node_start_pfn; 4572 pgdat->node_start_pfn = node_start_pfn;
4573 init_zone_allows_reclaim(nid);
4524 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4574 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4525 4575
4526 alloc_node_mem_map(pgdat); 4576 alloc_node_mem_map(pgdat);
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4879 zone_movable_pfn[i] << PAGE_SHIFT); 4929 zone_movable_pfn[i] << PAGE_SHIFT);
4880 } 4930 }
4881 4931
4882 /* Print out the early_node_map[] */ 4932 /* Print out the early node map */
4883 printk("Early memory node ranges\n"); 4933 printk("Early memory node ranges\n");
4884 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4934 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4885 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4935 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
5619 pageblock_nr_pages)); 5669 pageblock_nr_pages));
5620} 5670}
5621 5671
5622static struct page *
5623__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5624 int **resultp)
5625{
5626 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5627
5628 if (PageHighMem(page))
5629 gfp_mask |= __GFP_HIGHMEM;
5630
5631 return alloc_page(gfp_mask);
5632}
5633
5634/* [start, end) must belong to a single zone. */ 5672/* [start, end) must belong to a single zone. */
5635static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5673static int __alloc_contig_migrate_range(struct compact_control *cc,
5674 unsigned long start, unsigned long end)
5636{ 5675{
5637 /* This function is based on compact_zone() from compaction.c. */ 5676 /* This function is based on compact_zone() from compaction.c. */
5638 5677 unsigned long nr_reclaimed;
5639 unsigned long pfn = start; 5678 unsigned long pfn = start;
5640 unsigned int tries = 0; 5679 unsigned int tries = 0;
5641 int ret = 0; 5680 int ret = 0;
5642 5681
5643 struct compact_control cc = {
5644 .nr_migratepages = 0,
5645 .order = -1,
5646 .zone = page_zone(pfn_to_page(start)),
5647 .sync = true,
5648 };
5649 INIT_LIST_HEAD(&cc.migratepages);
5650
5651 migrate_prep_local(); 5682 migrate_prep_local();
5652 5683
5653 while (pfn < end || !list_empty(&cc.migratepages)) { 5684 while (pfn < end || !list_empty(&cc->migratepages)) {
5654 if (fatal_signal_pending(current)) { 5685 if (fatal_signal_pending(current)) {
5655 ret = -EINTR; 5686 ret = -EINTR;
5656 break; 5687 break;
5657 } 5688 }
5658 5689
5659 if (list_empty(&cc.migratepages)) { 5690 if (list_empty(&cc->migratepages)) {
5660 cc.nr_migratepages = 0; 5691 cc->nr_migratepages = 0;
5661 pfn = isolate_migratepages_range(cc.zone, &cc, 5692 pfn = isolate_migratepages_range(cc->zone, cc,
5662 pfn, end); 5693 pfn, end, true);
5663 if (!pfn) { 5694 if (!pfn) {
5664 ret = -EINTR; 5695 ret = -EINTR;
5665 break; 5696 break;
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5670 break; 5701 break;
5671 } 5702 }
5672 5703
5673 ret = migrate_pages(&cc.migratepages, 5704 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5674 __alloc_contig_migrate_alloc, 5705 &cc->migratepages);
5706 cc->nr_migratepages -= nr_reclaimed;
5707
5708 ret = migrate_pages(&cc->migratepages,
5709 alloc_migrate_target,
5675 0, false, MIGRATE_SYNC); 5710 0, false, MIGRATE_SYNC);
5676 } 5711 }
5677 5712
5678 putback_lru_pages(&cc.migratepages); 5713 putback_lru_pages(&cc->migratepages);
5679 return ret > 0 ? 0 : ret; 5714 return ret > 0 ? 0 : ret;
5680} 5715}
5681 5716
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5754 unsigned long outer_start, outer_end; 5789 unsigned long outer_start, outer_end;
5755 int ret = 0, order; 5790 int ret = 0, order;
5756 5791
5792 struct compact_control cc = {
5793 .nr_migratepages = 0,
5794 .order = -1,
5795 .zone = page_zone(pfn_to_page(start)),
5796 .sync = true,
5797 .ignore_skip_hint = true,
5798 };
5799 INIT_LIST_HEAD(&cc.migratepages);
5800
5757 /* 5801 /*
5758 * What we do here is we mark all pageblocks in range as 5802 * What we do here is we mark all pageblocks in range as
5759 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5803 * MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -5781,9 +5825,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5781 ret = start_isolate_page_range(pfn_max_align_down(start), 5825 ret = start_isolate_page_range(pfn_max_align_down(start),
5782 pfn_max_align_up(end), migratetype); 5826 pfn_max_align_up(end), migratetype);
5783 if (ret) 5827 if (ret)
5784 goto done; 5828 return ret;
5785 5829
5786 ret = __alloc_contig_migrate_range(start, end); 5830 ret = __alloc_contig_migrate_range(&cc, start, end);
5787 if (ret) 5831 if (ret)
5788 goto done; 5832 goto done;
5789 5833
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5832 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5876 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5833 5877
5834 /* Grab isolated pages from freelists. */ 5878 /* Grab isolated pages from freelists. */
5835 outer_end = isolate_freepages_range(outer_start, end); 5879 outer_end = isolate_freepages_range(&cc, outer_start, end);
5836 if (!outer_end) { 5880 if (!outer_end) {
5837 ret = -EBUSY; 5881 ret = -EBUSY;
5838 goto done; 5882 goto done;
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data)
5874 local_irq_save(flags); 5918 local_irq_save(flags);
5875 if (pcp->count > 0) 5919 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp); 5920 free_pcppages_bulk(zone, pcp->count, pcp);
5921 drain_zonestat(zone, pset);
5877 setup_pageset(pset, batch); 5922 setup_pageset(pset, batch);
5878 local_irq_restore(flags); 5923 local_irq_restore(flags);
5879 } 5924 }
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone)
5890void zone_pcp_reset(struct zone *zone) 5935void zone_pcp_reset(struct zone *zone)
5891{ 5936{
5892 unsigned long flags; 5937 unsigned long flags;
5938 int cpu;
5939 struct per_cpu_pageset *pset;
5893 5940
5894 /* avoid races with drain_pages() */ 5941 /* avoid races with drain_pages() */
5895 local_irq_save(flags); 5942 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) { 5943 if (zone->pageset != &boot_pageset) {
5944 for_each_online_cpu(cpu) {
5945 pset = per_cpu_ptr(zone->pageset, cpu);
5946 drain_zonestat(zone, pset);
5947 }
5897 free_percpu(zone->pageset); 5948 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset; 5949 zone->pageset = &boot_pageset;
5899 } 5950 }
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page)
6047 dump_page_flags(page->flags); 6098 dump_page_flags(page->flags);
6048 mem_cgroup_print_bad_page(page); 6099 mem_cgroup_print_bad_page(page);
6049} 6100}
6101
6102/* reset zone->present_pages */
6103void reset_zone_present_pages(void)
6104{
6105 struct zone *z;
6106 int i, nid;
6107
6108 for_each_node_state(nid, N_HIGH_MEMORY) {
6109 for (i = 0; i < MAX_NR_ZONES; i++) {
6110 z = NODE_DATA(nid)->node_zones + i;
6111 z->present_pages = 0;
6112 }
6113 }
6114}
6115
6116/* calculate zone's present pages in buddy system */
6117void fixup_zone_present_pages(int nid, unsigned long start_pfn,
6118 unsigned long end_pfn)
6119{
6120 struct zone *z;
6121 unsigned long zone_start_pfn, zone_end_pfn;
6122 int i;
6123
6124 for (i = 0; i < MAX_NR_ZONES; i++) {
6125 z = NODE_DATA(nid)->node_zones + i;
6126 zone_start_pfn = z->zone_start_pfn;
6127 zone_end_pfn = zone_start_pfn + z->spanned_pages;
6128
6129 /* if the two regions intersect */
6130 if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
6131 z->present_pages += min(end_pfn, zone_end_pfn) -
6132 max(start_pfn, zone_start_pfn);
6133 }
6134}
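Among the page_alloc.c changes above, the watermark check now subtracts free CMA pages whenever the caller is not allowed to allocate from CMA pageblocks (no ALLOC_CMA). Below is a rough standalone sketch of just that first check, with invented numbers, an assumed ALLOC_CMA value, and without the per-order loop of the real __zone_watermark_ok().

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HIGH   0x20
#define ALLOC_HARDER 0x10
#define ALLOC_CMA    0x80    /* illustrative value, not taken from the kernel */

/* Callers without ALLOC_CMA must not count free CMA pages toward the
 * watermark, since they cannot allocate from those pageblocks. */
static bool watermark_ok(long free_pages, long free_cma, long min,
                         long lowmem_reserve, int alloc_flags)
{
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;
    if (!(alloc_flags & ALLOC_CMA))
        free_pages -= free_cma;

    return free_pages > min + lowmem_reserve;
}

int main(void)
{
    /* 1000 free pages in the zone, 600 of them inside CMA pageblocks. */
    printf("movable (ALLOC_CMA) passes:  %d\n",
           watermark_ok(1000, 600, 500, 0, ALLOC_CMA));
    printf("unmovable (no CMA) passes:   %d\n",
           watermark_ok(1000, 600, 500, 0, 0));
    return 0;
}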
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f175739..f2f5b4818e94 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page)
76 76
77out: 77out:
78 if (!ret) { 78 if (!ret) {
79 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page);
81
79 set_pageblock_isolate(page); 82 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE); 83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
81 } 86 }
82 87
83 spin_unlock_irqrestore(&zone->lock, flags); 88 spin_unlock_irqrestore(&zone->lock, flags);
@@ -89,12 +94,14 @@ out:
89void unset_migratetype_isolate(struct page *page, unsigned migratetype) 94void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{ 95{
91 struct zone *zone; 96 struct zone *zone;
92 unsigned long flags; 97 unsigned long flags, nr_pages;
98
93 zone = page_zone(page); 99 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags); 100 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 101 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out; 102 goto out;
97 move_freepages_block(zone, page, migratetype); 103 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype);
98 restore_pageblock_isolate(page, migratetype); 105 restore_pageblock_isolate(page, migratetype);
99out: 106out:
100 spin_unlock_irqrestore(&zone->lock, flags); 107 spin_unlock_irqrestore(&zone->lock, flags);
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
193 continue; 200 continue;
194 } 201 }
195 page = pfn_to_page(pfn); 202 page = pfn_to_page(pfn);
196 if (PageBuddy(page)) 203 if (PageBuddy(page)) {
204 /*
 205 * If a race between isolation and allocation happens,
 206 * some free pages could be in the MIGRATE_MOVABLE list
 207 * although the pageblock's migrate type is
 208 * MIGRATE_ISOLATE. Catch it and move the page into the
 209 * MIGRATE_ISOLATE list.
210 */
211 if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
212 struct page *end_page;
213
214 end_page = page + (1 << page_order(page)) - 1;
215 move_freepages(page_zone(page), page, end_page,
216 MIGRATE_ISOLATE);
217 }
197 pfn += 1 << page_order(page); 218 pfn += 1 << page_order(page);
219 }
198 else if (page_count(page) == 0 && 220 else if (page_count(page) == 0 &&
199 page_private(page) == MIGRATE_ISOLATE) 221 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
200 pfn += 1; 222 pfn += 1;
201 else 223 else
202 break; 224 break;
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
233 spin_unlock_irqrestore(&zone->lock, flags); 255 spin_unlock_irqrestore(&zone->lock, flags);
234 return ret ? 0 : -EBUSY; 256 return ret ? 0 : -EBUSY;
235} 257}
258
259struct page *alloc_migrate_target(struct page *page, unsigned long private,
260 int **resultp)
261{
262 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
263
264 if (PageHighMem(page))
265 gfp_mask |= __GFP_HIGHMEM;
266
267 return alloc_page(gfp_mask);
268}
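The page_isolation.c hunks above rely on move_freepages_block() now returning how many free pages it moved, so the caller can adjust the free-page accounting by exactly that amount when a pageblock enters or leaves MIGRATE_ISOLATE. A toy sketch of that bookkeeping, with invented counters and a fixed block size:

#include <stdio.h>

static long nr_free_for_alloc = 512;  /* pages the allocator may still hand out */
static long nr_isolated;              /* free pages parked in MIGRATE_ISOLATE blocks */

/* Stand-in for move_freepages_block(): report how many pages were moved. */
static long move_block(void)
{
    return 128;                       /* pretend the block held 128 free pages */
}

static void set_isolate(void)
{
    long nr_pages = move_block();
    nr_free_for_alloc -= nr_pages;    /* mirrors __mod_zone_freepage_state(-nr_pages) */
    nr_isolated += nr_pages;
}

static void unset_isolate(void)
{
    long nr_pages = move_block();     /* the same pages come back */
    nr_isolated -= nr_pages;
    nr_free_for_alloc += nr_pages;    /* mirrors __mod_zone_freepage_state(+nr_pages) */
}

int main(void)
{
    set_isolate();
    printf("allocatable=%ld isolated=%ld\n", nr_free_for_alloc, nr_isolated);
    unset_isolate();
    printf("allocatable=%ld isolated=%ld\n", nr_free_for_alloc, nr_isolated);
    return 0;
}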
diff --git a/mm/percpu.c b/mm/percpu.c
index bb4be7435ce3..ddc5efb9c5bb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1370 1370
1371#ifdef CONFIG_SMP 1371#ifdef CONFIG_SMP
1372 1372
1373const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1373const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
1374 [PCPU_FC_AUTO] = "auto", 1374 [PCPU_FC_AUTO] = "auto",
1375 [PCPU_FC_EMBED] = "embed", 1375 [PCPU_FC_EMBED] = "embed",
1376 [PCPU_FC_PAGE] = "page", 1376 [PCPU_FC_PAGE] = "page",
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa0..e642627da6b7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
120} 120}
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122#endif 122#endif
123
124#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
125#ifdef CONFIG_TRANSPARENT_HUGEPAGE
126void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
127{
128 assert_spin_locked(&mm->page_table_lock);
129
130 /* FIFO */
131 if (!mm->pmd_huge_pte)
132 INIT_LIST_HEAD(&pgtable->lru);
133 else
134 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
135 mm->pmd_huge_pte = pgtable;
136}
137#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
138#endif
139
140#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
141#ifdef CONFIG_TRANSPARENT_HUGEPAGE
142/* no "address" argument so destroys page coloring of some arch */
143pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
144{
145 pgtable_t pgtable;
146
147 assert_spin_locked(&mm->page_table_lock);
148
149 /* FIFO */
150 pgtable = mm->pmd_huge_pte;
151 if (list_empty(&pgtable->lru))
152 mm->pmd_huge_pte = NULL;
153 else {
154 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
155 struct page, lru);
156 list_del(&pgtable->lru);
157 }
158 return pgtable;
159}
160#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
161#endif
162
163#ifndef __HAVE_ARCH_PMDP_INVALIDATE
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
166 pmd_t *pmdp)
167{
168 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
169 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
170}
171#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
172#endif
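The generic pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() helpers added above park pre-allocated page tables on a per-mm list when a huge pmd is installed and hand them back in FIFO order when it is taken apart. A standalone sketch of that queue discipline using a plain linked list (the kernel threads the pages through their lru list_head instead):

#include <stdio.h>

struct pgtable {
    int id;
    struct pgtable *next;
};

struct mm {
    struct pgtable *head;   /* oldest deposited page table */
    struct pgtable *tail;   /* newest deposited page table */
};

/* Deposit: enqueue a pre-allocated page table at the tail. */
static void deposit(struct mm *mm, struct pgtable *p)
{
    p->next = NULL;
    if (mm->tail)
        mm->tail->next = p;
    else
        mm->head = p;
    mm->tail = p;
}

/* Withdraw: dequeue from the head, i.e. FIFO order. */
static struct pgtable *withdraw(struct mm *mm)
{
    struct pgtable *p = mm->head;

    if (p) {
        mm->head = p->next;
        if (!mm->head)
            mm->tail = NULL;
    }
    return p;
}

int main(void)
{
    struct mm mm = { 0 };
    struct pgtable a = { .id = 1 }, b = { .id = 2 };

    deposit(&mm, &a);
    deposit(&mm, &b);
    printf("withdrew %d\n", withdraw(&mm)->id);   /* 1 */
    printf("withdrew %d\n", withdraw(&mm)->id);   /* 2 */
    return 0;
}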
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8c..000000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
1/*
2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
3 *
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5 *
6 * This file is released under the GPL v2.
7 *
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10 *
11 * 02Feb2004 Initial version
12 */
13
14#include <linux/mm.h>
15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
17
18/*
19 * See lib/prio_tree.c for details on the general radix priority search tree
20 * code.
21 */
22
23/*
24 * The following #defines are mirrored from lib/prio_tree.c. They're only used
25 * for debugging, and should be removed (along with the debugging code using
26 * them) when switching also VMAs to the regular prio_tree code.
27 */
28
29#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
30#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
31/* avoid overflow */
32#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
33
34/*
35 * Radix priority search tree for address_space->i_mmap
36 *
37 * For each vma that map a unique set of file pages i.e., unique [radix_index,
38 * heap_index] value, we have a corresponding priority search tree node. If
39 * multiple vmas have identical [radix_index, heap_index] value, then one of
40 * them is used as a tree node and others are stored in a vm_set list. The tree
41 * node points to the first vma (head) of the list using vm_set.head.
42 *
43 * prio_tree_root
44 * |
45 * A vm_set.head
46 * / \ /
47 * L R -> H-I-J-K-M-N-O-P-Q-S
48 * ^ ^ <-- vm_set.list -->
49 * tree nodes
50 *
51 * We need some way to identify whether a vma is a tree node, head of a vm_set
52 * list, or just a member of a vm_set list. We cannot use vm_flags to store
53 * such information. The reason is, in the above figure, it is possible that
54 * vm_flags' of R and H are covered by the different mmap_sems. When R is
55 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
56 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
57 * That's why some trick involving shared.vm_set.parent is used for identifying
58 * tree nodes and list head nodes.
59 *
60 * vma radix priority search tree node rules:
61 *
62 * vma->shared.vm_set.parent != NULL ==> a tree node
63 * vma->shared.vm_set.head != NULL ==> list of others mapping same range
64 * vma->shared.vm_set.head == NULL ==> no others map the same range
65 *
66 * vma->shared.vm_set.parent == NULL
67 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
68 * vma->shared.vm_set.head == NULL ==> a list node
69 */
70
71/*
72 * Add a new vma known to map the same set of pages as the old vma:
73 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
74 * Note that it just happens to work correctly on i_mmap_nonlinear too.
75 */
76void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
77{
78 /* Leave these BUG_ONs till prio_tree patch stabilizes */
79 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
80 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
81
82 vma->shared.vm_set.head = NULL;
83 vma->shared.vm_set.parent = NULL;
84
85 if (!old->shared.vm_set.parent)
86 list_add(&vma->shared.vm_set.list,
87 &old->shared.vm_set.list);
88 else if (old->shared.vm_set.head)
89 list_add_tail(&vma->shared.vm_set.list,
90 &old->shared.vm_set.head->shared.vm_set.list);
91 else {
92 INIT_LIST_HEAD(&vma->shared.vm_set.list);
93 vma->shared.vm_set.head = old;
94 old->shared.vm_set.head = vma;
95 }
96}
97
98void vma_prio_tree_insert(struct vm_area_struct *vma,
99 struct prio_tree_root *root)
100{
101 struct prio_tree_node *ptr;
102 struct vm_area_struct *old;
103
104 vma->shared.vm_set.head = NULL;
105
106 ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
107 if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
108 old = prio_tree_entry(ptr, struct vm_area_struct,
109 shared.prio_tree_node);
110 vma_prio_tree_add(vma, old);
111 }
112}
113
114void vma_prio_tree_remove(struct vm_area_struct *vma,
115 struct prio_tree_root *root)
116{
117 struct vm_area_struct *node, *head, *new_head;
118
119 if (!vma->shared.vm_set.head) {
120 if (!vma->shared.vm_set.parent)
121 list_del_init(&vma->shared.vm_set.list);
122 else
123 raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
124 } else {
125 /* Leave this BUG_ON till prio_tree patch stabilizes */
126 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
127 if (vma->shared.vm_set.parent) {
128 head = vma->shared.vm_set.head;
129 if (!list_empty(&head->shared.vm_set.list)) {
130 new_head = list_entry(
131 head->shared.vm_set.list.next,
132 struct vm_area_struct,
133 shared.vm_set.list);
134 list_del_init(&head->shared.vm_set.list);
135 } else
136 new_head = NULL;
137
138 raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
139 &head->shared.prio_tree_node);
140 head->shared.vm_set.head = new_head;
141 if (new_head)
142 new_head->shared.vm_set.head = head;
143
144 } else {
145 node = vma->shared.vm_set.head;
146 if (!list_empty(&vma->shared.vm_set.list)) {
147 new_head = list_entry(
148 vma->shared.vm_set.list.next,
149 struct vm_area_struct,
150 shared.vm_set.list);
151 list_del_init(&vma->shared.vm_set.list);
152 node->shared.vm_set.head = new_head;
153 new_head->shared.vm_set.head = node;
154 } else
155 node->shared.vm_set.head = NULL;
156 }
157 }
158}
159
160/*
161 * Helper function to enumerate vmas that map a given file page or a set of
162 * contiguous file pages. The function returns vmas that at least map a single
163 * page in the given range of contiguous file pages.
164 */
165struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
166 struct prio_tree_iter *iter)
167{
168 struct prio_tree_node *ptr;
169 struct vm_area_struct *next;
170
171 if (!vma) {
172 /*
173 * First call is with NULL vma
174 */
175 ptr = prio_tree_next(iter);
176 if (ptr) {
177 next = prio_tree_entry(ptr, struct vm_area_struct,
178 shared.prio_tree_node);
179 prefetch(next->shared.vm_set.head);
180 return next;
181 } else
182 return NULL;
183 }
184
185 if (vma->shared.vm_set.parent) {
186 if (vma->shared.vm_set.head) {
187 next = vma->shared.vm_set.head;
188 prefetch(next->shared.vm_set.list.next);
189 return next;
190 }
191 } else {
192 next = list_entry(vma->shared.vm_set.list.next,
193 struct vm_area_struct, shared.vm_set.list);
194 if (!next->shared.vm_set.head) {
195 prefetch(next->shared.vm_set.list.next);
196 return next;
197 }
198 }
199
200 ptr = prio_tree_next(iter);
201 if (ptr) {
202 next = prio_tree_entry(ptr, struct vm_area_struct,
203 shared.prio_tree_node);
204 prefetch(next->shared.vm_set.head);
205 return next;
206 } else
207 return NULL;
208}
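The deleted prio_tree.c above is superseded by the vma interval tree used throughout this diff (vma_interval_tree_foreach and anon_vma_interval_tree_foreach). What those iterators enumerate is simply every vma whose file-page interval overlaps the query range; the tree only makes the lookup logarithmic. A linear-scan sketch of the same overlap test, with invented vmas and offsets:

#include <stdio.h>

/* One file-mapping vma, reduced to the page-offset interval it covers. */
struct vma {
    const char *name;
    unsigned long pgoff_start;   /* first file page mapped */
    unsigned long pgoff_last;    /* last file page mapped (inclusive) */
};

int main(void)
{
    struct vma vmas[] = {
        { "A", 0,  15 },
        { "B", 8,  23 },
        { "C", 32, 47 },
    };
    unsigned long first = 10, last = 12;   /* file pages we are interested in */

    /* Report every vma whose interval overlaps [first, last]. */
    for (unsigned i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
        if (vmas[i].pgoff_start <= last && vmas[i].pgoff_last >= first)
            printf("vma %s overlaps\n", vmas[i].name);   /* prints A and B */
    return 0;
}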
diff --git a/mm/readahead.c b/mm/readahead.c
index ea8f8fa21649..7963f2391236 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp,
579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) 579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
580{ 580{
581 ssize_t ret; 581 ssize_t ret;
582 struct file *file; 582 struct fd f;
583 583
584 ret = -EBADF; 584 ret = -EBADF;
585 file = fget(fd); 585 f = fdget(fd);
586 if (file) { 586 if (f.file) {
587 if (file->f_mode & FMODE_READ) { 587 if (f.file->f_mode & FMODE_READ) {
588 struct address_space *mapping = file->f_mapping; 588 struct address_space *mapping = f.file->f_mapping;
589 pgoff_t start = offset >> PAGE_CACHE_SHIFT; 589 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
591 unsigned long len = end - start + 1; 591 unsigned long len = end - start + 1;
592 ret = do_readahead(mapping, file, start, len); 592 ret = do_readahead(mapping, f.file, start, len);
593 } 593 }
594 fput(file); 594 fdput(f);
595 } 595 }
596 return ret; 596 return ret;
597} 597}
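The readahead() syscall body above converts a byte range into an inclusive range of page-cache indices before calling do_readahead(). The arithmetic, worked through with an assumed 4 KiB page size:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12   /* assume 4 KiB pages for this example */

int main(void)
{
    long long offset = 5000, count = 10000;

    unsigned long start = offset >> PAGE_CACHE_SHIFT;
    unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
    unsigned long len = end - start + 1;

    /* bytes [5000, 15000) touch pages 1..3, so three pages are read ahead */
    printf("start=%lu end=%lu len=%lu\n", start, end, len);
    return 0;
}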
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..2ee1ef0f317b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
57#include <linux/migrate.h> 57#include <linux/migrate.h>
58#include <linux/hugetlb.h> 58#include <linux/hugetlb.h>
59#include <linux/backing-dev.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -127,12 +128,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
127 avc->vma = vma; 128 avc->vma = vma;
128 avc->anon_vma = anon_vma; 129 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain); 130 list_add(&avc->same_vma, &vma->anon_vma_chain);
130 131 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136} 132}
137 133
138/** 134/**
@@ -269,51 +265,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
269} 265}
270 266
271/* 267/*
272 * Some rmap walk that needs to find all ptes/hugepmds without false
273 * negatives (like migrate and split_huge_page) running concurrent
274 * with operations that copy or move pagetables (like mremap() and
275 * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
276 * list to be in a certain order: the dst_vma must be placed after the
277 * src_vma in the list. This is always guaranteed by fork() but
278 * mremap() needs to call this function to enforce it in case the
279 * dst_vma isn't newly allocated and chained with the anon_vma_clone()
280 * function but just an extension of a pre-existing vma through
281 * vma_merge.
282 *
283 * NOTE: the same_anon_vma list can still be changed by other
284 * processes while mremap runs because mremap doesn't hold the
285 * anon_vma mutex to prevent modifications to the list while it
286 * runs. All we need to enforce is that the relative order of this
287 * process vmas isn't changing (we don't care about other vmas
288 * order). Each vma corresponds to an anon_vma_chain structure so
289 * there's no risk that other processes calling anon_vma_moveto_tail()
290 * and changing the same_anon_vma list under mremap() will screw with
291 * the relative order of this process vmas in the list, because we
292 * they can't alter the order of any vma that belongs to this
293 * process. And there can't be another anon_vma_moveto_tail() running
294 * concurrently with mremap() coming from this process because we hold
295 * the mmap_sem for the whole mremap(). fork() ordering dependency
296 * also shouldn't be affected because fork() only cares that the
297 * parent vmas are placed in the list before the child vmas and
298 * anon_vma_moveto_tail() won't reorder vmas from either the fork()
299 * parent or child.
300 */
301void anon_vma_moveto_tail(struct vm_area_struct *dst)
302{
303 struct anon_vma_chain *pavc;
304 struct anon_vma *root = NULL;
305
306 list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
307 struct anon_vma *anon_vma = pavc->anon_vma;
308 VM_BUG_ON(pavc->vma != dst);
309 root = lock_anon_vma_root(root, anon_vma);
310 list_del(&pavc->same_anon_vma);
311 list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
312 }
313 unlock_anon_vma_root(root);
314}
315
316/*
317 * Attach vma to its own anon_vma, as well as to the anon_vmas that 268 * Attach vma to its own anon_vma, as well as to the anon_vmas that
318 * the corresponding VMA in the parent process is attached to. 269 * the corresponding VMA in the parent process is attached to.
319 * Returns 0 on success, non-zero on failure. 270 * Returns 0 on success, non-zero on failure.
@@ -381,13 +332,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
381 struct anon_vma *anon_vma = avc->anon_vma; 332 struct anon_vma *anon_vma = avc->anon_vma;
382 333
383 root = lock_anon_vma_root(root, anon_vma); 334 root = lock_anon_vma_root(root, anon_vma);
384 list_del(&avc->same_anon_vma); 335 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
385 336
386 /* 337 /*
387 * Leave empty anon_vmas on the list - we'll need 338 * Leave empty anon_vmas on the list - we'll need
388 * to free them outside the lock. 339 * to free them outside the lock.
389 */ 340 */
390 if (list_empty(&anon_vma->head)) 341 if (RB_EMPTY_ROOT(&anon_vma->rb_root))
391 continue; 342 continue;
392 343
393 list_del(&avc->same_vma); 344 list_del(&avc->same_vma);
@@ -416,7 +367,7 @@ static void anon_vma_ctor(void *data)
416 367
417 mutex_init(&anon_vma->mutex); 368 mutex_init(&anon_vma->mutex);
418 atomic_set(&anon_vma->refcount, 0); 369 atomic_set(&anon_vma->refcount, 0);
419 INIT_LIST_HEAD(&anon_vma->head); 370 anon_vma->rb_root = RB_ROOT;
420} 371}
421 372
422void __init anon_vma_init(void) 373void __init anon_vma_init(void)
@@ -560,22 +511,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
560 511
561/* 512/*
562 * At what user virtual address is page expected in @vma? 513 * At what user virtual address is page expected in @vma?
563 * Returns virtual address or -EFAULT if page's index/offset is not
564 * within the range mapped the @vma.
565 */ 514 */
566inline unsigned long 515static inline unsigned long
567vma_address(struct page *page, struct vm_area_struct *vma) 516__vma_address(struct page *page, struct vm_area_struct *vma)
568{ 517{
569 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 518 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
570 unsigned long address;
571 519
572 if (unlikely(is_vm_hugetlb_page(vma))) 520 if (unlikely(is_vm_hugetlb_page(vma)))
573 pgoff = page->index << huge_page_order(page_hstate(page)); 521 pgoff = page->index << huge_page_order(page_hstate(page));
574 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 522
575 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 523 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
576 /* page should be within @vma mapping range */ 524}
577 return -EFAULT; 525
578 } 526inline unsigned long
527vma_address(struct page *page, struct vm_area_struct *vma)
528{
529 unsigned long address = __vma_address(page, vma);
530
531 /* page should be within @vma mapping range */
532 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
533
579 return address; 534 return address;
580} 535}
581 536
@@ -585,6 +540,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
585 */ 540 */
586unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 541unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
587{ 542{
543 unsigned long address;
588 if (PageAnon(page)) { 544 if (PageAnon(page)) {
589 struct anon_vma *page__anon_vma = page_anon_vma(page); 545 struct anon_vma *page__anon_vma = page_anon_vma(page);
590 /* 546 /*
@@ -600,7 +556,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
600 return -EFAULT; 556 return -EFAULT;
601 } else 557 } else
602 return -EFAULT; 558 return -EFAULT;
603 return vma_address(page, vma); 559 address = __vma_address(page, vma);
560 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
561 return -EFAULT;
562 return address;
604} 563}
605 564
606/* 565/*
@@ -674,8 +633,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
674 pte_t *pte; 633 pte_t *pte;
675 spinlock_t *ptl; 634 spinlock_t *ptl;
676 635
677 address = vma_address(page, vma); 636 address = __vma_address(page, vma);
678 if (address == -EFAULT) /* out of vma range */ 637 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
679 return 0; 638 return 0;
680 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 639 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
681 if (!pte) /* the page is not in this mm */ 640 if (!pte) /* the page is not in this mm */
@@ -769,6 +728,7 @@ static int page_referenced_anon(struct page *page,
769{ 728{
770 unsigned int mapcount; 729 unsigned int mapcount;
771 struct anon_vma *anon_vma; 730 struct anon_vma *anon_vma;
731 pgoff_t pgoff;
772 struct anon_vma_chain *avc; 732 struct anon_vma_chain *avc;
773 int referenced = 0; 733 int referenced = 0;
774 734
@@ -777,11 +737,10 @@ static int page_referenced_anon(struct page *page,
777 return referenced; 737 return referenced;
778 738
779 mapcount = page_mapcount(page); 739 mapcount = page_mapcount(page);
780 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 740 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
741 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
781 struct vm_area_struct *vma = avc->vma; 742 struct vm_area_struct *vma = avc->vma;
782 unsigned long address = vma_address(page, vma); 743 unsigned long address = vma_address(page, vma);
783 if (address == -EFAULT)
784 continue;
785 /* 744 /*
786 * If we are reclaiming on behalf of a cgroup, skip 745 * If we are reclaiming on behalf of a cgroup, skip
787 * counting on behalf of references from different 746 * counting on behalf of references from different
@@ -820,7 +779,6 @@ static int page_referenced_file(struct page *page,
820 struct address_space *mapping = page->mapping; 779 struct address_space *mapping = page->mapping;
821 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 780 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
822 struct vm_area_struct *vma; 781 struct vm_area_struct *vma;
823 struct prio_tree_iter iter;
824 int referenced = 0; 782 int referenced = 0;
825 783
826 /* 784 /*
@@ -846,10 +804,8 @@ static int page_referenced_file(struct page *page,
846 */ 804 */
847 mapcount = page_mapcount(page); 805 mapcount = page_mapcount(page);
848 806
849 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 807 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
850 unsigned long address = vma_address(page, vma); 808 unsigned long address = vma_address(page, vma);
851 if (address == -EFAULT)
852 continue;
853 /* 809 /*
854 * If we are reclaiming on behalf of a cgroup, skip 810 * If we are reclaiming on behalf of a cgroup, skip
855 * counting on behalf of references from different 811 * counting on behalf of references from different
@@ -929,7 +885,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
929 pte_t entry; 885 pte_t entry;
930 886
931 flush_cache_page(vma, address, pte_pfn(*pte)); 887 flush_cache_page(vma, address, pte_pfn(*pte));
932 entry = ptep_clear_flush_notify(vma, address, pte); 888 entry = ptep_clear_flush(vma, address, pte);
933 entry = pte_wrprotect(entry); 889 entry = pte_wrprotect(entry);
934 entry = pte_mkclean(entry); 890 entry = pte_mkclean(entry);
935 set_pte_at(mm, address, pte, entry); 891 set_pte_at(mm, address, pte, entry);
@@ -937,6 +893,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
937 } 893 }
938 894
939 pte_unmap_unlock(pte, ptl); 895 pte_unmap_unlock(pte, ptl);
896
897 if (ret)
898 mmu_notifier_invalidate_page(mm, address);
940out: 899out:
941 return ret; 900 return ret;
942} 901}
@@ -945,17 +904,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
945{ 904{
946 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 905 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
947 struct vm_area_struct *vma; 906 struct vm_area_struct *vma;
948 struct prio_tree_iter iter;
949 int ret = 0; 907 int ret = 0;
950 908
951 BUG_ON(PageAnon(page)); 909 BUG_ON(PageAnon(page));
952 910
953 mutex_lock(&mapping->i_mmap_mutex); 911 mutex_lock(&mapping->i_mmap_mutex);
954 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 912 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
955 if (vma->vm_flags & VM_SHARED) { 913 if (vma->vm_flags & VM_SHARED) {
956 unsigned long address = vma_address(page, vma); 914 unsigned long address = vma_address(page, vma);
957 if (address == -EFAULT)
958 continue;
959 ret += page_mkclean_one(page, vma, address); 915 ret += page_mkclean_one(page, vma, address);
960 } 916 }
961 } 917 }
@@ -971,11 +927,8 @@ int page_mkclean(struct page *page)
971 927
972 if (page_mapped(page)) { 928 if (page_mapped(page)) {
973 struct address_space *mapping = page_mapping(page); 929 struct address_space *mapping = page_mapping(page);
974 if (mapping) { 930 if (mapping)
975 ret = page_mkclean_file(mapping, page); 931 ret = page_mkclean_file(mapping, page);
976 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
977 ret = 1;
978 }
979 } 932 }
980 933
981 return ret; 934 return ret;
@@ -1128,7 +1081,7 @@ void page_add_new_anon_rmap(struct page *page,
1128 else 1081 else
1129 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1082 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1130 __page_set_anon_rmap(page, vma, address, 1); 1083 __page_set_anon_rmap(page, vma, address, 1);
1131 if (page_evictable(page, vma)) 1084 if (!mlocked_vma_newpage(vma, page))
1132 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1085 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
1133 else 1086 else
1134 add_page_to_unevictable_list(page); 1087 add_page_to_unevictable_list(page);
@@ -1161,6 +1114,7 @@ void page_add_file_rmap(struct page *page)
1161 */ 1114 */
1162void page_remove_rmap(struct page *page) 1115void page_remove_rmap(struct page *page)
1163{ 1116{
1117 struct address_space *mapping = page_mapping(page);
1164 bool anon = PageAnon(page); 1118 bool anon = PageAnon(page);
1165 bool locked; 1119 bool locked;
1166 unsigned long flags; 1120 unsigned long flags;
@@ -1183,8 +1137,19 @@ void page_remove_rmap(struct page *page)
1183 * this if the page is anon, so about to be freed; but perhaps 1137 * this if the page is anon, so about to be freed; but perhaps
1184 * not if it's in swapcache - there might be another pte slot 1138 * not if it's in swapcache - there might be another pte slot
1185 * containing the swap entry, but page not yet written to swap. 1139 * containing the swap entry, but page not yet written to swap.
1140 *
1141 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs
1143 * and ramfs pages which have been modified since creation by read
1144 * fault.
1145 *
1146 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped,
1148 * it could be truncated and page->mapping reset to NULL at any moment.
1149 * Note also that we are relying on page_mapping(page) to set mapping
1150 * to &swapper_space when PageSwapCache(page).
1186 */ 1151 */
1187 if ((!anon || PageSwapCache(page)) && 1152 if (mapping && !mapping_cap_account_dirty(mapping) &&
1188 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1153 page_test_and_clear_dirty(page_to_pfn(page), 1))
1189 set_page_dirty(page); 1154 set_page_dirty(page);
1190 /* 1155 /*
@@ -1203,7 +1168,10 @@ void page_remove_rmap(struct page *page)
1203 } else { 1168 } else {
1204 __dec_zone_page_state(page, NR_FILE_MAPPED); 1169 __dec_zone_page_state(page, NR_FILE_MAPPED);
1205 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1170 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
1171 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1206 } 1172 }
1173 if (unlikely(PageMlocked(page)))
1174 clear_page_mlock(page);
1207 /* 1175 /*
1208 * It would be tidy to reset the PageAnon mapping here, 1176 * It would be tidy to reset the PageAnon mapping here,
1209 * but that might overwrite a racing page_add_anon_rmap 1177 * but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1181,7 @@ void page_remove_rmap(struct page *page)
1213 * Leaving it set also helps swapoff to reinstate ptes 1181 * Leaving it set also helps swapoff to reinstate ptes
1214 * faster for those pages still in swapcache. 1182 * faster for those pages still in swapcache.
1215 */ 1183 */
1184 return;
1216out: 1185out:
1217 if (!anon) 1186 if (!anon)
1218 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1187 mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1225,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1256 1225
1257 /* Nuke the page table entry. */ 1226 /* Nuke the page table entry. */
1258 flush_cache_page(vma, address, page_to_pfn(page)); 1227 flush_cache_page(vma, address, page_to_pfn(page));
1259 pteval = ptep_clear_flush_notify(vma, address, pte); 1228 pteval = ptep_clear_flush(vma, address, pte);
1260 1229
1261 /* Move the dirty bit to the physical page now the pte is gone. */ 1230 /* Move the dirty bit to the physical page now the pte is gone. */
1262 if (pte_dirty(pteval)) 1231 if (pte_dirty(pteval))
@@ -1318,6 +1287,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1318 1287
1319out_unmap: 1288out_unmap:
1320 pte_unmap_unlock(pte, ptl); 1289 pte_unmap_unlock(pte, ptl);
1290 if (ret != SWAP_FAIL)
1291 mmu_notifier_invalidate_page(mm, address);
1321out: 1292out:
1322 return ret; 1293 return ret;
1323 1294
@@ -1382,6 +1353,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1382 spinlock_t *ptl; 1353 spinlock_t *ptl;
1383 struct page *page; 1354 struct page *page;
1384 unsigned long address; 1355 unsigned long address;
1356 unsigned long mmun_start; /* For mmu_notifiers */
1357 unsigned long mmun_end; /* For mmu_notifiers */
1385 unsigned long end; 1358 unsigned long end;
1386 int ret = SWAP_AGAIN; 1359 int ret = SWAP_AGAIN;
1387 int locked_vma = 0; 1360 int locked_vma = 0;
@@ -1405,6 +1378,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1405 if (!pmd_present(*pmd)) 1378 if (!pmd_present(*pmd))
1406 return ret; 1379 return ret;
1407 1380
1381 mmun_start = address;
1382 mmun_end = end;
1383 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1384
1408 /* 1385 /*
1409 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1386 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1410 * keep the sem while scanning the cluster for mlocking pages. 1387 * keep the sem while scanning the cluster for mlocking pages.
@@ -1438,7 +1415,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1438 1415
1439 /* Nuke the page table entry. */ 1416 /* Nuke the page table entry. */
1440 flush_cache_page(vma, address, pte_pfn(*pte)); 1417 flush_cache_page(vma, address, pte_pfn(*pte));
1441 pteval = ptep_clear_flush_notify(vma, address, pte); 1418 pteval = ptep_clear_flush(vma, address, pte);
1442 1419
1443 /* If nonlinear, store the file page offset in the pte. */ 1420 /* If nonlinear, store the file page offset in the pte. */
1444 if (page->index != linear_page_index(vma, address)) 1421 if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1431,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1454 (*mapcount)--; 1431 (*mapcount)--;
1455 } 1432 }
1456 pte_unmap_unlock(pte - 1, ptl); 1433 pte_unmap_unlock(pte - 1, ptl);
1434 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1457 if (locked_vma) 1435 if (locked_vma)
1458 up_read(&vma->vm_mm->mmap_sem); 1436 up_read(&vma->vm_mm->mmap_sem);
1459 return ret; 1437 return ret;
@@ -1492,6 +1470,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
1492static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1470static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1493{ 1471{
1494 struct anon_vma *anon_vma; 1472 struct anon_vma *anon_vma;
1473 pgoff_t pgoff;
1495 struct anon_vma_chain *avc; 1474 struct anon_vma_chain *avc;
1496 int ret = SWAP_AGAIN; 1475 int ret = SWAP_AGAIN;
1497 1476
@@ -1499,7 +1478,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 if (!anon_vma) 1478 if (!anon_vma)
1500 return ret; 1479 return ret;
1501 1480
1502 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1481 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1482 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1503 struct vm_area_struct *vma = avc->vma; 1483 struct vm_area_struct *vma = avc->vma;
1504 unsigned long address; 1484 unsigned long address;
1505 1485
@@ -1516,8 +1496,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1516 continue; 1496 continue;
1517 1497
1518 address = vma_address(page, vma); 1498 address = vma_address(page, vma);
1519 if (address == -EFAULT)
1520 continue;
1521 ret = try_to_unmap_one(page, vma, address, flags); 1499 ret = try_to_unmap_one(page, vma, address, flags);
1522 if (ret != SWAP_AGAIN || !page_mapped(page)) 1500 if (ret != SWAP_AGAIN || !page_mapped(page))
1523 break; 1501 break;
@@ -1547,7 +1525,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1547 struct address_space *mapping = page->mapping; 1525 struct address_space *mapping = page->mapping;
1548 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1526 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1549 struct vm_area_struct *vma; 1527 struct vm_area_struct *vma;
1550 struct prio_tree_iter iter;
1551 int ret = SWAP_AGAIN; 1528 int ret = SWAP_AGAIN;
1552 unsigned long cursor; 1529 unsigned long cursor;
1553 unsigned long max_nl_cursor = 0; 1530 unsigned long max_nl_cursor = 0;
@@ -1555,10 +1532,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1555 unsigned int mapcount; 1532 unsigned int mapcount;
1556 1533
1557 mutex_lock(&mapping->i_mmap_mutex); 1534 mutex_lock(&mapping->i_mmap_mutex);
1558 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1535 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1559 unsigned long address = vma_address(page, vma); 1536 unsigned long address = vma_address(page, vma);
1560 if (address == -EFAULT)
1561 continue;
1562 ret = try_to_unmap_one(page, vma, address, flags); 1537 ret = try_to_unmap_one(page, vma, address, flags);
1563 if (ret != SWAP_AGAIN || !page_mapped(page)) 1538 if (ret != SWAP_AGAIN || !page_mapped(page))
1564 goto out; 1539 goto out;
@@ -1576,7 +1551,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1576 goto out; 1551 goto out;
1577 1552
1578 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1553 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1579 shared.vm_set.list) { 1554 shared.nonlinear) {
1580 cursor = (unsigned long) vma->vm_private_data; 1555 cursor = (unsigned long) vma->vm_private_data;
1581 if (cursor > max_nl_cursor) 1556 if (cursor > max_nl_cursor)
1582 max_nl_cursor = cursor; 1557 max_nl_cursor = cursor;
@@ -1608,7 +1583,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1608 1583
1609 do { 1584 do {
1610 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1585 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1611 shared.vm_set.list) { 1586 shared.nonlinear) {
1612 cursor = (unsigned long) vma->vm_private_data; 1587 cursor = (unsigned long) vma->vm_private_data;
1613 while ( cursor < max_nl_cursor && 1588 while ( cursor < max_nl_cursor &&
1614 cursor < vma->vm_end - vma->vm_start) { 1589 cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1606,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1631 * in locked vmas). Reset cursor on all unreserved nonlinear 1606 * in locked vmas). Reset cursor on all unreserved nonlinear
1632 * vmas, now forgetting on which ones it had fallen behind. 1607 * vmas, now forgetting on which ones it had fallen behind.
1633 */ 1608 */
1634 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1609 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1635 vma->vm_private_data = NULL; 1610 vma->vm_private_data = NULL;
1636out: 1611out:
1637 mutex_unlock(&mapping->i_mmap_mutex); 1612 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1691,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1716 struct vm_area_struct *, unsigned long, void *), void *arg) 1691 struct vm_area_struct *, unsigned long, void *), void *arg)
1717{ 1692{
1718 struct anon_vma *anon_vma; 1693 struct anon_vma *anon_vma;
1694 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1719 struct anon_vma_chain *avc; 1695 struct anon_vma_chain *avc;
1720 int ret = SWAP_AGAIN; 1696 int ret = SWAP_AGAIN;
1721 1697
@@ -1729,11 +1705,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1729 if (!anon_vma) 1705 if (!anon_vma)
1730 return ret; 1706 return ret;
1731 anon_vma_lock(anon_vma); 1707 anon_vma_lock(anon_vma);
1732 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1708 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1733 struct vm_area_struct *vma = avc->vma; 1709 struct vm_area_struct *vma = avc->vma;
1734 unsigned long address = vma_address(page, vma); 1710 unsigned long address = vma_address(page, vma);
1735 if (address == -EFAULT)
1736 continue;
1737 ret = rmap_one(page, vma, address, arg); 1711 ret = rmap_one(page, vma, address, arg);
1738 if (ret != SWAP_AGAIN) 1712 if (ret != SWAP_AGAIN)
1739 break; 1713 break;
@@ -1748,16 +1722,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1748 struct address_space *mapping = page->mapping; 1722 struct address_space *mapping = page->mapping;
1749 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1723 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1750 struct vm_area_struct *vma; 1724 struct vm_area_struct *vma;
1751 struct prio_tree_iter iter;
1752 int ret = SWAP_AGAIN; 1725 int ret = SWAP_AGAIN;
1753 1726
1754 if (!mapping) 1727 if (!mapping)
1755 return ret; 1728 return ret;
1756 mutex_lock(&mapping->i_mmap_mutex); 1729 mutex_lock(&mapping->i_mmap_mutex);
1757 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1730 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1758 unsigned long address = vma_address(page, vma); 1731 unsigned long address = vma_address(page, vma);
1759 if (address == -EFAULT)
1760 continue;
1761 ret = rmap_one(page, vma, address, arg); 1732 ret = rmap_one(page, vma, address, arg);
1762 if (ret != SWAP_AGAIN) 1733 if (ret != SWAP_AGAIN)
1763 break; 1734 break;
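Across this file the prio_tree and anon_vma list walks, together with the old "address == -EFAULT" skip, are replaced by interval-tree queries keyed on the page's pgoff (vma_interval_tree_foreach() and anon_vma_interval_tree_foreach()), so only VMAs whose page-offset range covers the page are visited. A rough stand-alone illustration of that overlap test, using a plain linear scan over made-up ranges rather than the kernel's augmented rb-tree:

#include <stdio.h>

struct toy_vma {
        unsigned long vm_pgoff;         /* first file page this VMA maps */
        unsigned long nr_pages;         /* number of pages mapped */
};

/* does the VMA's [vm_pgoff, vm_pgoff + nr_pages) range contain pgoff? */
static int vma_covers(const struct toy_vma *vma, unsigned long pgoff)
{
        return pgoff >= vma->vm_pgoff &&
               pgoff <  vma->vm_pgoff + vma->nr_pages;
}

int main(void)
{
        const struct toy_vma vmas[] = {
                { .vm_pgoff = 0,   .nr_pages = 16 },
                { .vm_pgoff = 64,  .nr_pages = 8  },
                { .vm_pgoff = 256, .nr_pages = 4  },
        };
        unsigned long pgoff = 66;       /* page being reverse-mapped */
        size_t i;

        for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
                if (vma_covers(&vmas[i], pgoff))
                        printf("vma %zu maps pgoff %lu\n", i, pgoff);
        return 0;
}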
diff --git a/mm/shmem.c b/mm/shmem.c
index d4e184e2a38e..67afba5117f2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt;
77/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 77/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
78#define SHORT_SYMLINK_LEN 128 78#define SHORT_SYMLINK_LEN 128
79 79
80struct shmem_xattr {
81 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
82 char *name; /* xattr name */
83 size_t size;
84 char value[0];
85};
86
87/* 80/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private 81 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time): 82 * (with i_mutex making sure that it has only one user at a time):
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
636static void shmem_evict_inode(struct inode *inode) 629static void shmem_evict_inode(struct inode *inode)
637{ 630{
638 struct shmem_inode_info *info = SHMEM_I(inode); 631 struct shmem_inode_info *info = SHMEM_I(inode);
639 struct shmem_xattr *xattr, *nxattr;
640 632
641 if (inode->i_mapping->a_ops == &shmem_aops) { 633 if (inode->i_mapping->a_ops == &shmem_aops) {
642 shmem_unacct_size(info->flags, inode->i_size); 634 shmem_unacct_size(info->flags, inode->i_size);
@@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode)
650 } else 642 } else
651 kfree(info->symlink); 643 kfree(info->symlink);
652 644
653 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 645 simple_xattrs_free(&info->xattrs);
654 kfree(xattr->name);
655 kfree(xattr);
656 }
657 BUG_ON(inode->i_blocks); 646 BUG_ON(inode->i_blocks);
658 shmem_free_inode(inode->i_sb); 647 shmem_free_inode(inode->i_sb);
659 clear_inode(inode); 648 clear_inode(inode);
@@ -1350,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1350{ 1339{
1351 file_accessed(file); 1340 file_accessed(file);
1352 vma->vm_ops = &shmem_vm_ops; 1341 vma->vm_ops = &shmem_vm_ops;
1353 vma->vm_flags |= VM_CAN_NONLINEAR;
1354 return 0; 1342 return 0;
1355} 1343}
1356 1344
@@ -1377,7 +1365,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1377 spin_lock_init(&info->lock); 1365 spin_lock_init(&info->lock);
1378 info->flags = flags & VM_NORESERVE; 1366 info->flags = flags & VM_NORESERVE;
1379 INIT_LIST_HEAD(&info->swaplist); 1367 INIT_LIST_HEAD(&info->swaplist);
1380 INIT_LIST_HEAD(&info->xattr_list); 1368 simple_xattrs_init(&info->xattrs);
1381 cache_no_acl(inode); 1369 cache_no_acl(inode);
1382 1370
1383 switch (mode & S_IFMT) { 1371 switch (mode & S_IFMT) {
@@ -2060,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
2060 */ 2048 */
2061 2049
2062/* 2050/*
2063 * Allocate new xattr and copy in the value; but leave the name to callers.
2064 */
2065static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
2066{
2067 struct shmem_xattr *new_xattr;
2068 size_t len;
2069
2070 /* wrap around? */
2071 len = sizeof(*new_xattr) + size;
2072 if (len <= sizeof(*new_xattr))
2073 return NULL;
2074
2075 new_xattr = kmalloc(len, GFP_KERNEL);
2076 if (!new_xattr)
2077 return NULL;
2078
2079 new_xattr->size = size;
2080 memcpy(new_xattr->value, value, size);
2081 return new_xattr;
2082}
2083
2084/*
2085 * Callback for security_inode_init_security() for acquiring xattrs. 2051 * Callback for security_inode_init_security() for acquiring xattrs.
2086 */ 2052 */
2087static int shmem_initxattrs(struct inode *inode, 2053static int shmem_initxattrs(struct inode *inode,
@@ -2090,11 +2056,11 @@ static int shmem_initxattrs(struct inode *inode,
2090{ 2056{
2091 struct shmem_inode_info *info = SHMEM_I(inode); 2057 struct shmem_inode_info *info = SHMEM_I(inode);
2092 const struct xattr *xattr; 2058 const struct xattr *xattr;
2093 struct shmem_xattr *new_xattr; 2059 struct simple_xattr *new_xattr;
2094 size_t len; 2060 size_t len;
2095 2061
2096 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 2062 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
2097 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); 2063 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
2098 if (!new_xattr) 2064 if (!new_xattr)
2099 return -ENOMEM; 2065 return -ENOMEM;
2100 2066
@@ -2111,91 +2077,12 @@ static int shmem_initxattrs(struct inode *inode,
2111 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 2077 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
2112 xattr->name, len); 2078 xattr->name, len);
2113 2079
2114 spin_lock(&info->lock); 2080 simple_xattr_list_add(&info->xattrs, new_xattr);
2115 list_add(&new_xattr->list, &info->xattr_list);
2116 spin_unlock(&info->lock);
2117 } 2081 }
2118 2082
2119 return 0; 2083 return 0;
2120} 2084}
2121 2085
2122static int shmem_xattr_get(struct dentry *dentry, const char *name,
2123 void *buffer, size_t size)
2124{
2125 struct shmem_inode_info *info;
2126 struct shmem_xattr *xattr;
2127 int ret = -ENODATA;
2128
2129 info = SHMEM_I(dentry->d_inode);
2130
2131 spin_lock(&info->lock);
2132 list_for_each_entry(xattr, &info->xattr_list, list) {
2133 if (strcmp(name, xattr->name))
2134 continue;
2135
2136 ret = xattr->size;
2137 if (buffer) {
2138 if (size < xattr->size)
2139 ret = -ERANGE;
2140 else
2141 memcpy(buffer, xattr->value, xattr->size);
2142 }
2143 break;
2144 }
2145 spin_unlock(&info->lock);
2146 return ret;
2147}
2148
2149static int shmem_xattr_set(struct inode *inode, const char *name,
2150 const void *value, size_t size, int flags)
2151{
2152 struct shmem_inode_info *info = SHMEM_I(inode);
2153 struct shmem_xattr *xattr;
2154 struct shmem_xattr *new_xattr = NULL;
2155 int err = 0;
2156
2157 /* value == NULL means remove */
2158 if (value) {
2159 new_xattr = shmem_xattr_alloc(value, size);
2160 if (!new_xattr)
2161 return -ENOMEM;
2162
2163 new_xattr->name = kstrdup(name, GFP_KERNEL);
2164 if (!new_xattr->name) {
2165 kfree(new_xattr);
2166 return -ENOMEM;
2167 }
2168 }
2169
2170 spin_lock(&info->lock);
2171 list_for_each_entry(xattr, &info->xattr_list, list) {
2172 if (!strcmp(name, xattr->name)) {
2173 if (flags & XATTR_CREATE) {
2174 xattr = new_xattr;
2175 err = -EEXIST;
2176 } else if (new_xattr) {
2177 list_replace(&xattr->list, &new_xattr->list);
2178 } else {
2179 list_del(&xattr->list);
2180 }
2181 goto out;
2182 }
2183 }
2184 if (flags & XATTR_REPLACE) {
2185 xattr = new_xattr;
2186 err = -ENODATA;
2187 } else {
2188 list_add(&new_xattr->list, &info->xattr_list);
2189 xattr = NULL;
2190 }
2191out:
2192 spin_unlock(&info->lock);
2193 if (xattr)
2194 kfree(xattr->name);
2195 kfree(xattr);
2196 return err;
2197}
2198
2199static const struct xattr_handler *shmem_xattr_handlers[] = { 2086static const struct xattr_handler *shmem_xattr_handlers[] = {
2200#ifdef CONFIG_TMPFS_POSIX_ACL 2087#ifdef CONFIG_TMPFS_POSIX_ACL
2201 &generic_acl_access_handler, 2088 &generic_acl_access_handler,
@@ -2226,6 +2113,7 @@ static int shmem_xattr_validate(const char *name)
2226static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2113static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2227 void *buffer, size_t size) 2114 void *buffer, size_t size)
2228{ 2115{
2116 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2229 int err; 2117 int err;
2230 2118
2231 /* 2119 /*
@@ -2240,12 +2128,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2240 if (err) 2128 if (err)
2241 return err; 2129 return err;
2242 2130
2243 return shmem_xattr_get(dentry, name, buffer, size); 2131 return simple_xattr_get(&info->xattrs, name, buffer, size);
2244} 2132}
2245 2133
2246static int shmem_setxattr(struct dentry *dentry, const char *name, 2134static int shmem_setxattr(struct dentry *dentry, const char *name,
2247 const void *value, size_t size, int flags) 2135 const void *value, size_t size, int flags)
2248{ 2136{
2137 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2249 int err; 2138 int err;
2250 2139
2251 /* 2140 /*
@@ -2260,15 +2149,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
2260 if (err) 2149 if (err)
2261 return err; 2150 return err;
2262 2151
2263 if (size == 0) 2152 return simple_xattr_set(&info->xattrs, name, value, size, flags);
2264 value = ""; /* empty EA, do not remove */
2265
2266 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
2267
2268} 2153}
2269 2154
2270static int shmem_removexattr(struct dentry *dentry, const char *name) 2155static int shmem_removexattr(struct dentry *dentry, const char *name)
2271{ 2156{
2157 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2272 int err; 2158 int err;
2273 2159
2274 /* 2160 /*
@@ -2283,45 +2169,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
2283 if (err) 2169 if (err)
2284 return err; 2170 return err;
2285 2171
2286 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 2172 return simple_xattr_remove(&info->xattrs, name);
2287}
2288
2289static bool xattr_is_trusted(const char *name)
2290{
2291 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2292} 2173}
2293 2174
2294static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2175static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2295{ 2176{
2296 bool trusted = capable(CAP_SYS_ADMIN); 2177 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
2297 struct shmem_xattr *xattr; 2178 return simple_xattr_list(&info->xattrs, buffer, size);
2298 struct shmem_inode_info *info;
2299 size_t used = 0;
2300
2301 info = SHMEM_I(dentry->d_inode);
2302
2303 spin_lock(&info->lock);
2304 list_for_each_entry(xattr, &info->xattr_list, list) {
2305 size_t len;
2306
2307 /* skip "trusted." attributes for unprivileged callers */
2308 if (!trusted && xattr_is_trusted(xattr->name))
2309 continue;
2310
2311 len = strlen(xattr->name) + 1;
2312 used += len;
2313 if (buffer) {
2314 if (size < used) {
2315 used = -ERANGE;
2316 break;
2317 }
2318 memcpy(buffer, xattr->name, len);
2319 buffer += len;
2320 }
2321 }
2322 spin_unlock(&info->lock);
2323
2324 return used;
2325} 2179}
2326#endif /* CONFIG_TMPFS_XATTR */ 2180#endif /* CONFIG_TMPFS_XATTR */
2327 2181
@@ -2366,12 +2220,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2366{ 2220{
2367 struct inode *inode; 2221 struct inode *inode;
2368 struct dentry *dentry = NULL; 2222 struct dentry *dentry = NULL;
2369 u64 inum = fid->raw[2]; 2223 u64 inum;
2370 inum = (inum << 32) | fid->raw[1];
2371 2224
2372 if (fh_len < 3) 2225 if (fh_len < 3)
2373 return NULL; 2226 return NULL;
2374 2227
2228 inum = fid->raw[2];
2229 inum = (inum << 32) | fid->raw[1];
2230
2375 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2231 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2376 shmem_match, fid->raw); 2232 shmem_match, fid->raw);
2377 if (inode) { 2233 if (inode) {
@@ -2788,6 +2644,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
2788 .set_policy = shmem_set_policy, 2644 .set_policy = shmem_set_policy,
2789 .get_policy = shmem_get_policy, 2645 .get_policy = shmem_get_policy,
2790#endif 2646#endif
2647 .remap_pages = generic_file_remap_pages,
2791}; 2648};
2792 2649
2793static struct dentry *shmem_mount(struct file_system_type *fs_type, 2650static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2981,7 +2838,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2981 fput(vma->vm_file); 2838 fput(vma->vm_file);
2982 vma->vm_file = file; 2839 vma->vm_file = file;
2983 vma->vm_ops = &shmem_vm_ops; 2840 vma->vm_ops = &shmem_vm_ops;
2984 vma->vm_flags |= VM_CAN_NONLINEAR;
2985 return 0; 2841 return 0;
2986} 2842}
2987 2843
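In the shmem_fh_to_dentry() hunk above, the 64-bit inode number is now assembled from fid->raw[1] and fid->raw[2] only after the fh_len < 3 check rather than before it. A stand-alone sketch of that decode order with invented raw[] words (decode_inum() is illustrative, not a kernel function):

#include <stdio.h>
#include <stdint.h>

/* Reassemble a 64-bit inode number from two 32-bit file-handle words,
 * but only once the handle is known to be long enough.  Returning 0
 * as "reject" is a convention of this toy only. */
static uint64_t decode_inum(const uint32_t *raw, int fh_len)
{
        uint64_t inum;

        if (fh_len < 3)
                return 0;               /* short handle: do not read raw[1..2] */

        inum = raw[2];
        inum = (inum << 32) | raw[1];
        return inum;
}

int main(void)
{
        uint32_t raw[3] = { 0xdeadbeef, 0x00000042, 0x00000001 };

        printf("inum = %#llx\n", (unsigned long long)decode_inum(raw, 3));
        printf("short handle -> %#llx\n", (unsigned long long)decode_inum(raw, 2));
        return 0;
}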
diff --git a/mm/slab.c b/mm/slab.c
index c6854759bcf1..33d3363658df 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
498 498
499#endif 499#endif
500 500
501#ifdef CONFIG_TRACING
502size_t slab_buffer_size(struct kmem_cache *cachep)
503{
504 return cachep->size;
505}
506EXPORT_SYMBOL(slab_buffer_size);
507#endif
508
509/* 501/*
510 * Do not go above this order unless 0 objects fit into the slab or 502 * Do not go above this order unless 0 objects fit into the slab or
511 * overridden on the command line. 503 * overridden on the command line.
@@ -515,13 +507,6 @@ EXPORT_SYMBOL(slab_buffer_size);
515static int slab_max_order = SLAB_MAX_ORDER_LO; 507static int slab_max_order = SLAB_MAX_ORDER_LO;
516static bool slab_max_order_set __initdata; 508static bool slab_max_order_set __initdata;
517 509
518static inline struct kmem_cache *page_get_cache(struct page *page)
519{
520 page = compound_head(page);
521 BUG_ON(!PageSlab(page));
522 return page->slab_cache;
523}
524
525static inline struct kmem_cache *virt_to_cache(const void *obj) 510static inline struct kmem_cache *virt_to_cache(const void *obj)
526{ 511{
527 struct page *page = virt_to_head_page(obj); 512 struct page *page = virt_to_head_page(obj);
@@ -585,9 +570,9 @@ static struct arraycache_init initarray_generic =
585 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 570 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
586 571
587/* internal cache of cache description objs */ 572/* internal cache of cache description objs */
588static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; 573static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
589static struct kmem_cache cache_cache = { 574static struct kmem_cache kmem_cache_boot = {
590 .nodelists = cache_cache_nodelists, 575 .nodelists = kmem_cache_nodelists,
591 .batchcount = 1, 576 .batchcount = 1,
592 .limit = BOOT_CPUCACHE_ENTRIES, 577 .limit = BOOT_CPUCACHE_ENTRIES,
593 .shared = 1, 578 .shared = 1,
@@ -810,6 +795,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
810 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 795 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
811} 796}
812 797
798#if DEBUG
813#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 799#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
814 800
815static void __slab_error(const char *function, struct kmem_cache *cachep, 801static void __slab_error(const char *function, struct kmem_cache *cachep,
@@ -818,7 +804,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
818 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 804 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
819 function, cachep->name, msg); 805 function, cachep->name, msg);
820 dump_stack(); 806 dump_stack();
807 add_taint(TAINT_BAD_PAGE);
821} 808}
809#endif
822 810
823/* 811/*
824 * By default on NUMA we use alien caches to stage the freeing of 812 * By default on NUMA we use alien caches to stage the freeing of
@@ -900,7 +888,7 @@ static void __cpuinit start_cpu_timer(int cpu)
900 */ 888 */
901 if (keventd_up() && reap_work->work.func == NULL) { 889 if (keventd_up() && reap_work->work.func == NULL) {
902 init_reap_node(cpu); 890 init_reap_node(cpu);
903 INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); 891 INIT_DEFERRABLE_WORK(reap_work, cache_reap);
904 schedule_delayed_work_on(cpu, reap_work, 892 schedule_delayed_work_on(cpu, reap_work,
905 __round_jiffies_relative(HZ, cpu)); 893 __round_jiffies_relative(HZ, cpu));
906 } 894 }
@@ -1601,15 +1589,17 @@ void __init kmem_cache_init(void)
1601 int order; 1589 int order;
1602 int node; 1590 int node;
1603 1591
1592 kmem_cache = &kmem_cache_boot;
1593
1604 if (num_possible_nodes() == 1) 1594 if (num_possible_nodes() == 1)
1605 use_alien_caches = 0; 1595 use_alien_caches = 0;
1606 1596
1607 for (i = 0; i < NUM_INIT_LISTS; i++) { 1597 for (i = 0; i < NUM_INIT_LISTS; i++) {
1608 kmem_list3_init(&initkmem_list3[i]); 1598 kmem_list3_init(&initkmem_list3[i]);
1609 if (i < MAX_NUMNODES) 1599 if (i < MAX_NUMNODES)
1610 cache_cache.nodelists[i] = NULL; 1600 kmem_cache->nodelists[i] = NULL;
1611 } 1601 }
1612 set_up_list3s(&cache_cache, CACHE_CACHE); 1602 set_up_list3s(kmem_cache, CACHE_CACHE);
1613 1603
1614 /* 1604 /*
1615 * Fragmentation resistance on low memory - only use bigger 1605 * Fragmentation resistance on low memory - only use bigger
@@ -1621,9 +1611,9 @@ void __init kmem_cache_init(void)
1621 1611
1622 /* Bootstrap is tricky, because several objects are allocated 1612 /* Bootstrap is tricky, because several objects are allocated
1623 * from caches that do not exist yet: 1613 * from caches that do not exist yet:
1624 * 1) initialize the cache_cache cache: it contains the struct 1614 * 1) initialize the kmem_cache cache: it contains the struct
1625 * kmem_cache structures of all caches, except cache_cache itself: 1615 * kmem_cache structures of all caches, except kmem_cache itself:
1626 * cache_cache is statically allocated. 1616 * kmem_cache is statically allocated.
1627 * Initially an __init data area is used for the head array and the 1617 * Initially an __init data area is used for the head array and the
1628 * kmem_list3 structures, it's replaced with a kmalloc allocated 1618 * kmem_list3 structures, it's replaced with a kmalloc allocated
1629 * array at the end of the bootstrap. 1619 * array at the end of the bootstrap.
@@ -1632,43 +1622,43 @@ void __init kmem_cache_init(void)
1632 * An __init data area is used for the head array. 1622 * An __init data area is used for the head array.
1633 * 3) Create the remaining kmalloc caches, with minimally sized 1623 * 3) Create the remaining kmalloc caches, with minimally sized
1634 * head arrays. 1624 * head arrays.
1635 * 4) Replace the __init data head arrays for cache_cache and the first 1625 * 4) Replace the __init data head arrays for kmem_cache and the first
1636 * kmalloc cache with kmalloc allocated arrays. 1626 * kmalloc cache with kmalloc allocated arrays.
1637 * 5) Replace the __init data for kmem_list3 for cache_cache and 1627 * 5) Replace the __init data for kmem_list3 for kmem_cache and
1638 * the other cache's with kmalloc allocated memory. 1628 * the other cache's with kmalloc allocated memory.
1639 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1629 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1640 */ 1630 */
1641 1631
1642 node = numa_mem_id(); 1632 node = numa_mem_id();
1643 1633
1644 /* 1) create the cache_cache */ 1634 /* 1) create the kmem_cache */
1645 INIT_LIST_HEAD(&slab_caches); 1635 INIT_LIST_HEAD(&slab_caches);
1646 list_add(&cache_cache.list, &slab_caches); 1636 list_add(&kmem_cache->list, &slab_caches);
1647 cache_cache.colour_off = cache_line_size(); 1637 kmem_cache->colour_off = cache_line_size();
1648 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1638 kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
1649 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1639 kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1650 1640
1651 /* 1641 /*
1652 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1642 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1653 */ 1643 */
1654 cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1644 kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1655 nr_node_ids * sizeof(struct kmem_list3 *); 1645 nr_node_ids * sizeof(struct kmem_list3 *);
1656 cache_cache.object_size = cache_cache.size; 1646 kmem_cache->object_size = kmem_cache->size;
1657 cache_cache.size = ALIGN(cache_cache.size, 1647 kmem_cache->size = ALIGN(kmem_cache->object_size,
1658 cache_line_size()); 1648 cache_line_size());
1659 cache_cache.reciprocal_buffer_size = 1649 kmem_cache->reciprocal_buffer_size =
1660 reciprocal_value(cache_cache.size); 1650 reciprocal_value(kmem_cache->size);
1661 1651
1662 for (order = 0; order < MAX_ORDER; order++) { 1652 for (order = 0; order < MAX_ORDER; order++) {
1663 cache_estimate(order, cache_cache.size, 1653 cache_estimate(order, kmem_cache->size,
1664 cache_line_size(), 0, &left_over, &cache_cache.num); 1654 cache_line_size(), 0, &left_over, &kmem_cache->num);
1665 if (cache_cache.num) 1655 if (kmem_cache->num)
1666 break; 1656 break;
1667 } 1657 }
1668 BUG_ON(!cache_cache.num); 1658 BUG_ON(!kmem_cache->num);
1669 cache_cache.gfporder = order; 1659 kmem_cache->gfporder = order;
1670 cache_cache.colour = left_over / cache_cache.colour_off; 1660 kmem_cache->colour = left_over / kmem_cache->colour_off;
1671 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1661 kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
1672 sizeof(struct slab), cache_line_size()); 1662 sizeof(struct slab), cache_line_size());
1673 1663
1674 /* 2+3) create the kmalloc caches */ 1664 /* 2+3) create the kmalloc caches */
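kmem_cache_init() above sizes the boot-time kmem_cache as offsetof(struct kmem_cache, array[nr_cpu_ids]) plus nr_node_ids pointers, and __kmem_cache_create() then points nodelists just past the per-CPU array. A stand-alone model of that single-allocation layout, with a toy struct and invented CPU/node counts (struct toy_cache is a stand-in, not the kernel's struct kmem_cache):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

enum { NR_CPU_IDS = 4, NR_NODE_IDS = 2 };

struct toy_cache {
        unsigned int batchcount;
        void *array[];                  /* per-CPU entries */
};

int main(void)
{
        /* one allocation: the struct, its per-CPU array, then the node table */
        size_t size = offsetof(struct toy_cache, array[NR_CPU_IDS]) +
                      NR_NODE_IDS * sizeof(void *);
        struct toy_cache *c = calloc(1, size);
        void **nodelists;

        if (!c)
                return 1;
        nodelists = (void **)&c->array[NR_CPU_IDS];     /* just past the per-CPU array */
        printf("total %zu bytes, node table at offset %td\n",
               size, (char *)nodelists - (char *)c);
        free(c);
        return 0;
}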
@@ -1681,19 +1671,22 @@ void __init kmem_cache_init(void)
1681 * bug. 1671 * bug.
1682 */ 1672 */
1683 1673
1684 sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, 1674 sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1685 sizes[INDEX_AC].cs_size, 1675 sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
1686 ARCH_KMALLOC_MINALIGN, 1676 sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
1687 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1677 sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
1688 NULL); 1678 sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1679 __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1680 list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
1689 1681
1690 if (INDEX_AC != INDEX_L3) { 1682 if (INDEX_AC != INDEX_L3) {
1691 sizes[INDEX_L3].cs_cachep = 1683 sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1692 __kmem_cache_create(names[INDEX_L3].name, 1684 sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
1693 sizes[INDEX_L3].cs_size, 1685 sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
1694 ARCH_KMALLOC_MINALIGN, 1686 sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
1695 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1687 sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1696 NULL); 1688 __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1689 list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
1697 } 1690 }
1698 1691
1699 slab_early_init = 0; 1692 slab_early_init = 0;
@@ -1707,20 +1700,23 @@ void __init kmem_cache_init(void)
1707 * allow tighter packing of the smaller caches. 1700 * allow tighter packing of the smaller caches.
1708 */ 1701 */
1709 if (!sizes->cs_cachep) { 1702 if (!sizes->cs_cachep) {
1710 sizes->cs_cachep = __kmem_cache_create(names->name, 1703 sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1711 sizes->cs_size, 1704 sizes->cs_cachep->name = names->name;
1712 ARCH_KMALLOC_MINALIGN, 1705 sizes->cs_cachep->size = sizes->cs_size;
1713 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1706 sizes->cs_cachep->object_size = sizes->cs_size;
1714 NULL); 1707 sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1708 __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1709 list_add(&sizes->cs_cachep->list, &slab_caches);
1715 } 1710 }
1716#ifdef CONFIG_ZONE_DMA 1711#ifdef CONFIG_ZONE_DMA
1717 sizes->cs_dmacachep = __kmem_cache_create( 1712 sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1718 names->name_dma, 1713 sizes->cs_dmacachep->name = names->name_dma;
1719 sizes->cs_size, 1714 sizes->cs_dmacachep->size = sizes->cs_size;
1720 ARCH_KMALLOC_MINALIGN, 1715 sizes->cs_dmacachep->object_size = sizes->cs_size;
1721 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| 1716 sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
1722 SLAB_PANIC, 1717 __kmem_cache_create(sizes->cs_dmacachep,
1723 NULL); 1718 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
1719 list_add(&sizes->cs_dmacachep->list, &slab_caches);
1724#endif 1720#endif
1725 sizes++; 1721 sizes++;
1726 names++; 1722 names++;
@@ -1731,15 +1727,15 @@ void __init kmem_cache_init(void)
1731 1727
1732 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1728 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1733 1729
1734 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1730 BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
1735 memcpy(ptr, cpu_cache_get(&cache_cache), 1731 memcpy(ptr, cpu_cache_get(kmem_cache),
1736 sizeof(struct arraycache_init)); 1732 sizeof(struct arraycache_init));
1737 /* 1733 /*
1738 * Do not assume that spinlocks can be initialized via memcpy: 1734 * Do not assume that spinlocks can be initialized via memcpy:
1739 */ 1735 */
1740 spin_lock_init(&ptr->lock); 1736 spin_lock_init(&ptr->lock);
1741 1737
1742 cache_cache.array[smp_processor_id()] = ptr; 1738 kmem_cache->array[smp_processor_id()] = ptr;
1743 1739
1744 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1740 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1745 1741
@@ -1760,7 +1756,7 @@ void __init kmem_cache_init(void)
1760 int nid; 1756 int nid;
1761 1757
1762 for_each_online_node(nid) { 1758 for_each_online_node(nid) {
1763 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); 1759 init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
1764 1760
1765 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1761 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1766 &initkmem_list3[SIZE_AC + nid], nid); 1762 &initkmem_list3[SIZE_AC + nid], nid);
@@ -1781,9 +1777,6 @@ void __init kmem_cache_init_late(void)
1781 1777
1782 slab_state = UP; 1778 slab_state = UP;
1783 1779
1784 /* Annotate slab for lockdep -- annotate the malloc caches */
1785 init_lock_keys();
1786
1787 /* 6) resize the head arrays to their final sizes */ 1780 /* 6) resize the head arrays to their final sizes */
1788 mutex_lock(&slab_mutex); 1781 mutex_lock(&slab_mutex);
1789 list_for_each_entry(cachep, &slab_caches, list) 1782 list_for_each_entry(cachep, &slab_caches, list)
@@ -1791,6 +1784,9 @@ void __init kmem_cache_init_late(void)
1791 BUG(); 1784 BUG();
1792 mutex_unlock(&slab_mutex); 1785 mutex_unlock(&slab_mutex);
1793 1786
1787 /* Annotate slab for lockdep -- annotate the malloc caches */
1788 init_lock_keys();
1789
1794 /* Done! */ 1790 /* Done! */
1795 slab_state = FULL; 1791 slab_state = FULL;
1796 1792
@@ -2209,27 +2205,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
2209 } 2205 }
2210} 2206}
2211 2207
2212static void __kmem_cache_destroy(struct kmem_cache *cachep)
2213{
2214 int i;
2215 struct kmem_list3 *l3;
2216
2217 for_each_online_cpu(i)
2218 kfree(cachep->array[i]);
2219
2220 /* NUMA: free the list3 structures */
2221 for_each_online_node(i) {
2222 l3 = cachep->nodelists[i];
2223 if (l3) {
2224 kfree(l3->shared);
2225 free_alien_cache(l3->alien);
2226 kfree(l3);
2227 }
2228 }
2229 kmem_cache_free(&cache_cache, cachep);
2230}
2231
2232
2233/** 2208/**
2234 * calculate_slab_order - calculate size (page order) of slabs 2209 * calculate_slab_order - calculate size (page order) of slabs
2235 * @cachep: pointer to the cache that is being created 2210 * @cachep: pointer to the cache that is being created
@@ -2366,9 +2341,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2366 * Cannot be called within a int, but can be interrupted. 2341 * Cannot be called within a int, but can be interrupted.
2367 * The @ctor is run when new pages are allocated by the cache. 2342 * The @ctor is run when new pages are allocated by the cache.
2368 * 2343 *
2369 * @name must be valid until the cache is destroyed. This implies that
2370 * the module calling this has to destroy the cache before getting unloaded.
2371 *
2372 * The flags are 2344 * The flags are
2373 * 2345 *
2374 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2346 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -2381,13 +2353,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2381 * cacheline. This can be beneficial if you're counting cycles as closely 2353 * cacheline. This can be beneficial if you're counting cycles as closely
2382 * as davem. 2354 * as davem.
2383 */ 2355 */
2384struct kmem_cache * 2356int
2385__kmem_cache_create (const char *name, size_t size, size_t align, 2357__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2386 unsigned long flags, void (*ctor)(void *))
2387{ 2358{
2388 size_t left_over, slab_size, ralign; 2359 size_t left_over, slab_size, ralign;
2389 struct kmem_cache *cachep = NULL;
2390 gfp_t gfp; 2360 gfp_t gfp;
2361 int err;
2362 size_t size = cachep->size;
2391 2363
2392#if DEBUG 2364#if DEBUG
2393#if FORCED_DEBUG 2365#if FORCED_DEBUG
@@ -2459,8 +2431,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2459 ralign = ARCH_SLAB_MINALIGN; 2431 ralign = ARCH_SLAB_MINALIGN;
2460 } 2432 }
2461 /* 3) caller mandated alignment */ 2433 /* 3) caller mandated alignment */
2462 if (ralign < align) { 2434 if (ralign < cachep->align) {
2463 ralign = align; 2435 ralign = cachep->align;
2464 } 2436 }
2465 /* disable debug if necessary */ 2437 /* disable debug if necessary */
2466 if (ralign > __alignof__(unsigned long long)) 2438 if (ralign > __alignof__(unsigned long long))
@@ -2468,21 +2440,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2468 /* 2440 /*
2469 * 4) Store it. 2441 * 4) Store it.
2470 */ 2442 */
2471 align = ralign; 2443 cachep->align = ralign;
2472 2444
2473 if (slab_is_available()) 2445 if (slab_is_available())
2474 gfp = GFP_KERNEL; 2446 gfp = GFP_KERNEL;
2475 else 2447 else
2476 gfp = GFP_NOWAIT; 2448 gfp = GFP_NOWAIT;
2477 2449
2478 /* Get cache's description obj. */
2479 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2480 if (!cachep)
2481 return NULL;
2482
2483 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2450 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2484 cachep->object_size = size;
2485 cachep->align = align;
2486#if DEBUG 2451#if DEBUG
2487 2452
2488 /* 2453 /*
@@ -2506,8 +2471,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2506 } 2471 }
2507#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2472#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2508 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2473 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2509 && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { 2474 && cachep->object_size > cache_line_size()
2510 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); 2475 && ALIGN(size, cachep->align) < PAGE_SIZE) {
2476 cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
2511 size = PAGE_SIZE; 2477 size = PAGE_SIZE;
2512 } 2478 }
2513#endif 2479#endif
@@ -2527,18 +2493,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2527 */ 2493 */
2528 flags |= CFLGS_OFF_SLAB; 2494 flags |= CFLGS_OFF_SLAB;
2529 2495
2530 size = ALIGN(size, align); 2496 size = ALIGN(size, cachep->align);
2531 2497
2532 left_over = calculate_slab_order(cachep, size, align, flags); 2498 left_over = calculate_slab_order(cachep, size, cachep->align, flags);
2499
2500 if (!cachep->num)
2501 return -E2BIG;
2533 2502
2534 if (!cachep->num) {
2535 printk(KERN_ERR
2536 "kmem_cache_create: couldn't create cache %s.\n", name);
2537 kmem_cache_free(&cache_cache, cachep);
2538 return NULL;
2539 }
2540 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2503 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2541 + sizeof(struct slab), align); 2504 + sizeof(struct slab), cachep->align);
2542 2505
2543 /* 2506 /*
2544 * If the slab has been placed off-slab, and we have enough space then 2507 * If the slab has been placed off-slab, and we have enough space then
@@ -2566,8 +2529,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2566 2529
2567 cachep->colour_off = cache_line_size(); 2530 cachep->colour_off = cache_line_size();
2568 /* Offset must be a multiple of the alignment. */ 2531 /* Offset must be a multiple of the alignment. */
2569 if (cachep->colour_off < align) 2532 if (cachep->colour_off < cachep->align)
2570 cachep->colour_off = align; 2533 cachep->colour_off = cachep->align;
2571 cachep->colour = left_over / cachep->colour_off; 2534 cachep->colour = left_over / cachep->colour_off;
2572 cachep->slab_size = slab_size; 2535 cachep->slab_size = slab_size;
2573 cachep->flags = flags; 2536 cachep->flags = flags;
@@ -2588,12 +2551,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2588 */ 2551 */
2589 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); 2552 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
2590 } 2553 }
2591 cachep->ctor = ctor;
2592 cachep->name = name;
2593 2554
2594 if (setup_cpu_cache(cachep, gfp)) { 2555 err = setup_cpu_cache(cachep, gfp);
2595 __kmem_cache_destroy(cachep); 2556 if (err) {
2596 return NULL; 2557 __kmem_cache_shutdown(cachep);
2558 return err;
2597 } 2559 }
2598 2560
2599 if (flags & SLAB_DEBUG_OBJECTS) { 2561 if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2606,9 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
2606 slab_set_debugobj_lock_classes(cachep); 2568 slab_set_debugobj_lock_classes(cachep);
2607 } 2569 }
2608 2570
2609 /* cache setup completed, link it into the list */ 2571 return 0;
2610 list_add(&cachep->list, &slab_caches);
2611 return cachep;
2612} 2572}
2613 2573
2614#if DEBUG 2574#if DEBUG
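The rewritten __kmem_cache_create() above repeatedly rounds sizes with ALIGN(size, cachep->align). A stand-alone illustration of that power-of-two round-up, using a local ALIGN_UP macro and an assumed 64-byte cache line:

#include <stdio.h>

/* Round x up to the next multiple of a; valid only for power-of-two a,
 * which is what cache alignments are. */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long align = 64;       /* assumed cache_line_size() */
        unsigned long sizes[] = { 24, 64, 100, 129 };
        size_t i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("%3lu -> %3lu\n", sizes[i], ALIGN_UP(sizes[i], align));
        return 0;
}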
@@ -2767,49 +2727,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
2767} 2727}
2768EXPORT_SYMBOL(kmem_cache_shrink); 2728EXPORT_SYMBOL(kmem_cache_shrink);
2769 2729
2770/** 2730int __kmem_cache_shutdown(struct kmem_cache *cachep)
2771 * kmem_cache_destroy - delete a cache
2772 * @cachep: the cache to destroy
2773 *
2774 * Remove a &struct kmem_cache object from the slab cache.
2775 *
2776 * It is expected this function will be called by a module when it is
2777 * unloaded. This will remove the cache completely, and avoid a duplicate
2778 * cache being allocated each time a module is loaded and unloaded, if the
2779 * module doesn't have persistent in-kernel storage across loads and unloads.
2780 *
2781 * The cache must be empty before calling this function.
2782 *
2783 * The caller must guarantee that no one will allocate memory from the cache
2784 * during the kmem_cache_destroy().
2785 */
2786void kmem_cache_destroy(struct kmem_cache *cachep)
2787{ 2731{
2788 BUG_ON(!cachep || in_interrupt()); 2732 int i;
2733 struct kmem_list3 *l3;
2734 int rc = __cache_shrink(cachep);
2789 2735
2790 /* Find the cache in the chain of caches. */ 2736 if (rc)
2791 get_online_cpus(); 2737 return rc;
2792 mutex_lock(&slab_mutex);
2793 /*
2794 * the chain is never empty, cache_cache is never destroyed
2795 */
2796 list_del(&cachep->list);
2797 if (__cache_shrink(cachep)) {
2798 slab_error(cachep, "Can't free all objects");
2799 list_add(&cachep->list, &slab_caches);
2800 mutex_unlock(&slab_mutex);
2801 put_online_cpus();
2802 return;
2803 }
2804 2738
2805 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2739 for_each_online_cpu(i)
2806 rcu_barrier(); 2740 kfree(cachep->array[i]);
2807 2741
2808 __kmem_cache_destroy(cachep); 2742 /* NUMA: free the list3 structures */
2809 mutex_unlock(&slab_mutex); 2743 for_each_online_node(i) {
2810 put_online_cpus(); 2744 l3 = cachep->nodelists[i];
2745 if (l3) {
2746 kfree(l3->shared);
2747 free_alien_cache(l3->alien);
2748 kfree(l3);
2749 }
2750 }
2751 return 0;
2811} 2752}
2812EXPORT_SYMBOL(kmem_cache_destroy);
2813 2753
2814/* 2754/*
2815 * Get the memory for a slab management obj. 2755 * Get the memory for a slab management obj.
@@ -3098,7 +3038,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
3098} 3038}
3099 3039
3100static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 3040static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3101 void *caller) 3041 unsigned long caller)
3102{ 3042{
3103 struct page *page; 3043 struct page *page;
3104 unsigned int objnr; 3044 unsigned int objnr;
@@ -3118,7 +3058,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3118 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 3058 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
3119 } 3059 }
3120 if (cachep->flags & SLAB_STORE_USER) 3060 if (cachep->flags & SLAB_STORE_USER)
3121 *dbg_userword(cachep, objp) = caller; 3061 *dbg_userword(cachep, objp) = (void *)caller;
3122 3062
3123 objnr = obj_to_index(cachep, slabp, objp); 3063 objnr = obj_to_index(cachep, slabp, objp);
3124 3064
@@ -3131,7 +3071,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3131 if (cachep->flags & SLAB_POISON) { 3071 if (cachep->flags & SLAB_POISON) {
3132#ifdef CONFIG_DEBUG_PAGEALLOC 3072#ifdef CONFIG_DEBUG_PAGEALLOC
3133 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3073 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3134 store_stackinfo(cachep, objp, (unsigned long)caller); 3074 store_stackinfo(cachep, objp, caller);
3135 kernel_map_pages(virt_to_page(objp), 3075 kernel_map_pages(virt_to_page(objp),
3136 cachep->size / PAGE_SIZE, 0); 3076 cachep->size / PAGE_SIZE, 0);
3137 } else { 3077 } else {
@@ -3285,7 +3225,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3285 3225
3286#if DEBUG 3226#if DEBUG
3287static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3227static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3288 gfp_t flags, void *objp, void *caller) 3228 gfp_t flags, void *objp, unsigned long caller)
3289{ 3229{
3290 if (!objp) 3230 if (!objp)
3291 return objp; 3231 return objp;
@@ -3302,7 +3242,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3302 poison_obj(cachep, objp, POISON_INUSE); 3242 poison_obj(cachep, objp, POISON_INUSE);
3303 } 3243 }
3304 if (cachep->flags & SLAB_STORE_USER) 3244 if (cachep->flags & SLAB_STORE_USER)
3305 *dbg_userword(cachep, objp) = caller; 3245 *dbg_userword(cachep, objp) = (void *)caller;
3306 3246
3307 if (cachep->flags & SLAB_RED_ZONE) { 3247 if (cachep->flags & SLAB_RED_ZONE) {
3308 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3248 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
@@ -3343,7 +3283,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3343 3283
3344static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 3284static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3345{ 3285{
3346 if (cachep == &cache_cache) 3286 if (cachep == kmem_cache)
3347 return false; 3287 return false;
3348 3288
3349 return should_failslab(cachep->object_size, flags, cachep->flags); 3289 return should_failslab(cachep->object_size, flags, cachep->flags);
@@ -3576,8 +3516,8 @@ done:
3576 * Fallback to other node is possible if __GFP_THISNODE is not set. 3516 * Fallback to other node is possible if __GFP_THISNODE is not set.
3577 */ 3517 */
3578static __always_inline void * 3518static __always_inline void *
3579__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3519slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3580 void *caller) 3520 unsigned long caller)
3581{ 3521{
3582 unsigned long save_flags; 3522 unsigned long save_flags;
3583 void *ptr; 3523 void *ptr;
@@ -3663,7 +3603,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3663#endif /* CONFIG_NUMA */ 3603#endif /* CONFIG_NUMA */
3664 3604
3665static __always_inline void * 3605static __always_inline void *
3666__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 3606slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3667{ 3607{
3668 unsigned long save_flags; 3608 unsigned long save_flags;
3669 void *objp; 3609 void *objp;
@@ -3799,7 +3739,7 @@ free_done:
3799 * be in this state _before_ it is released. Called with disabled ints. 3739 * be in this state _before_ it is released. Called with disabled ints.
3800 */ 3740 */
3801static inline void __cache_free(struct kmem_cache *cachep, void *objp, 3741static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3802 void *caller) 3742 unsigned long caller)
3803{ 3743{
3804 struct array_cache *ac = cpu_cache_get(cachep); 3744 struct array_cache *ac = cpu_cache_get(cachep);
3805 3745
@@ -3839,7 +3779,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3839 */ 3779 */
3840void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3780void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3841{ 3781{
3842 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3782 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3843 3783
3844 trace_kmem_cache_alloc(_RET_IP_, ret, 3784 trace_kmem_cache_alloc(_RET_IP_, ret,
3845 cachep->object_size, cachep->size, flags); 3785 cachep->object_size, cachep->size, flags);
@@ -3850,14 +3790,14 @@ EXPORT_SYMBOL(kmem_cache_alloc);
3850 3790
3851#ifdef CONFIG_TRACING 3791#ifdef CONFIG_TRACING
3852void * 3792void *
3853kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) 3793kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3854{ 3794{
3855 void *ret; 3795 void *ret;
3856 3796
3857 ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3797 ret = slab_alloc(cachep, flags, _RET_IP_);
3858 3798
3859 trace_kmalloc(_RET_IP_, ret, 3799 trace_kmalloc(_RET_IP_, ret,
3860 size, slab_buffer_size(cachep), flags); 3800 size, cachep->size, flags);
3861 return ret; 3801 return ret;
3862} 3802}
3863EXPORT_SYMBOL(kmem_cache_alloc_trace); 3803EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -3866,8 +3806,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
3866#ifdef CONFIG_NUMA 3806#ifdef CONFIG_NUMA
3867void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3807void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3868{ 3808{
3869 void *ret = __cache_alloc_node(cachep, flags, nodeid, 3809 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3870 __builtin_return_address(0));
3871 3810
3872 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3811 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3873 cachep->object_size, cachep->size, 3812 cachep->object_size, cachep->size,
@@ -3878,17 +3817,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3878EXPORT_SYMBOL(kmem_cache_alloc_node); 3817EXPORT_SYMBOL(kmem_cache_alloc_node);
3879 3818
3880#ifdef CONFIG_TRACING 3819#ifdef CONFIG_TRACING
3881void *kmem_cache_alloc_node_trace(size_t size, 3820void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3882 struct kmem_cache *cachep,
3883 gfp_t flags, 3821 gfp_t flags,
3884 int nodeid) 3822 int nodeid,
3823 size_t size)
3885{ 3824{
3886 void *ret; 3825 void *ret;
3887 3826
3888 ret = __cache_alloc_node(cachep, flags, nodeid, 3827 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3889 __builtin_return_address(0)); 3828
3890 trace_kmalloc_node(_RET_IP_, ret, 3829 trace_kmalloc_node(_RET_IP_, ret,
3891 size, slab_buffer_size(cachep), 3830 size, cachep->size,
3892 flags, nodeid); 3831 flags, nodeid);
3893 return ret; 3832 return ret;
3894} 3833}
@@ -3896,34 +3835,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3896#endif 3835#endif
3897 3836
3898static __always_inline void * 3837static __always_inline void *
3899__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3838__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3900{ 3839{
3901 struct kmem_cache *cachep; 3840 struct kmem_cache *cachep;
3902 3841
3903 cachep = kmem_find_general_cachep(size, flags); 3842 cachep = kmem_find_general_cachep(size, flags);
3904 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3843 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3905 return cachep; 3844 return cachep;
3906 return kmem_cache_alloc_node_trace(size, cachep, flags, node); 3845 return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3907} 3846}
3908 3847
3909#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3848#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3910void *__kmalloc_node(size_t size, gfp_t flags, int node) 3849void *__kmalloc_node(size_t size, gfp_t flags, int node)
3911{ 3850{
3912 return __do_kmalloc_node(size, flags, node, 3851 return __do_kmalloc_node(size, flags, node, _RET_IP_);
3913 __builtin_return_address(0));
3914} 3852}
3915EXPORT_SYMBOL(__kmalloc_node); 3853EXPORT_SYMBOL(__kmalloc_node);
3916 3854
3917void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3855void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3918 int node, unsigned long caller) 3856 int node, unsigned long caller)
3919{ 3857{
3920 return __do_kmalloc_node(size, flags, node, (void *)caller); 3858 return __do_kmalloc_node(size, flags, node, caller);
3921} 3859}
3922EXPORT_SYMBOL(__kmalloc_node_track_caller); 3860EXPORT_SYMBOL(__kmalloc_node_track_caller);
3923#else 3861#else
3924void *__kmalloc_node(size_t size, gfp_t flags, int node) 3862void *__kmalloc_node(size_t size, gfp_t flags, int node)
3925{ 3863{
3926 return __do_kmalloc_node(size, flags, node, NULL); 3864 return __do_kmalloc_node(size, flags, node, 0);
3927} 3865}
3928EXPORT_SYMBOL(__kmalloc_node); 3866EXPORT_SYMBOL(__kmalloc_node);
3929#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ 3867#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
@@ -3936,7 +3874,7 @@ EXPORT_SYMBOL(__kmalloc_node);
3936 * @caller: function caller for debug tracking of the caller 3874 * @caller: function caller for debug tracking of the caller
3937 */ 3875 */
3938static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3876static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3939 void *caller) 3877 unsigned long caller)
3940{ 3878{
3941 struct kmem_cache *cachep; 3879 struct kmem_cache *cachep;
3942 void *ret; 3880 void *ret;
@@ -3949,9 +3887,9 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3949 cachep = __find_general_cachep(size, flags); 3887 cachep = __find_general_cachep(size, flags);
3950 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3888 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3951 return cachep; 3889 return cachep;
3952 ret = __cache_alloc(cachep, flags, caller); 3890 ret = slab_alloc(cachep, flags, caller);
3953 3891
3954 trace_kmalloc((unsigned long) caller, ret, 3892 trace_kmalloc(caller, ret,
3955 size, cachep->size, flags); 3893 size, cachep->size, flags);
3956 3894
3957 return ret; 3895 return ret;
@@ -3961,20 +3899,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3961#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3899#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3962void *__kmalloc(size_t size, gfp_t flags) 3900void *__kmalloc(size_t size, gfp_t flags)
3963{ 3901{
3964 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3902 return __do_kmalloc(size, flags, _RET_IP_);
3965} 3903}
3966EXPORT_SYMBOL(__kmalloc); 3904EXPORT_SYMBOL(__kmalloc);
3967 3905
3968void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) 3906void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3969{ 3907{
3970 return __do_kmalloc(size, flags, (void *)caller); 3908 return __do_kmalloc(size, flags, caller);
3971} 3909}
3972EXPORT_SYMBOL(__kmalloc_track_caller); 3910EXPORT_SYMBOL(__kmalloc_track_caller);
3973 3911
3974#else 3912#else
3975void *__kmalloc(size_t size, gfp_t flags) 3913void *__kmalloc(size_t size, gfp_t flags)
3976{ 3914{
3977 return __do_kmalloc(size, flags, NULL); 3915 return __do_kmalloc(size, flags, 0);
3978} 3916}
3979EXPORT_SYMBOL(__kmalloc); 3917EXPORT_SYMBOL(__kmalloc);
3980#endif 3918#endif
@@ -3995,7 +3933,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3995 debug_check_no_locks_freed(objp, cachep->object_size); 3933 debug_check_no_locks_freed(objp, cachep->object_size);
3996 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3934 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3997 debug_check_no_obj_freed(objp, cachep->object_size); 3935 debug_check_no_obj_freed(objp, cachep->object_size);
3998 __cache_free(cachep, objp, __builtin_return_address(0)); 3936 __cache_free(cachep, objp, _RET_IP_);
3999 local_irq_restore(flags); 3937 local_irq_restore(flags);
4000 3938
4001 trace_kmem_cache_free(_RET_IP_, objp); 3939 trace_kmem_cache_free(_RET_IP_, objp);
@@ -4026,7 +3964,7 @@ void kfree(const void *objp)
4026 debug_check_no_locks_freed(objp, c->object_size); 3964 debug_check_no_locks_freed(objp, c->object_size);
4027 3965
4028 debug_check_no_obj_freed(objp, c->object_size); 3966 debug_check_no_obj_freed(objp, c->object_size);
4029 __cache_free(c, (void *)objp, __builtin_return_address(0)); 3967 __cache_free(c, (void *)objp, _RET_IP_);
4030 local_irq_restore(flags); 3968 local_irq_restore(flags);
4031} 3969}
4032EXPORT_SYMBOL(kfree); 3970EXPORT_SYMBOL(kfree);
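Note on the mm/slab.c hunks above: the debug "caller" cookie changes from a void * taken with __builtin_return_address(0) to an unsigned long from _RET_IP_, and the internal entry points __cache_alloc()/__cache_alloc_node() become slab_alloc()/slab_alloc_node(). A condensed sketch of the resulting wrapper pattern, lifted from the hunks rather than a standalone build:

	/* sketch: exported allocators now pass _RET_IP_ straight through */
	void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
	{
		void *ret = slab_alloc(cachep, flags, _RET_IP_);

		trace_kmem_cache_alloc(_RET_IP_, ret,
				       cachep->object_size, cachep->size, flags);
		return ret;
	}

	void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
	{
		/* caller is already an unsigned long, so no (void *) cast is needed */
		return __do_kmalloc(size, flags, caller);
	}

Inside the debug paths the value is only cast once, when it is stored: *dbg_userword(cachep, objp) = (void *)caller.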
diff --git a/mm/slab.h b/mm/slab.h
index db7848caaa25..7deeb449a301 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -25,9 +25,26 @@ extern enum slab_state slab_state;
25 25
26/* The slab cache mutex protects the management structures during changes */ 26/* The slab cache mutex protects the management structures during changes */
27extern struct mutex slab_mutex; 27extern struct mutex slab_mutex;
28
29/* The list of all slab caches on the system */
28extern struct list_head slab_caches; 30extern struct list_head slab_caches;
29 31
30struct kmem_cache *__kmem_cache_create(const char *name, size_t size, 32/* The slab cache that manages slab cache information */
33extern struct kmem_cache *kmem_cache;
34
35/* Functions provided by the slab allocators */
36extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
37
38#ifdef CONFIG_SLUB
39struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
31 size_t align, unsigned long flags, void (*ctor)(void *)); 40 size_t align, unsigned long flags, void (*ctor)(void *));
41#else
42static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
43 size_t align, unsigned long flags, void (*ctor)(void *))
44{ return NULL; }
45#endif
46
47
48int __kmem_cache_shutdown(struct kmem_cache *);
32 49
33#endif 50#endif
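Note on the mm/slab.h hunk above: it spells out the new contract between mm/slab_common.c and the individual allocators. slab_common owns the `kmem_cache` cache of struct kmem_cache, and each back end supplies __kmem_cache_create() on an already-populated descriptor plus __kmem_cache_shutdown(); only SLUB provides a real __kmem_cache_alias(). A minimal sketch of what a back end has to implement under this interface (the two helper names are placeholders for illustration, not functions from the patch):

	/* sketch of the per-allocator hooks declared above */
	int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
	{
		/*
		 * name, size, object_size, align and ctor have already been
		 * filled in by kmem_cache_create() in slab_common.c.
		 */
		return init_backend_state(s, flags);	/* placeholder helper */
	}

	int __kmem_cache_shutdown(struct kmem_cache *s)
	{
		/* non-zero means objects are still live and the cache must stay */
		return release_backend_state(s);	/* placeholder helper */
	}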
diff --git a/mm/slab_common.c b/mm/slab_common.c
index aa3ca5bb01b5..069a24e64403 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -22,6 +22,53 @@
22enum slab_state slab_state; 22enum slab_state slab_state;
23LIST_HEAD(slab_caches); 23LIST_HEAD(slab_caches);
24DEFINE_MUTEX(slab_mutex); 24DEFINE_MUTEX(slab_mutex);
25struct kmem_cache *kmem_cache;
26
27#ifdef CONFIG_DEBUG_VM
28static int kmem_cache_sanity_check(const char *name, size_t size)
29{
30 struct kmem_cache *s = NULL;
31
32 if (!name || in_interrupt() || size < sizeof(void *) ||
33 size > KMALLOC_MAX_SIZE) {
34 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
35 return -EINVAL;
36 }
37
38 list_for_each_entry(s, &slab_caches, list) {
39 char tmp;
40 int res;
41
42 /*
43 * This happens when the module gets unloaded and doesn't
44 * destroy its slab cache and no-one else reuses the vmalloc
45 * area of the module. Print a warning.
46 */
47 res = probe_kernel_address(s->name, tmp);
48 if (res) {
49 pr_err("Slab cache with size %d has lost its name\n",
50 s->object_size);
51 continue;
52 }
53
54 if (!strcmp(s->name, name)) {
55 pr_err("%s (%s): Cache name already exists.\n",
56 __func__, name);
57 dump_stack();
58 s = NULL;
59 return -EINVAL;
60 }
61 }
62
63 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
64 return 0;
65}
66#else
67static inline int kmem_cache_sanity_check(const char *name, size_t size)
68{
69 return 0;
70}
71#endif
25 72
26/* 73/*
27 * kmem_cache_create - Create a cache. 74 * kmem_cache_create - Create a cache.
@@ -52,68 +99,95 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
52 unsigned long flags, void (*ctor)(void *)) 99 unsigned long flags, void (*ctor)(void *))
53{ 100{
54 struct kmem_cache *s = NULL; 101 struct kmem_cache *s = NULL;
55 102 int err = 0;
56#ifdef CONFIG_DEBUG_VM
57 if (!name || in_interrupt() || size < sizeof(void *) ||
58 size > KMALLOC_MAX_SIZE) {
59 printk(KERN_ERR "kmem_cache_create(%s) integrity check"
60 " failed\n", name);
61 goto out;
62 }
63#endif
64 103
65 get_online_cpus(); 104 get_online_cpus();
66 mutex_lock(&slab_mutex); 105 mutex_lock(&slab_mutex);
67 106
68#ifdef CONFIG_DEBUG_VM 107 if (!kmem_cache_sanity_check(name, size) == 0)
69 list_for_each_entry(s, &slab_caches, list) { 108 goto out_locked;
70 char tmp;
71 int res;
72 109
73 /*
74 * This happens when the module gets unloaded and doesn't
75 * destroy its slab cache and no-one else reuses the vmalloc
76 * area of the module. Print a warning.
77 */
78 res = probe_kernel_address(s->name, tmp);
79 if (res) {
80 printk(KERN_ERR
81 "Slab cache with size %d has lost its name\n",
82 s->object_size);
83 continue;
84 }
85 110
86 if (!strcmp(s->name, name)) { 111 s = __kmem_cache_alias(name, size, align, flags, ctor);
87 printk(KERN_ERR "kmem_cache_create(%s): Cache name" 112 if (s)
88 " already exists.\n", 113 goto out_locked;
89 name); 114
90 dump_stack(); 115 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
91 s = NULL; 116 if (s) {
92 goto oops; 117 s->object_size = s->size = size;
118 s->align = align;
119 s->ctor = ctor;
120 s->name = kstrdup(name, GFP_KERNEL);
121 if (!s->name) {
122 kmem_cache_free(kmem_cache, s);
123 err = -ENOMEM;
124 goto out_locked;
93 } 125 }
94 }
95 126
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 127 err = __kmem_cache_create(s, flags);
97#endif 128 if (!err) {
98 129
99 s = __kmem_cache_create(name, size, align, flags, ctor); 130 s->refcount = 1;
131 list_add(&s->list, &slab_caches);
100 132
101#ifdef CONFIG_DEBUG_VM 133 } else {
102oops: 134 kfree(s->name);
103#endif 135 kmem_cache_free(kmem_cache, s);
136 }
137 } else
138 err = -ENOMEM;
139
140out_locked:
104 mutex_unlock(&slab_mutex); 141 mutex_unlock(&slab_mutex);
105 put_online_cpus(); 142 put_online_cpus();
106 143
107#ifdef CONFIG_DEBUG_VM 144 if (err) {
108out: 145
109#endif 146 if (flags & SLAB_PANIC)
110 if (!s && (flags & SLAB_PANIC)) 147 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
111 panic("kmem_cache_create: Failed to create slab '%s'\n", name); 148 name, err);
149 else {
150 printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
151 name, err);
152 dump_stack();
153 }
154
155 return NULL;
156 }
112 157
113 return s; 158 return s;
114} 159}
115EXPORT_SYMBOL(kmem_cache_create); 160EXPORT_SYMBOL(kmem_cache_create);
116 161
162void kmem_cache_destroy(struct kmem_cache *s)
163{
164 get_online_cpus();
165 mutex_lock(&slab_mutex);
166 s->refcount--;
167 if (!s->refcount) {
168 list_del(&s->list);
169
170 if (!__kmem_cache_shutdown(s)) {
171 mutex_unlock(&slab_mutex);
172 if (s->flags & SLAB_DESTROY_BY_RCU)
173 rcu_barrier();
174
175 kfree(s->name);
176 kmem_cache_free(kmem_cache, s);
177 } else {
178 list_add(&s->list, &slab_caches);
179 mutex_unlock(&slab_mutex);
180 printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
181 s->name);
182 dump_stack();
183 }
184 } else {
185 mutex_unlock(&slab_mutex);
186 }
187 put_online_cpus();
188}
189EXPORT_SYMBOL(kmem_cache_destroy);
190
117int slab_is_available(void) 191int slab_is_available(void)
118{ 192{
119 return slab_state >= UP; 193 return slab_state >= UP;
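Note on the mm/slab_common.c hunks above: with the CONFIG_DEBUG_VM checks folded into kmem_cache_sanity_check() and the descriptor allocation moved here, the common create path becomes: sanity-check, try __kmem_cache_alias(), otherwise zalloc a descriptor from kmem_cache, fill in the generic fields, and hand it to the allocator's __kmem_cache_create(). Stripped of locking, cpu hotplug and error reporting, the hunk boils down to roughly:

	/* condensed control flow of the new common kmem_cache_create() */
	s = __kmem_cache_alias(name, size, align, flags, ctor);
	if (!s) {
		s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
		s->object_size = s->size = size;
		s->align = align;
		s->ctor = ctor;
		s->name = kstrdup(name, GFP_KERNEL);
		if (!__kmem_cache_create(s, flags)) {
			s->refcount = 1;
			list_add(&s->list, &slab_caches);
		}
	}

The new kmem_cache_destroy() is the mirror image: drop the refcount, call __kmem_cache_shutdown(), and only then free the name and the descriptor, after an rcu_barrier() for SLAB_DESTROY_BY_RCU caches.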
diff --git a/mm/slob.c b/mm/slob.c
index 45d4ca79933a..1e921c5e9576 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
194 void *page; 194 void *page;
195 195
196#ifdef CONFIG_NUMA 196#ifdef CONFIG_NUMA
197 if (node != -1) 197 if (node != NUMA_NO_NODE)
198 page = alloc_pages_exact_node(node, gfp, order); 198 page = alloc_pages_exact_node(node, gfp, order);
199 else 199 else
200#endif 200#endif
@@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
290 * If there's a node specification, search for a partial 290 * If there's a node specification, search for a partial
291 * page with a matching node id in the freelist. 291 * page with a matching node id in the freelist.
292 */ 292 */
293 if (node != -1 && page_to_nid(sp) != node) 293 if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
294 continue; 294 continue;
295#endif 295#endif
296 /* Enough room on this page? */ 296 /* Enough room on this page? */
@@ -425,10 +425,11 @@ out:
425 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. 425 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
426 */ 426 */
427 427
428void *__kmalloc_node(size_t size, gfp_t gfp, int node) 428static __always_inline void *
429__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
429{ 430{
430 unsigned int *m; 431 unsigned int *m;
431 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 432 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
432 void *ret; 433 void *ret;
433 434
434 gfp &= gfp_allowed_mask; 435 gfp &= gfp_allowed_mask;
@@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
446 *m = size; 447 *m = size;
447 ret = (void *)m + align; 448 ret = (void *)m + align;
448 449
449 trace_kmalloc_node(_RET_IP_, ret, 450 trace_kmalloc_node(caller, ret,
450 size, size + align, gfp, node); 451 size, size + align, gfp, node);
451 } else { 452 } else {
452 unsigned int order = get_order(size); 453 unsigned int order = get_order(size);
@@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
460 page->private = size; 461 page->private = size;
461 } 462 }
462 463
463 trace_kmalloc_node(_RET_IP_, ret, 464 trace_kmalloc_node(caller, ret,
464 size, PAGE_SIZE << order, gfp, node); 465 size, PAGE_SIZE << order, gfp, node);
465 } 466 }
466 467
467 kmemleak_alloc(ret, size, 1, gfp); 468 kmemleak_alloc(ret, size, 1, gfp);
468 return ret; 469 return ret;
469} 470}
471
472void *__kmalloc_node(size_t size, gfp_t gfp, int node)
473{
474 return __do_kmalloc_node(size, gfp, node, _RET_IP_);
475}
470EXPORT_SYMBOL(__kmalloc_node); 476EXPORT_SYMBOL(__kmalloc_node);
471 477
478#ifdef CONFIG_TRACING
479void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
480{
481 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
482}
483
484#ifdef CONFIG_NUMA
485void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
486 int node, unsigned long caller)
487{
488 return __do_kmalloc_node(size, gfp, node, caller);
489}
490#endif
491#endif
492
472void kfree(const void *block) 493void kfree(const void *block)
473{ 494{
474 struct page *sp; 495 struct page *sp;
@@ -481,7 +502,7 @@ void kfree(const void *block)
481 502
482 sp = virt_to_page(block); 503 sp = virt_to_page(block);
483 if (PageSlab(sp)) { 504 if (PageSlab(sp)) {
484 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 505 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
485 unsigned int *m = (unsigned int *)(block - align); 506 unsigned int *m = (unsigned int *)(block - align);
486 slob_free(m, *m + align); 507 slob_free(m, *m + align);
487 } else 508 } else
@@ -500,7 +521,7 @@ size_t ksize(const void *block)
500 521
501 sp = virt_to_page(block); 522 sp = virt_to_page(block);
502 if (PageSlab(sp)) { 523 if (PageSlab(sp)) {
503 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 524 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
504 unsigned int *m = (unsigned int *)(block - align); 525 unsigned int *m = (unsigned int *)(block - align);
505 return SLOB_UNITS(*m) * SLOB_UNIT; 526 return SLOB_UNITS(*m) * SLOB_UNIT;
506 } else 527 } else
@@ -508,44 +529,24 @@ size_t ksize(const void *block)
508} 529}
509EXPORT_SYMBOL(ksize); 530EXPORT_SYMBOL(ksize);
510 531
511struct kmem_cache *__kmem_cache_create(const char *name, size_t size, 532int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
512 size_t align, unsigned long flags, void (*ctor)(void *))
513{ 533{
514 struct kmem_cache *c; 534 size_t align = c->size;
515
516 c = slob_alloc(sizeof(struct kmem_cache),
517 GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
518 535
519 if (c) { 536 if (flags & SLAB_DESTROY_BY_RCU) {
520 c->name = name; 537 /* leave room for rcu footer at the end of object */
521 c->size = size; 538 c->size += sizeof(struct slob_rcu);
522 if (flags & SLAB_DESTROY_BY_RCU) {
523 /* leave room for rcu footer at the end of object */
524 c->size += sizeof(struct slob_rcu);
525 }
526 c->flags = flags;
527 c->ctor = ctor;
528 /* ignore alignment unless it's forced */
529 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
530 if (c->align < ARCH_SLAB_MINALIGN)
531 c->align = ARCH_SLAB_MINALIGN;
532 if (c->align < align)
533 c->align = align;
534
535 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
536 c->refcount = 1;
537 } 539 }
538 return c; 540 c->flags = flags;
539} 541 /* ignore alignment unless it's forced */
542 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
543 if (c->align < ARCH_SLAB_MINALIGN)
544 c->align = ARCH_SLAB_MINALIGN;
545 if (c->align < align)
546 c->align = align;
540 547
541void kmem_cache_destroy(struct kmem_cache *c) 548 return 0;
542{
543 kmemleak_free(c);
544 if (c->flags & SLAB_DESTROY_BY_RCU)
545 rcu_barrier();
546 slob_free(c, sizeof(struct kmem_cache));
547} 549}
548EXPORT_SYMBOL(kmem_cache_destroy);
549 550
550void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) 551void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
551{ 552{
@@ -613,14 +614,28 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
613} 614}
614EXPORT_SYMBOL(kmem_cache_size); 615EXPORT_SYMBOL(kmem_cache_size);
615 616
617int __kmem_cache_shutdown(struct kmem_cache *c)
618{
619 /* No way to check for remaining objects */
620 return 0;
621}
622
616int kmem_cache_shrink(struct kmem_cache *d) 623int kmem_cache_shrink(struct kmem_cache *d)
617{ 624{
618 return 0; 625 return 0;
619} 626}
620EXPORT_SYMBOL(kmem_cache_shrink); 627EXPORT_SYMBOL(kmem_cache_shrink);
621 628
629struct kmem_cache kmem_cache_boot = {
630 .name = "kmem_cache",
631 .size = sizeof(struct kmem_cache),
632 .flags = SLAB_PANIC,
633 .align = ARCH_KMALLOC_MINALIGN,
634};
635
622void __init kmem_cache_init(void) 636void __init kmem_cache_init(void)
623{ 637{
638 kmem_cache = &kmem_cache_boot;
624 slab_state = UP; 639 slab_state = UP;
625} 640}
626 641
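Note on the mm/slob.c hunks above: SLOB follows the same pattern. NUMA_NO_NODE replaces the bare -1, a statically allocated kmem_cache_boot is installed at init so kmem_cache_zalloc(kmem_cache, ...) works before any dynamic caches exist, and the kmalloc paths share one __do_kmalloc_node() helper that carries an explicit caller. The helper pattern, as in the hunks:

	/* sketch: every SLOB kmalloc entry point funnels into one helper */
	void *__kmalloc_node(size_t size, gfp_t gfp, int node)
	{
		return __do_kmalloc_node(size, gfp, node, _RET_IP_);
	}

	#ifdef CONFIG_TRACING
	void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
	{
		return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
	}
	#endif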
diff --git a/mm/slub.c b/mm/slub.c
index 2fdd96f9e998..a0d698467f70 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -210,11 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *);
210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
212 { return 0; } 212 { return 0; }
213static inline void sysfs_slab_remove(struct kmem_cache *s) 213static inline void sysfs_slab_remove(struct kmem_cache *s) { }
214{
215 kfree(s->name);
216 kfree(s);
217}
218 214
219#endif 215#endif
220 216
@@ -568,6 +564,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
568 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 564 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
569 printk(KERN_ERR "----------------------------------------" 565 printk(KERN_ERR "----------------------------------------"
570 "-------------------------------------\n\n"); 566 "-------------------------------------\n\n");
567
568 add_taint(TAINT_BAD_PAGE);
571} 569}
572 570
573static void slab_fix(struct kmem_cache *s, char *fmt, ...) 571static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -624,7 +622,7 @@ static void object_err(struct kmem_cache *s, struct page *page,
624 print_trailer(s, page, object); 622 print_trailer(s, page, object);
625} 623}
626 624
627static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) 625static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...)
628{ 626{
629 va_list args; 627 va_list args;
630 char buf[100]; 628 char buf[100];
@@ -1069,13 +1067,13 @@ bad:
1069 return 0; 1067 return 0;
1070} 1068}
1071 1069
1072static noinline int free_debug_processing(struct kmem_cache *s, 1070static noinline struct kmem_cache_node *free_debug_processing(
1073 struct page *page, void *object, unsigned long addr) 1071 struct kmem_cache *s, struct page *page, void *object,
1072 unsigned long addr, unsigned long *flags)
1074{ 1073{
1075 unsigned long flags; 1074 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1076 int rc = 0;
1077 1075
1078 local_irq_save(flags); 1076 spin_lock_irqsave(&n->list_lock, *flags);
1079 slab_lock(page); 1077 slab_lock(page);
1080 1078
1081 if (!check_slab(s, page)) 1079 if (!check_slab(s, page))
@@ -1113,15 +1111,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1113 set_track(s, object, TRACK_FREE, addr); 1111 set_track(s, object, TRACK_FREE, addr);
1114 trace(s, page, object, 0); 1112 trace(s, page, object, 0);
1115 init_object(s, object, SLUB_RED_INACTIVE); 1113 init_object(s, object, SLUB_RED_INACTIVE);
1116 rc = 1;
1117out: 1114out:
1118 slab_unlock(page); 1115 slab_unlock(page);
1119 local_irq_restore(flags); 1116 /*
1120 return rc; 1117 * Keep node_lock to preserve integrity
1118 * until the object is actually freed
1119 */
1120 return n;
1121 1121
1122fail: 1122fail:
1123 slab_unlock(page);
1124 spin_unlock_irqrestore(&n->list_lock, *flags);
1123 slab_fix(s, "Object at 0x%p not freed", object); 1125 slab_fix(s, "Object at 0x%p not freed", object);
1124 goto out; 1126 return NULL;
1125} 1127}
1126 1128
1127static int __init setup_slub_debug(char *str) 1129static int __init setup_slub_debug(char *str)
@@ -1214,8 +1216,9 @@ static inline void setup_object_debug(struct kmem_cache *s,
1214static inline int alloc_debug_processing(struct kmem_cache *s, 1216static inline int alloc_debug_processing(struct kmem_cache *s,
1215 struct page *page, void *object, unsigned long addr) { return 0; } 1217 struct page *page, void *object, unsigned long addr) { return 0; }
1216 1218
1217static inline int free_debug_processing(struct kmem_cache *s, 1219static inline struct kmem_cache_node *free_debug_processing(
1218 struct page *page, void *object, unsigned long addr) { return 0; } 1220 struct kmem_cache *s, struct page *page, void *object,
1221 unsigned long addr, unsigned long *flags) { return NULL; }
1219 1222
1220static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1223static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1221 { return 1; } 1224 { return 1; }
@@ -1714,7 +1717,7 @@ static inline void note_cmpxchg_failure(const char *n,
1714 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1717 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1715} 1718}
1716 1719
1717void init_kmem_cache_cpus(struct kmem_cache *s) 1720static void init_kmem_cache_cpus(struct kmem_cache *s)
1718{ 1721{
1719 int cpu; 1722 int cpu;
1720 1723
@@ -1939,7 +1942,7 @@ static void unfreeze_partials(struct kmem_cache *s)
1939 * If we did not find a slot then simply move all the partials to the 1942 * If we did not find a slot then simply move all the partials to the
1940 * per node partial list. 1943 * per node partial list.
1941 */ 1944 */
1942int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1945static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1943{ 1946{
1944 struct page *oldpage; 1947 struct page *oldpage;
1945 int pages; 1948 int pages;
@@ -1962,6 +1965,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1962 local_irq_save(flags); 1965 local_irq_save(flags);
1963 unfreeze_partials(s); 1966 unfreeze_partials(s);
1964 local_irq_restore(flags); 1967 local_irq_restore(flags);
1968 oldpage = NULL;
1965 pobjects = 0; 1969 pobjects = 0;
1966 pages = 0; 1970 pages = 0;
1967 stat(s, CPU_PARTIAL_DRAIN); 1971 stat(s, CPU_PARTIAL_DRAIN);
@@ -2310,7 +2314,7 @@ new_slab:
2310 * 2314 *
2311 * Otherwise we can simply pick the next object from the lockless free list. 2315 * Otherwise we can simply pick the next object from the lockless free list.
2312 */ 2316 */
2313static __always_inline void *slab_alloc(struct kmem_cache *s, 2317static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2314 gfp_t gfpflags, int node, unsigned long addr) 2318 gfp_t gfpflags, int node, unsigned long addr)
2315{ 2319{
2316 void **object; 2320 void **object;
@@ -2380,9 +2384,15 @@ redo:
2380 return object; 2384 return object;
2381} 2385}
2382 2386
2387static __always_inline void *slab_alloc(struct kmem_cache *s,
2388 gfp_t gfpflags, unsigned long addr)
2389{
2390 return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2391}
2392
2383void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2393void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2384{ 2394{
2385 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2395 void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2386 2396
2387 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); 2397 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2388 2398
@@ -2393,7 +2403,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2393#ifdef CONFIG_TRACING 2403#ifdef CONFIG_TRACING
2394void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2404void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2395{ 2405{
2396 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2406 void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2397 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2407 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2398 return ret; 2408 return ret;
2399} 2409}
@@ -2411,7 +2421,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
2411#ifdef CONFIG_NUMA 2421#ifdef CONFIG_NUMA
2412void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2422void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2413{ 2423{
2414 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2424 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2415 2425
2416 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2426 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2417 s->object_size, s->size, gfpflags, node); 2427 s->object_size, s->size, gfpflags, node);
@@ -2425,7 +2435,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2425 gfp_t gfpflags, 2435 gfp_t gfpflags,
2426 int node, size_t size) 2436 int node, size_t size)
2427{ 2437{
2428 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2438 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
2429 2439
2430 trace_kmalloc_node(_RET_IP_, ret, 2440 trace_kmalloc_node(_RET_IP_, ret,
2431 size, s->size, gfpflags, node); 2441 size, s->size, gfpflags, node);
@@ -2457,7 +2467,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2457 2467
2458 stat(s, FREE_SLOWPATH); 2468 stat(s, FREE_SLOWPATH);
2459 2469
2460 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2470 if (kmem_cache_debug(s) &&
2471 !(n = free_debug_processing(s, page, x, addr, &flags)))
2461 return; 2472 return;
2462 2473
2463 do { 2474 do {
@@ -2612,6 +2623,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
2612 2623
2613 page = virt_to_head_page(x); 2624 page = virt_to_head_page(x);
2614 2625
2626 if (kmem_cache_debug(s) && page->slab != s) {
2627 pr_err("kmem_cache_free: Wrong slab cache. %s but object"
2628 " is from %s\n", page->slab->name, s->name);
2629 WARN_ON_ONCE(1);
2630 return;
2631 }
2632
2615 slab_free(s, page, x, _RET_IP_); 2633 slab_free(s, page, x, _RET_IP_);
2616 2634
2617 trace_kmem_cache_free(_RET_IP_, x); 2635 trace_kmem_cache_free(_RET_IP_, x);
@@ -3026,17 +3044,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3026 3044
3027} 3045}
3028 3046
3029static int kmem_cache_open(struct kmem_cache *s, 3047static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3030 const char *name, size_t size,
3031 size_t align, unsigned long flags,
3032 void (*ctor)(void *))
3033{ 3048{
3034 memset(s, 0, kmem_size); 3049 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3035 s->name = name;
3036 s->ctor = ctor;
3037 s->object_size = size;
3038 s->align = align;
3039 s->flags = kmem_cache_flags(size, flags, name, ctor);
3040 s->reserved = 0; 3050 s->reserved = 0;
3041 3051
3042 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3052 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
@@ -3098,7 +3108,6 @@ static int kmem_cache_open(struct kmem_cache *s,
3098 else 3108 else
3099 s->cpu_partial = 30; 3109 s->cpu_partial = 30;
3100 3110
3101 s->refcount = 1;
3102#ifdef CONFIG_NUMA 3111#ifdef CONFIG_NUMA
3103 s->remote_node_defrag_ratio = 1000; 3112 s->remote_node_defrag_ratio = 1000;
3104#endif 3113#endif
@@ -3106,16 +3115,16 @@ static int kmem_cache_open(struct kmem_cache *s,
3106 goto error; 3115 goto error;
3107 3116
3108 if (alloc_kmem_cache_cpus(s)) 3117 if (alloc_kmem_cache_cpus(s))
3109 return 1; 3118 return 0;
3110 3119
3111 free_kmem_cache_nodes(s); 3120 free_kmem_cache_nodes(s);
3112error: 3121error:
3113 if (flags & SLAB_PANIC) 3122 if (flags & SLAB_PANIC)
3114 panic("Cannot create slab %s size=%lu realsize=%u " 3123 panic("Cannot create slab %s size=%lu realsize=%u "
3115 "order=%u offset=%u flags=%lx\n", 3124 "order=%u offset=%u flags=%lx\n",
3116 s->name, (unsigned long)size, s->size, oo_order(s->oo), 3125 s->name, (unsigned long)s->size, s->size, oo_order(s->oo),
3117 s->offset, flags); 3126 s->offset, flags);
3118 return 0; 3127 return -EINVAL;
3119} 3128}
3120 3129
3121/* 3130/*
@@ -3137,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3137 sizeof(long), GFP_ATOMIC); 3146 sizeof(long), GFP_ATOMIC);
3138 if (!map) 3147 if (!map)
3139 return; 3148 return;
3140 slab_err(s, page, "%s", text); 3149 slab_err(s, page, text, s->name);
3141 slab_lock(page); 3150 slab_lock(page);
3142 3151
3143 get_map(s, page, map); 3152 get_map(s, page, map);
@@ -3169,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3169 discard_slab(s, page); 3178 discard_slab(s, page);
3170 } else { 3179 } else {
3171 list_slab_objects(s, page, 3180 list_slab_objects(s, page,
3172 "Objects remaining on kmem_cache_close()"); 3181 "Objects remaining in %s on kmem_cache_close()");
3173 } 3182 }
3174 } 3183 }
3175} 3184}
@@ -3182,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3182 int node; 3191 int node;
3183 3192
3184 flush_all(s); 3193 flush_all(s);
3185 free_percpu(s->cpu_slab);
3186 /* Attempt to free all objects */ 3194 /* Attempt to free all objects */
3187 for_each_node_state(node, N_NORMAL_MEMORY) { 3195 for_each_node_state(node, N_NORMAL_MEMORY) {
3188 struct kmem_cache_node *n = get_node(s, node); 3196 struct kmem_cache_node *n = get_node(s, node);
@@ -3191,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3191 if (n->nr_partial || slabs_node(s, node)) 3199 if (n->nr_partial || slabs_node(s, node))
3192 return 1; 3200 return 1;
3193 } 3201 }
3202 free_percpu(s->cpu_slab);
3194 free_kmem_cache_nodes(s); 3203 free_kmem_cache_nodes(s);
3195 return 0; 3204 return 0;
3196} 3205}
3197 3206
3198/* 3207int __kmem_cache_shutdown(struct kmem_cache *s)
3199 * Close a cache and release the kmem_cache structure
3200 * (must be used for caches created using kmem_cache_create)
3201 */
3202void kmem_cache_destroy(struct kmem_cache *s)
3203{ 3208{
3204 mutex_lock(&slab_mutex); 3209 int rc = kmem_cache_close(s);
3205 s->refcount--; 3210
3206 if (!s->refcount) { 3211 if (!rc)
3207 list_del(&s->list);
3208 mutex_unlock(&slab_mutex);
3209 if (kmem_cache_close(s)) {
3210 printk(KERN_ERR "SLUB %s: %s called for cache that "
3211 "still has objects.\n", s->name, __func__);
3212 dump_stack();
3213 }
3214 if (s->flags & SLAB_DESTROY_BY_RCU)
3215 rcu_barrier();
3216 sysfs_slab_remove(s); 3212 sysfs_slab_remove(s);
3217 } else 3213
3218 mutex_unlock(&slab_mutex); 3214 return rc;
3219} 3215}
3220EXPORT_SYMBOL(kmem_cache_destroy);
3221 3216
3222/******************************************************************** 3217/********************************************************************
3223 * Kmalloc subsystem 3218 * Kmalloc subsystem
@@ -3226,8 +3221,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
3226struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 3221struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
3227EXPORT_SYMBOL(kmalloc_caches); 3222EXPORT_SYMBOL(kmalloc_caches);
3228 3223
3229static struct kmem_cache *kmem_cache;
3230
3231#ifdef CONFIG_ZONE_DMA 3224#ifdef CONFIG_ZONE_DMA
3232static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 3225static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
3233#endif 3226#endif
@@ -3273,14 +3266,17 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3273{ 3266{
3274 struct kmem_cache *s; 3267 struct kmem_cache *s;
3275 3268
3276 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3269 s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3270
3271 s->name = name;
3272 s->size = s->object_size = size;
3273 s->align = ARCH_KMALLOC_MINALIGN;
3277 3274
3278 /* 3275 /*
3279 * This function is called with IRQs disabled during early-boot on 3276 * This function is called with IRQs disabled during early-boot on
3280 * single CPU so there's no need to take slab_mutex here. 3277 * single CPU so there's no need to take slab_mutex here.
3281 */ 3278 */
3282 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3279 if (kmem_cache_open(s, flags))
3283 flags, NULL))
3284 goto panic; 3280 goto panic;
3285 3281
3286 list_add(&s->list, &slab_caches); 3282 list_add(&s->list, &slab_caches);
@@ -3362,7 +3358,7 @@ void *__kmalloc(size_t size, gfp_t flags)
3362 if (unlikely(ZERO_OR_NULL_PTR(s))) 3358 if (unlikely(ZERO_OR_NULL_PTR(s)))
3363 return s; 3359 return s;
3364 3360
3365 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); 3361 ret = slab_alloc(s, flags, _RET_IP_);
3366 3362
3367 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3363 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3368 3364
@@ -3405,7 +3401,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3405 if (unlikely(ZERO_OR_NULL_PTR(s))) 3401 if (unlikely(ZERO_OR_NULL_PTR(s)))
3406 return s; 3402 return s;
3407 3403
3408 ret = slab_alloc(s, flags, node, _RET_IP_); 3404 ret = slab_alloc_node(s, flags, node, _RET_IP_);
3409 3405
3410 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3406 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3411 3407
@@ -3482,7 +3478,7 @@ void kfree(const void *x)
3482 if (unlikely(!PageSlab(page))) { 3478 if (unlikely(!PageSlab(page))) {
3483 BUG_ON(!PageCompound(page)); 3479 BUG_ON(!PageCompound(page));
3484 kmemleak_free(x); 3480 kmemleak_free(x);
3485 put_page(page); 3481 __free_pages(page, compound_order(page));
3486 return; 3482 return;
3487 } 3483 }
3488 slab_free(page->slab, page, object, _RET_IP_); 3484 slab_free(page->slab, page, object, _RET_IP_);
@@ -3719,12 +3715,12 @@ void __init kmem_cache_init(void)
3719 slub_max_order = 0; 3715 slub_max_order = 0;
3720 3716
3721 kmem_size = offsetof(struct kmem_cache, node) + 3717 kmem_size = offsetof(struct kmem_cache, node) +
3722 nr_node_ids * sizeof(struct kmem_cache_node *); 3718 nr_node_ids * sizeof(struct kmem_cache_node *);
3723 3719
3724 /* Allocate two kmem_caches from the page allocator */ 3720 /* Allocate two kmem_caches from the page allocator */
3725 kmalloc_size = ALIGN(kmem_size, cache_line_size()); 3721 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3726 order = get_order(2 * kmalloc_size); 3722 order = get_order(2 * kmalloc_size);
3727 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); 3723 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
3728 3724
3729 /* 3725 /*
3730 * Must first have the slab cache available for the allocations of the 3726 * Must first have the slab cache available for the allocations of the
@@ -3733,9 +3729,10 @@ void __init kmem_cache_init(void)
3733 */ 3729 */
3734 kmem_cache_node = (void *)kmem_cache + kmalloc_size; 3730 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3735 3731
3736 kmem_cache_open(kmem_cache_node, "kmem_cache_node", 3732 kmem_cache_node->name = "kmem_cache_node";
3737 sizeof(struct kmem_cache_node), 3733 kmem_cache_node->size = kmem_cache_node->object_size =
3738 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3734 sizeof(struct kmem_cache_node);
3735 kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3739 3736
3740 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3737 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3741 3738
@@ -3743,8 +3740,10 @@ void __init kmem_cache_init(void)
3743 slab_state = PARTIAL; 3740 slab_state = PARTIAL;
3744 3741
3745 temp_kmem_cache = kmem_cache; 3742 temp_kmem_cache = kmem_cache;
3746 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, 3743 kmem_cache->name = "kmem_cache";
3747 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); 3744 kmem_cache->size = kmem_cache->object_size = kmem_size;
3745 kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3746
3748 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3747 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3749 memcpy(kmem_cache, temp_kmem_cache, kmem_size); 3748 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3750 3749
@@ -3933,11 +3932,10 @@ static struct kmem_cache *find_mergeable(size_t size,
3933 return NULL; 3932 return NULL;
3934} 3933}
3935 3934
3936struct kmem_cache *__kmem_cache_create(const char *name, size_t size, 3935struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
3937 size_t align, unsigned long flags, void (*ctor)(void *)) 3936 size_t align, unsigned long flags, void (*ctor)(void *))
3938{ 3937{
3939 struct kmem_cache *s; 3938 struct kmem_cache *s;
3940 char *n;
3941 3939
3942 s = find_mergeable(size, align, flags, name, ctor); 3940 s = find_mergeable(size, align, flags, name, ctor);
3943 if (s) { 3941 if (s) {
@@ -3951,36 +3949,29 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
3951 3949
3952 if (sysfs_slab_alias(s, name)) { 3950 if (sysfs_slab_alias(s, name)) {
3953 s->refcount--; 3951 s->refcount--;
3954 return NULL; 3952 s = NULL;
3955 } 3953 }
3956 return s;
3957 } 3954 }
3958 3955
3959 n = kstrdup(name, GFP_KERNEL); 3956 return s;
3960 if (!n) 3957}
3961 return NULL;
3962 3958
3963 s = kmalloc(kmem_size, GFP_KERNEL); 3959int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3964 if (s) { 3960{
3965 if (kmem_cache_open(s, n, 3961 int err;
3966 size, align, flags, ctor)) {
3967 int r;
3968 3962
3969 list_add(&s->list, &slab_caches); 3963 err = kmem_cache_open(s, flags);
3970 mutex_unlock(&slab_mutex); 3964 if (err)
3971 r = sysfs_slab_add(s); 3965 return err;
3972 mutex_lock(&slab_mutex);
3973 3966
3974 if (!r) 3967 mutex_unlock(&slab_mutex);
3975 return s; 3968 err = sysfs_slab_add(s);
3969 mutex_lock(&slab_mutex);
3976 3970
3977 list_del(&s->list); 3971 if (err)
3978 kmem_cache_close(s); 3972 kmem_cache_close(s);
3979 } 3973
3980 kfree(s); 3974 return err;
3981 }
3982 kfree(n);
3983 return NULL;
3984} 3975}
3985 3976
3986#ifdef CONFIG_SMP 3977#ifdef CONFIG_SMP
@@ -4033,7 +4024,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
4033 if (unlikely(ZERO_OR_NULL_PTR(s))) 4024 if (unlikely(ZERO_OR_NULL_PTR(s)))
4034 return s; 4025 return s;
4035 4026
4036 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 4027 ret = slab_alloc(s, gfpflags, caller);
4037 4028
4038 /* Honor the call site pointer we received. */ 4029 /* Honor the call site pointer we received. */
4039 trace_kmalloc(caller, ret, size, s->size, gfpflags); 4030 trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4063,7 +4054,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
4063 if (unlikely(ZERO_OR_NULL_PTR(s))) 4054 if (unlikely(ZERO_OR_NULL_PTR(s)))
4064 return s; 4055 return s;
4065 4056
4066 ret = slab_alloc(s, gfpflags, node, caller); 4057 ret = slab_alloc_node(s, gfpflags, node, caller);
4067 4058
4068 /* Honor the call site pointer we received. */ 4059 /* Honor the call site pointer we received. */
4069 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 4060 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -5210,14 +5201,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5210 return err; 5201 return err;
5211} 5202}
5212 5203
5213static void kmem_cache_release(struct kobject *kobj)
5214{
5215 struct kmem_cache *s = to_slab(kobj);
5216
5217 kfree(s->name);
5218 kfree(s);
5219}
5220
5221static const struct sysfs_ops slab_sysfs_ops = { 5204static const struct sysfs_ops slab_sysfs_ops = {
5222 .show = slab_attr_show, 5205 .show = slab_attr_show,
5223 .store = slab_attr_store, 5206 .store = slab_attr_store,
@@ -5225,7 +5208,6 @@ static const struct sysfs_ops slab_sysfs_ops = {
5225 5208
5226static struct kobj_type slab_ktype = { 5209static struct kobj_type slab_ktype = {
5227 .sysfs_ops = &slab_sysfs_ops, 5210 .sysfs_ops = &slab_sysfs_ops,
5228 .release = kmem_cache_release
5229}; 5211};
5230 5212
5231static int uevent_filter(struct kset *kset, struct kobject *kobj) 5213static int uevent_filter(struct kset *kset, struct kobject *kobj)
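Note on the mm/slub.c hunks above: cache lifetime is no longer SLUB's business. kmem_cache_destroy(), the sysfs kmem_cache_release() hook and the file-private kmem_cache pointer are removed, kmem_cache_open() takes only flags because the generic fields are pre-filled by slab_common, and the shutdown hook reduces to closing the cache and dropping its sysfs entry while slab_common frees the name and descriptor. Condensed from the hunk:

	/* sketch: SLUB's shutdown hook after the rework */
	int __kmem_cache_shutdown(struct kmem_cache *s)
	{
		int rc = kmem_cache_close(s);	/* non-zero if objects remain */

		if (!rc)
			sysfs_slab_remove(s);
		return rc;
	}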
diff --git a/mm/swap.c b/mm/swap.c
index 77825883298f..6310dc2008ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
446} 446}
447EXPORT_SYMBOL(mark_page_accessed); 447EXPORT_SYMBOL(mark_page_accessed);
448 448
449/*
450 * Order of operations is important: flush the pagevec when it's already
451 * full, not when adding the last page, to make sure that last page is
452 * not added to the LRU directly when passed to this function. Because
453 * mark_page_accessed() (called after this when writing) only activates
454 * pages that are on the LRU, linear writes in subpage chunks would see
455 * every PAGEVEC_SIZE page activated, which is unexpected.
456 */
449void __lru_cache_add(struct page *page, enum lru_list lru) 457void __lru_cache_add(struct page *page, enum lru_list lru)
450{ 458{
451 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 459 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
452 460
453 page_cache_get(page); 461 page_cache_get(page);
454 if (!pagevec_add(pvec, page)) 462 if (!pagevec_space(pvec))
455 __pagevec_lru_add(pvec, lru); 463 __pagevec_lru_add(pvec, lru);
464 pagevec_add(pvec, page);
456 put_cpu_var(lru_add_pvecs); 465 put_cpu_var(lru_add_pvecs);
457} 466}
458EXPORT_SYMBOL(__lru_cache_add); 467EXPORT_SYMBOL(__lru_cache_add);
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
742 751
743 SetPageLRU(page_tail); 752 SetPageLRU(page_tail);
744 753
745 if (page_evictable(page_tail, NULL)) { 754 if (page_evictable(page_tail)) {
746 if (PageActive(page)) { 755 if (PageActive(page)) {
747 SetPageActive(page_tail); 756 SetPageActive(page_tail);
748 active = 1; 757 active = 1;
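Note on the mm/swap.c hunk above: the __lru_cache_add() change looks cosmetic but is not. The per-cpu pagevec is drained when it is already full, before the new page is stashed, so the page being added in this call never goes straight onto the LRU. As the new comment explains, mark_page_accessed() (called right after this on the write path) only activates pages already on the LRU, so the old ordering activated every PAGEVEC_SIZE-th page during linear sub-page writes. The resulting code:

	void __lru_cache_add(struct page *page, enum lru_list lru)
	{
		struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

		page_cache_get(page);
		if (!pagevec_space(pvec))	/* drain first, while the new page is not in it */
			__pagevec_lru_add(pvec, lru);
		pagevec_add(pvec, page);	/* then stash the page for a later drain */
		put_cpu_var(lru_add_pvecs);
	}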
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14e254c768fc..71cd288b2001 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1483,7 +1483,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1483 struct file *swap_file, *victim; 1483 struct file *swap_file, *victim;
1484 struct address_space *mapping; 1484 struct address_space *mapping;
1485 struct inode *inode; 1485 struct inode *inode;
1486 char *pathname; 1486 struct filename *pathname;
1487 int oom_score_adj; 1487 int oom_score_adj;
1488 int i, type, prev; 1488 int i, type, prev;
1489 int err; 1489 int err;
@@ -1498,8 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1498 if (IS_ERR(pathname)) 1498 if (IS_ERR(pathname))
1499 goto out; 1499 goto out;
1500 1500
1501 victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 1501 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1502 putname(pathname);
1503 err = PTR_ERR(victim); 1502 err = PTR_ERR(victim);
1504 if (IS_ERR(victim)) 1503 if (IS_ERR(victim))
1505 goto out; 1504 goto out;
@@ -1936,7 +1935,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
1936SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 1935SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1937{ 1936{
1938 struct swap_info_struct *p; 1937 struct swap_info_struct *p;
1939 char *name; 1938 struct filename *name;
1940 struct file *swap_file = NULL; 1939 struct file *swap_file = NULL;
1941 struct address_space *mapping; 1940 struct address_space *mapping;
1942 int i; 1941 int i;
@@ -1967,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1967 name = NULL; 1966 name = NULL;
1968 goto bad_swap; 1967 goto bad_swap;
1969 } 1968 }
1970 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); 1969 swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
1971 if (IS_ERR(swap_file)) { 1970 if (IS_ERR(swap_file)) {
1972 error = PTR_ERR(swap_file); 1971 error = PTR_ERR(swap_file);
1973 swap_file = NULL; 1972 swap_file = NULL;
@@ -2053,7 +2052,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2053 2052
2054 printk(KERN_INFO "Adding %uk swap on %s. " 2053 printk(KERN_INFO "Adding %uk swap on %s. "
2055 "Priority:%d extents:%d across:%lluk %s%s%s\n", 2054 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2056 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2055 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2057 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2056 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2058 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2057 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2059 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2058 (p->flags & SWP_DISCARDABLE) ? "D" : "",
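Note on the mm/swapfile.c hunks above: sys_swapon()/sys_swapoff() now keep a struct filename instead of a char *, open it with file_open_name(), and use name->name wherever the string itself is printed. A rough sketch of the open path, assuming the name is obtained with getname() earlier in the syscall (that part is outside the hunks shown):

	/* sketch: opening the swapoff victim through struct filename */
	struct filename *pathname;	/* assumed: returned by getname(specialfile) earlier */
	struct file *victim;

	victim = file_open_name(pathname, O_RDWR | O_LARGEFILE, 0);
	if (IS_ERR(victim))
		goto out;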
diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac7..d51ce92d6e83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
107 107
108 cancel_dirty_page(page, PAGE_CACHE_SIZE); 108 cancel_dirty_page(page, PAGE_CACHE_SIZE);
109 109
110 clear_page_mlock(page);
111 ClearPageMappedToDisk(page); 110 ClearPageMappedToDisk(page);
112 delete_from_page_cache(page); 111 delete_from_page_cache(page);
113 return 0; 112 return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
132 if (page_has_private(page) && !try_to_release_page(page, 0)) 131 if (page_has_private(page) && !try_to_release_page(page, 0))
133 return 0; 132 return 0;
134 133
135 clear_page_mlock(page);
136 ret = remove_mapping(mapping, page); 134 ret = remove_mapping(mapping, page);
137 135
138 return ret; 136 return ret;
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
398 if (PageDirty(page)) 396 if (PageDirty(page))
399 goto failed; 397 goto failed;
400 398
401 clear_page_mlock(page);
402 BUG_ON(page_has_private(page)); 399 BUG_ON(page_has_private(page));
403 __delete_from_page_cache(page); 400 __delete_from_page_cache(page);
404 spin_unlock_irq(&mapping->tree_lock); 401 spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/util.c b/mm/util.c
index 3a5278c08d76..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len)
105} 105}
106EXPORT_SYMBOL(memdup_user); 106EXPORT_SYMBOL(memdup_user);
107 107
108static __always_inline void *__do_krealloc(const void *p, size_t new_size,
109 gfp_t flags)
110{
111 void *ret;
112 size_t ks = 0;
113
114 if (p)
115 ks = ksize(p);
116
117 if (ks >= new_size)
118 return (void *)p;
119
120 ret = kmalloc_track_caller(new_size, flags);
121 if (ret && p)
122 memcpy(ret, p, ks);
123
124 return ret;
125}
126
108/** 127/**
109 * __krealloc - like krealloc() but don't free @p. 128 * __krealloc - like krealloc() but don't free @p.
110 * @p: object to reallocate memory for. 129 * @p: object to reallocate memory for.
@@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user);
117 */ 136 */
118void *__krealloc(const void *p, size_t new_size, gfp_t flags) 137void *__krealloc(const void *p, size_t new_size, gfp_t flags)
119{ 138{
120 void *ret;
121 size_t ks = 0;
122
123 if (unlikely(!new_size)) 139 if (unlikely(!new_size))
124 return ZERO_SIZE_PTR; 140 return ZERO_SIZE_PTR;
125 141
126 if (p) 142 return __do_krealloc(p, new_size, flags);
127 ks = ksize(p);
128 143
129 if (ks >= new_size)
130 return (void *)p;
131
132 ret = kmalloc_track_caller(new_size, flags);
133 if (ret && p)
134 memcpy(ret, p, ks);
135
136 return ret;
137} 144}
138EXPORT_SYMBOL(__krealloc); 145EXPORT_SYMBOL(__krealloc);
139 146
@@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
157 return ZERO_SIZE_PTR; 164 return ZERO_SIZE_PTR;
158 } 165 }
159 166
160 ret = __krealloc(p, new_size, flags); 167 ret = __do_krealloc(p, new_size, flags);
161 if (ret && p != ret) 168 if (ret && p != ret)
162 kfree(p); 169 kfree(p);
163 170
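Note on the mm/util.c hunks above: the body of __krealloc() is split out into an __always_inline __do_krealloc() that both __krealloc() and krealloc() call. Because the helper is inlined, the kmalloc_track_caller() inside it attributes allocations made through krealloc() to krealloc()'s caller instead of to an intermediate __krealloc() frame. The exported wrapper after the change:

	/* __krealloc() after the refactor: only the zero-size check remains here */
	void *__krealloc(const void *p, size_t new_size, gfp_t flags)
	{
		if (unlikely(!new_size))
			return ZERO_SIZE_PTR;

		return __do_krealloc(p, new_size, flags);
	}
	/*
	 * krealloc() likewise calls __do_krealloc() and then frees the old
	 * object if the allocation moved, as in the hunk above.
	 */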
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241c..78e08300db21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2163 usize -= PAGE_SIZE; 2163 usize -= PAGE_SIZE;
2164 } while (usize > 0); 2164 } while (usize > 0);
2165 2165
2166 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 2166 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2167 vma->vm_flags |= VM_RESERVED;
2168 2167
2169 return 0; 2168 return 0;
2170} 2169}
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
2572{ 2571{
2573 struct vm_struct *v = p; 2572 struct vm_struct *v = p;
2574 2573
2575 seq_printf(m, "0x%p-0x%p %7ld", 2574 seq_printf(m, "0x%pK-0x%pK %7ld",
2576 v->addr, v->addr + v->size, v->size); 2575 v->addr, v->addr + v->size, v->size);
2577 2576
2578 if (v->caller) 2577 if (v->caller)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c0..2624edcfb420 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
553redo: 553redo:
554 ClearPageUnevictable(page); 554 ClearPageUnevictable(page);
555 555
556 if (page_evictable(page, NULL)) { 556 if (page_evictable(page)) {
557 /* 557 /*
558 * For evictable pages, we can use the cache. 558 * For evictable pages, we can use the cache.
559 * In event of a race, worst case is we end up with an 559 * In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
587 * page is on unevictable list, it never be freed. To avoid that, 587 * page is on unevictable list, it never be freed. To avoid that,
588 * check after we added it to the list, again. 588 * check after we added it to the list, again.
589 */ 589 */
590 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { 590 if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
591 if (!isolate_lru_page(page)) { 591 if (!isolate_lru_page(page)) {
592 put_page(page); 592 put_page(page);
593 goto redo; 593 goto redo;
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
674static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
675 struct zone *zone, 675 struct zone *zone,
676 struct scan_control *sc, 676 struct scan_control *sc,
677 enum ttu_flags ttu_flags,
677 unsigned long *ret_nr_dirty, 678 unsigned long *ret_nr_dirty,
678 unsigned long *ret_nr_writeback) 679 unsigned long *ret_nr_writeback,
680 bool force_reclaim)
679{ 681{
680 LIST_HEAD(ret_pages); 682 LIST_HEAD(ret_pages);
681 LIST_HEAD(free_pages); 683 LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
689 691
690 mem_cgroup_uncharge_start(); 692 mem_cgroup_uncharge_start();
691 while (!list_empty(page_list)) { 693 while (!list_empty(page_list)) {
692 enum page_references references;
693 struct address_space *mapping; 694 struct address_space *mapping;
694 struct page *page; 695 struct page *page;
695 int may_enter_fs; 696 int may_enter_fs;
697 enum page_references references = PAGEREF_RECLAIM_CLEAN;
696 698
697 cond_resched(); 699 cond_resched();
698 700
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
707 709
708 sc->nr_scanned++; 710 sc->nr_scanned++;
709 711
710 if (unlikely(!page_evictable(page, NULL))) 712 if (unlikely(!page_evictable(page)))
711 goto cull_mlocked; 713 goto cull_mlocked;
712 714
713 if (!sc->may_unmap && page_mapped(page)) 715 if (!sc->may_unmap && page_mapped(page))
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
758 wait_on_page_writeback(page); 760 wait_on_page_writeback(page);
759 } 761 }
760 762
761 references = page_check_references(page, sc); 763 if (!force_reclaim)
764 references = page_check_references(page, sc);
765
762 switch (references) { 766 switch (references) {
763 case PAGEREF_ACTIVATE: 767 case PAGEREF_ACTIVATE:
764 goto activate_locked; 768 goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
788 * processes. Try to unmap it here. 792 * processes. Try to unmap it here.
789 */ 793 */
790 if (page_mapped(page) && mapping) { 794 if (page_mapped(page) && mapping) {
791 switch (try_to_unmap(page, TTU_UNMAP)) { 795 switch (try_to_unmap(page, ttu_flags)) {
792 case SWAP_FAIL: 796 case SWAP_FAIL:
793 goto activate_locked; 797 goto activate_locked;
794 case SWAP_AGAIN: 798 case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
960 return nr_reclaimed; 964 return nr_reclaimed;
961} 965}
962 966
967unsigned long reclaim_clean_pages_from_list(struct zone *zone,
968 struct list_head *page_list)
969{
970 struct scan_control sc = {
971 .gfp_mask = GFP_KERNEL,
972 .priority = DEF_PRIORITY,
973 .may_unmap = 1,
974 };
975 unsigned long ret, dummy1, dummy2;
976 struct page *page, *next;
977 LIST_HEAD(clean_pages);
978
979 list_for_each_entry_safe(page, next, page_list, lru) {
980 if (page_is_file_cache(page) && !PageDirty(page)) {
981 ClearPageActive(page);
982 list_move(&page->lru, &clean_pages);
983 }
984 }
985
986 ret = shrink_page_list(&clean_pages, zone, &sc,
987 TTU_UNMAP|TTU_IGNORE_ACCESS,
988 &dummy1, &dummy2, true);
989 list_splice(&clean_pages, page_list);
990 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
991 return ret;
992}
993
963/* 994/*
964 * Attempt to remove the specified page from its LRU. Only take this page 995 * Attempt to remove the specified page from its LRU. Only take this page
965 * if it is of the appropriate PageActive status. Pages which are being 996 * if it is of the appropriate PageActive status. Pages which are being
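The new reclaim_clean_pages_from_list() lets a contiguous-allocation path drop clean, unmapped file pages from an isolated list instead of migrating them. A hedged sketch of a hypothetical caller (function and variable names are illustrative, not from this patch):

static void my_prepare_range_for_migration(struct zone *zone,
					   struct list_head *isolated)
{
	unsigned long dropped;

	/* Clean file-backed pages are cheaper to reclaim than to migrate;
	 * the helper returns how many it freed and already adjusts
	 * NR_ISOLATED_FILE for them. */
	dropped = reclaim_clean_pages_from_list(zone, isolated);
	pr_debug("dropped %lu clean pages before migration\n", dropped);
}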
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
978 if (!PageLRU(page)) 1009 if (!PageLRU(page))
979 return ret; 1010 return ret;
980 1011
981 /* Do not give back unevictable pages for compaction */ 1012 /* Compaction should not handle unevictable pages but CMA can do so */
982 if (PageUnevictable(page)) 1013 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
983 return ret; 1014 return ret;
984 1015
985 ret = -EBUSY; 1016 ret = -EBUSY;
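The new ISOLATE_UNEVICTABLE bit lets a CMA-style caller pull even unevictable pages off the LRU, while compaction keeps passing a mode without it and so still skips them. Illustrative fragment under that assumption (helper name hypothetical; LRU statistics handling is omitted and zone->lru_lock is assumed held):

static bool my_take_page_for_cma(struct page *page, struct list_head *dst)
{
	/* ISOLATE_UNEVICTABLE: acceptable here because the range is being
	 * emptied for a contiguous allocation, not compacted. */
	if (__isolate_lru_page(page, ISOLATE_UNEVICTABLE) != 0)
		return false;

	list_move(&page->lru, dst);
	return true;
}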
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1186 1217
1187 VM_BUG_ON(PageLRU(page)); 1218 VM_BUG_ON(PageLRU(page));
1188 list_del(&page->lru); 1219 list_del(&page->lru);
1189 if (unlikely(!page_evictable(page, NULL))) { 1220 if (unlikely(!page_evictable(page))) {
1190 spin_unlock_irq(&zone->lru_lock); 1221 spin_unlock_irq(&zone->lru_lock);
1191 putback_lru_page(page); 1222 putback_lru_page(page);
1192 spin_lock_irq(&zone->lru_lock); 1223 spin_lock_irq(&zone->lru_lock);
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1278 if (nr_taken == 0) 1309 if (nr_taken == 0)
1279 return 0; 1310 return 0;
1280 1311
1281 nr_reclaimed = shrink_page_list(&page_list, zone, sc, 1312 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1282 &nr_dirty, &nr_writeback); 1313 &nr_dirty, &nr_writeback, false);
1283 1314
1284 spin_lock_irq(&zone->lru_lock); 1315 spin_lock_irq(&zone->lru_lock);
1285 1316
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1439 page = lru_to_page(&l_hold); 1470 page = lru_to_page(&l_hold);
1440 list_del(&page->lru); 1471 list_del(&page->lru);
1441 1472
1442 if (unlikely(!page_evictable(page, NULL))) { 1473 if (unlikely(!page_evictable(page))) {
1443 putback_lru_page(page); 1474 putback_lru_page(page);
1444 continue; 1475 continue;
1445 } 1476 }
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1729 return false; 1760 return false;
1730} 1761}
1731 1762
1763#ifdef CONFIG_COMPACTION
1764/*
1765 * If compaction is deferred for sc->order then scale the number of pages
1766 * reclaimed based on the number of consecutive allocation failures
1767 */
1768static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1769 struct lruvec *lruvec, struct scan_control *sc)
1770{
1771 struct zone *zone = lruvec_zone(lruvec);
1772
1773 if (zone->compact_order_failed <= sc->order)
1774 pages_for_compaction <<= zone->compact_defer_shift;
1775 return pages_for_compaction;
1776}
1777#else
1778static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1779 struct lruvec *lruvec, struct scan_control *sc)
1780{
1781 return pages_for_compaction;
1782}
1783#endif
1784
1732/* 1785/*
1733 * Reclaim/compaction is used for high-order allocation requests. It reclaims 1786 * Reclaim/compaction is used for high-order allocation requests. It reclaims
1734 * order-0 pages before compacting the zone. should_continue_reclaim() returns 1787 * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1776 * inactive lists are large enough, continue reclaiming 1829 * inactive lists are large enough, continue reclaiming
1777 */ 1830 */
1778 pages_for_compaction = (2UL << sc->order); 1831 pages_for_compaction = (2UL << sc->order);
1832
1833 pages_for_compaction = scale_for_compaction(pages_for_compaction,
1834 lruvec, sc);
1779 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1835 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1780 if (nr_swap_pages > 0) 1836 if (nr_swap_pages > 0)
1781 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1837 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
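The scaling above simply left-shifts the reclaim target by the zone's compact_defer_shift when compaction has been deferred at this order. A worked example of the arithmetic (plain C, not kernel code): an order-3 request normally asks for 2UL << 3 = 16 pages; after two deferrals (compact_defer_shift == 2) that becomes 64.

#include <stdio.h>

int main(void)
{
	int order = 3;
	int compact_defer_shift = 2;			/* two consecutive deferrals */
	unsigned long pages_for_compaction = 2UL << order;	/* 16 */

	/* mirrors scale_for_compaction() when compact_order_failed <= order */
	pages_for_compaction <<= compact_defer_shift;		/* 64 */

	printf("reclaim target: %lu pages\n", pages_for_compaction);
	return 0;
}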
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2839 */ 2895 */
2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2896 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2841 2897
2898 /*
2899 * Compaction records what page blocks it recently failed to
2900 * isolate pages from and skips them in the future scanning.
2901 * When kswapd is going to sleep, it is reasonable to assume
2902 * that pages and compaction may succeed so reset the cache.
2903 */
2904 reset_isolation_suitable(pgdat);
2905
2842 if (!kthread_should_stop()) 2906 if (!kthread_should_stop())
2843 schedule(); 2907 schedule();
2844 2908
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid)
3101 if (IS_ERR(pgdat->kswapd)) { 3165 if (IS_ERR(pgdat->kswapd)) {
3102 /* failure at boot is fatal */ 3166 /* failure at boot is fatal */
3103 BUG_ON(system_state == SYSTEM_BOOTING); 3167 BUG_ON(system_state == SYSTEM_BOOTING);
3104 printk("Failed to start kswapd on node %d\n",nid);
3105 pgdat->kswapd = NULL; 3168 pgdat->kswapd = NULL;
3106 ret = -1; 3169 pr_err("Failed to start kswapd on node %d\n", nid);
3170 ret = PTR_ERR(pgdat->kswapd);
3107 } 3171 }
3108 return ret; 3172 return ret;
3109} 3173}
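The kswapd_run() hunk switches from a bare printk() and a hard-coded -1 to pr_err() and the encoded error value. The usual kthread error pattern, as a sketch with hypothetical names (note that PTR_ERR() has to be taken from the ERR_PTR value itself):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

static int my_thread_fn(void *data);		/* hypothetical worker */

static int my_thread_start(int nid)
{
	struct task_struct *tsk;

	tsk = kthread_run(my_thread_fn, NULL, "mythread%d", nid);
	if (IS_ERR(tsk)) {
		pr_err("Failed to start mythread on node %d\n", nid);
		return PTR_ERR(tsk);	/* propagate the real errno */
	}
	return 0;
}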
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3350/* 3414/*
3351 * page_evictable - test whether a page is evictable 3415 * page_evictable - test whether a page is evictable
3352 * @page: the page to test 3416 * @page: the page to test
3353 * @vma: the VMA in which the page is or will be mapped, may be NULL
3354 * 3417 *
3355 * Test whether page is evictable--i.e., should be placed on active/inactive 3418 * Test whether page is evictable--i.e., should be placed on active/inactive
3356 * lists vs unevictable list. The vma argument is !NULL when called from the 3419 * lists vs unevictable list.
3357 * fault path to determine how to instantate a new page.
3358 * 3420 *
3359 * Reasons page might not be evictable: 3421 * Reasons page might not be evictable:
3360 * (1) page's mapping marked unevictable 3422 * (1) page's mapping marked unevictable
3361 * (2) page is part of an mlocked VMA 3423 * (2) page is part of an mlocked VMA
3362 * 3424 *
3363 */ 3425 */
3364int page_evictable(struct page *page, struct vm_area_struct *vma) 3426int page_evictable(struct page *page)
3365{ 3427{
3366 3428 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3367 if (mapping_unevictable(page_mapping(page)))
3368 return 0;
3369
3370 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3371 return 0;
3372
3373 return 1;
3374} 3429}
3375 3430
3376#ifdef CONFIG_SHMEM 3431#ifdef CONFIG_SHMEM
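page_evictable() now takes only the page and reduces to a single boolean expression. A sketch of an updated out-of-tree caller under that assumption (helper name hypothetical):

static bool my_page_is_reclaimable(struct page *page)
{
	/* old form: page_evictable(page, NULL) */
	return page_evictable(page) && !PageWriteback(page);
}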
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3408 if (!PageLRU(page) || !PageUnevictable(page)) 3463 if (!PageLRU(page) || !PageUnevictable(page))
3409 continue; 3464 continue;
3410 3465
3411 if (page_evictable(page, NULL)) { 3466 if (page_evictable(page)) {
3412 enum lru_list lru = page_lru_base_type(page); 3467 enum lru_list lru = page_lru_base_type(page);
3413 3468
3414 VM_BUG_ON(PageActive(page)); 3469 VM_BUG_ON(PageActive(page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df7a6748231d..c7370579111b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
495 atomic_long_add(global_diff[i], &vm_stat[i]); 495 atomic_long_add(global_diff[i], &vm_stat[i]);
496} 496}
497 497
498void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
499{
500 int i;
501
502 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
503 if (pset->vm_stat_diff[i]) {
504 int v = pset->vm_stat_diff[i];
505 pset->vm_stat_diff[i] = 0;
506 atomic_long_add(v, &zone->vm_stat[i]);
507 atomic_long_add(v, &vm_stat[i]);
508 }
509}
498#endif 510#endif
499 511
500#ifdef CONFIG_NUMA 512#ifdef CONFIG_NUMA
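drain_zonestat() folds a per-cpu pageset's vm_stat_diff deltas back into the zone and global counters. A hedged sketch of how a caller might drain every online CPU's pageset for one zone (caller name hypothetical):

static void my_drain_zone_stats(struct zone *zone)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct per_cpu_pageset *pset = per_cpu_ptr(zone->pageset, cpu);

		/* moves pset->vm_stat_diff[] into zone->vm_stat[] and vm_stat[] */
		drain_zonestat(zone, pset);
	}
}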
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = {
722 "numa_other", 734 "numa_other",
723#endif 735#endif
724 "nr_anon_transparent_hugepages", 736 "nr_anon_transparent_hugepages",
737 "nr_free_cma",
725 "nr_dirty_threshold", 738 "nr_dirty_threshold",
726 "nr_dirty_background_threshold", 739 "nr_dirty_background_threshold",
727 740
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = {
781 "unevictable_pgs_munlocked", 794 "unevictable_pgs_munlocked",
782 "unevictable_pgs_cleared", 795 "unevictable_pgs_cleared",
783 "unevictable_pgs_stranded", 796 "unevictable_pgs_stranded",
784 "unevictable_pgs_mlockfreed",
785 797
786#ifdef CONFIG_TRANSPARENT_HUGEPAGE 798#ifdef CONFIG_TRANSPARENT_HUGEPAGE
787 "thp_fault_alloc", 799 "thp_fault_alloc",
@@ -1157,7 +1169,7 @@ static void __cpuinit start_cpu_timer(int cpu)
1157{ 1169{
1158 struct delayed_work *work = &per_cpu(vmstat_work, cpu); 1170 struct delayed_work *work = &per_cpu(vmstat_work, cpu);
1159 1171
1160 INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); 1172 INIT_DEFERRABLE_WORK(work, vmstat_update);
1161 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1173 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1162} 1174}
1163 1175
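INIT_DELAYED_WORK_DEFERRABLE() was renamed to INIT_DEFERRABLE_WORK() in the workqueue API cleanup; the behaviour is unchanged. Minimal sketch with a hypothetical handler, mirroring the per-CPU timer setup above:

#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

static void my_update(struct work_struct *work);	/* hypothetical */
static DEFINE_PER_CPU(struct delayed_work, my_work);

static void my_start_cpu_timer(int cpu)
{
	struct delayed_work *work = &per_cpu(my_work, cpu);

	/* deferrable: an idle CPU is not woken up just to run this work */
	INIT_DEFERRABLE_WORK(work, my_update);
	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}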