Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  10
-rw-r--r--  mm/Makefile          |   7
-rw-r--r--  mm/backing-dev.c     | 520
-rw-r--r--  mm/bootmem.c         |  13
-rw-r--r--  mm/bounce.c          |   2
-rw-r--r--  mm/compaction.c      |   7
-rw-r--r--  mm/dmapool.c         |   2
-rw-r--r--  mm/filemap.c         |  44
-rw-r--r--  mm/fremap.c          |   7
-rw-r--r--  mm/highmem.c         |  61
-rw-r--r--  mm/hugetlb.c         | 312
-rw-r--r--  mm/hwpoison-inject.c |  15
-rw-r--r--  mm/init-mm.c         |   6
-rw-r--r--  mm/internal.h        |   2
-rw-r--r--  mm/kmemleak.c        | 100
-rw-r--r--  mm/ksm.c             |  80
-rw-r--r--  mm/maccess.c         |   2
-rw-r--r--  mm/memblock.c        | 837
-rw-r--r--  mm/memcontrol.c      | 870
-rw-r--r--  mm/memory-failure.c  | 327
-rw-r--r--  mm/memory.c          | 136
-rw-r--r--  mm/memory_hotplug.c  |  66
-rw-r--r--  mm/mempolicy.c       |  99
-rw-r--r--  mm/migrate.c         | 259
-rw-r--r--  mm/mlock.c           |  13
-rw-r--r--  mm/mmap.c            |  77
-rw-r--r--  mm/mmzone.c          |  21
-rw-r--r--  mm/mremap.c          |   4
-rw-r--r--  mm/nommu.c           |  63
-rw-r--r--  mm/oom_kill.c        | 725
-rw-r--r--  mm/page-writeback.c  | 309
-rw-r--r--  mm/page_alloc.c      | 255
-rw-r--r--  mm/page_io.c         |   2
-rw-r--r--  mm/page_isolation.c  |   3
-rw-r--r--  mm/percpu-km.c       |   8
-rw-r--r--  mm/percpu.c          | 476
-rw-r--r--  mm/percpu_up.c       |  30
-rw-r--r--  mm/rmap.c            | 227
-rw-r--r--  mm/shmem.c           | 156
-rw-r--r--  mm/slab.c            |  11
-rw-r--r--  mm/slob.c            |  18
-rw-r--r--  mm/slub.c            | 871
-rw-r--r--  mm/sparse-vmemmap.c  |  11
-rw-r--r--  mm/swap.c            |   1
-rw-r--r--  mm/swapfile.c        |  78
-rw-r--r--  mm/truncate.c        |  38
-rw-r--r--  mm/util.c            |  24
-rw-r--r--  mm/vmalloc.c         |  80
-rw-r--r--  mm/vmscan.c          | 771
-rw-r--r--  mm/vmstat.c          |  68
50 files changed, 5193 insertions, 2931 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f4e516e9c37..c2c8a4a1189 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -189,7 +189,7 @@ config COMPACTION
189config MIGRATION 189config MIGRATION
190 bool "Page migration" 190 bool "Page migration"
191 def_bool y 191 def_bool y
192 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE 192 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION
193 help 193 help
194 Allows the migration of the physical location of pages of processes 194 Allows the migration of the physical location of pages of processes
195 while the virtual addresses are not changed. This is useful in 195 while the virtual addresses are not changed. This is useful in
@@ -301,3 +301,11 @@ config NOMMU_INITIAL_TRIM_EXCESS
301 of 1 says that all excess pages should be trimmed. 301 of 1 says that all excess pages should be trimmed.
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304
305#
306# UP and nommu archs use km based percpu allocator
307#
308config NEED_PER_CPU_KM
309 depends on !SMP
310 bool
311 default y
diff --git a/mm/Makefile b/mm/Makefile
index 34b2546a9e3..f73f75a29f8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o mmu_context.o \ 14 page_isolation.o mm_init.o mmu_context.o percpu.o \
15 $(mmu-y) 15 $(mmu-y)
16obj-y += init-mm.o 16obj-y += init-mm.o
17 17
@@ -36,11 +36,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 37obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 38obj-$(CONFIG_MIGRATION) += migrate.o
39ifdef CONFIG_SMP
40obj-y += percpu.o
41else
42obj-y += percpu_up.o
43endif
44obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
45obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
46obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 41obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 123bcef13e5..027100d3022 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/device.h> 12#include <linux/device.h>
13#include <trace/events/writeback.h>
13 14
14static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
15 16
@@ -29,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
29 30
30struct backing_dev_info noop_backing_dev_info = { 31struct backing_dev_info noop_backing_dev_info = {
31 .name = "noop", 32 .name = "noop",
33 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
32}; 34};
33EXPORT_SYMBOL_GPL(noop_backing_dev_info); 35EXPORT_SYMBOL_GPL(noop_backing_dev_info);
34 36
@@ -49,8 +51,6 @@ static struct timer_list sync_supers_timer;
49static int bdi_sync_supers(void *); 51static int bdi_sync_supers(void *);
50static void sync_supers_timer_fn(unsigned long); 52static void sync_supers_timer_fn(unsigned long);
51 53
52static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
53
54#ifdef CONFIG_DEBUG_FS 54#ifdef CONFIG_DEBUG_FS
55#include <linux/debugfs.h> 55#include <linux/debugfs.h>
56#include <linux/seq_file.h> 56#include <linux/seq_file.h>
@@ -65,31 +65,25 @@ static void bdi_debug_init(void)
65static int bdi_debug_stats_show(struct seq_file *m, void *v) 65static int bdi_debug_stats_show(struct seq_file *m, void *v)
66{ 66{
67 struct backing_dev_info *bdi = m->private; 67 struct backing_dev_info *bdi = m->private;
68 struct bdi_writeback *wb; 68 struct bdi_writeback *wb = &bdi->wb;
69 unsigned long background_thresh; 69 unsigned long background_thresh;
70 unsigned long dirty_thresh; 70 unsigned long dirty_thresh;
71 unsigned long bdi_thresh; 71 unsigned long bdi_thresh;
72 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 72 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
73 struct inode *inode; 73 struct inode *inode;
74 74
75 /*
76 * inode lock is enough here, the bdi->wb_list is protected by
77 * RCU on the reader side
78 */
79 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 75 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
80 spin_lock(&inode_lock); 76 spin_lock(&inode_lock);
81 list_for_each_entry(wb, &bdi->wb_list, list) { 77 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
82 nr_wb++; 78 nr_dirty++;
83 list_for_each_entry(inode, &wb->b_dirty, i_list) 79 list_for_each_entry(inode, &wb->b_io, i_wb_list)
84 nr_dirty++; 80 nr_io++;
85 list_for_each_entry(inode, &wb->b_io, i_list) 81 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
86 nr_io++; 82 nr_more_io++;
87 list_for_each_entry(inode, &wb->b_more_io, i_list)
88 nr_more_io++;
89 }
90 spin_unlock(&inode_lock); 83 spin_unlock(&inode_lock);
91 84
92 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 85 global_dirty_limits(&background_thresh, &dirty_thresh);
86 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
93 87
94#define K(x) ((x) << (PAGE_SHIFT - 10)) 88#define K(x) ((x) << (PAGE_SHIFT - 10))
95 seq_printf(m, 89 seq_printf(m,
@@ -98,19 +92,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
98 "BdiDirtyThresh: %8lu kB\n" 92 "BdiDirtyThresh: %8lu kB\n"
99 "DirtyThresh: %8lu kB\n" 93 "DirtyThresh: %8lu kB\n"
100 "BackgroundThresh: %8lu kB\n" 94 "BackgroundThresh: %8lu kB\n"
101 "WritebackThreads: %8lu\n"
102 "b_dirty: %8lu\n" 95 "b_dirty: %8lu\n"
103 "b_io: %8lu\n" 96 "b_io: %8lu\n"
104 "b_more_io: %8lu\n" 97 "b_more_io: %8lu\n"
105 "bdi_list: %8u\n" 98 "bdi_list: %8u\n"
106 "state: %8lx\n" 99 "state: %8lx\n",
107 "wb_list: %8u\n",
108 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 100 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
109 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 101 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
110 K(bdi_thresh), K(dirty_thresh), 102 K(bdi_thresh), K(dirty_thresh),
111 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, 103 K(background_thresh), nr_dirty, nr_io, nr_more_io,
112 !list_empty(&bdi->bdi_list), bdi->state, 104 !list_empty(&bdi->bdi_list), bdi->state);
113 !list_empty(&bdi->wb_list));
114#undef K 105#undef K
115 106
116 return 0; 107 return 0;
@@ -247,89 +238,18 @@ static int __init default_bdi_init(void)
247 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); 238 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
248 BUG_ON(IS_ERR(sync_supers_tsk)); 239 BUG_ON(IS_ERR(sync_supers_tsk));
249 240
250 init_timer(&sync_supers_timer);
251 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 241 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
252 bdi_arm_supers_timer(); 242 bdi_arm_supers_timer();
253 243
254 err = bdi_init(&default_backing_dev_info); 244 err = bdi_init(&default_backing_dev_info);
255 if (!err) 245 if (!err)
256 bdi_register(&default_backing_dev_info, NULL, "default"); 246 bdi_register(&default_backing_dev_info, NULL, "default");
247 err = bdi_init(&noop_backing_dev_info);
257 248
258 return err; 249 return err;
259} 250}
260subsys_initcall(default_bdi_init); 251subsys_initcall(default_bdi_init);
261 252
262static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
263{
264 memset(wb, 0, sizeof(*wb));
265
266 wb->bdi = bdi;
267 wb->last_old_flush = jiffies;
268 INIT_LIST_HEAD(&wb->b_dirty);
269 INIT_LIST_HEAD(&wb->b_io);
270 INIT_LIST_HEAD(&wb->b_more_io);
271}
272
273static void bdi_task_init(struct backing_dev_info *bdi,
274 struct bdi_writeback *wb)
275{
276 struct task_struct *tsk = current;
277
278 spin_lock(&bdi->wb_lock);
279 list_add_tail_rcu(&wb->list, &bdi->wb_list);
280 spin_unlock(&bdi->wb_lock);
281
282 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
283 set_freezable();
284
285 /*
286 * Our parent may run at a different priority, just set us to normal
287 */
288 set_user_nice(tsk, 0);
289}
290
291static int bdi_start_fn(void *ptr)
292{
293 struct bdi_writeback *wb = ptr;
294 struct backing_dev_info *bdi = wb->bdi;
295 int ret;
296
297 /*
298 * Add us to the active bdi_list
299 */
300 spin_lock_bh(&bdi_lock);
301 list_add_rcu(&bdi->bdi_list, &bdi_list);
302 spin_unlock_bh(&bdi_lock);
303
304 bdi_task_init(bdi, wb);
305
306 /*
307 * Clear pending bit and wakeup anybody waiting to tear us down
308 */
309 clear_bit(BDI_pending, &bdi->state);
310 smp_mb__after_clear_bit();
311 wake_up_bit(&bdi->state, BDI_pending);
312
313 ret = bdi_writeback_task(wb);
314
315 /*
316 * Remove us from the list
317 */
318 spin_lock(&bdi->wb_lock);
319 list_del_rcu(&wb->list);
320 spin_unlock(&bdi->wb_lock);
321
322 /*
323 * Flush any work that raced with us exiting. No new work
324 * will be added, since this bdi isn't discoverable anymore.
325 */
326 if (!list_empty(&bdi->work_list))
327 wb_do_writeback(wb, 1);
328
329 wb->task = NULL;
330 return ret;
331}
332
333int bdi_has_dirty_io(struct backing_dev_info *bdi) 253int bdi_has_dirty_io(struct backing_dev_info *bdi)
334{ 254{
335 return wb_has_dirty_io(&bdi->wb); 255 return wb_has_dirty_io(&bdi->wb);
@@ -348,10 +268,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
348} 268}
349 269
350/* 270/*
351 * kupdated() used to do this. We cannot do it from the bdi_forker_task() 271 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
352 * or we risk deadlocking on ->s_umount. The longer term solution would be 272 * or we risk deadlocking on ->s_umount. The longer term solution would be
353 * to implement sync_supers_bdi() or similar and simply do it from the 273 * to implement sync_supers_bdi() or similar and simply do it from the
354 * bdi writeback tasks individually. 274 * bdi writeback thread individually.
355 */ 275 */
356static int bdi_sync_supers(void *unused) 276static int bdi_sync_supers(void *unused)
357{ 277{
@@ -387,144 +307,201 @@ static void sync_supers_timer_fn(unsigned long unused)
387 bdi_arm_supers_timer(); 307 bdi_arm_supers_timer();
388} 308}
389 309
390static int bdi_forker_task(void *ptr) 310static void wakeup_timer_fn(unsigned long data)
311{
312 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
313
314 spin_lock_bh(&bdi->wb_lock);
315 if (bdi->wb.task) {
316 trace_writeback_wake_thread(bdi);
317 wake_up_process(bdi->wb.task);
318 } else {
319 /*
320 * When bdi tasks are inactive for long time, they are killed.
321 * In this case we have to wake-up the forker thread which
322 * should create and run the bdi thread.
323 */
324 trace_writeback_wake_forker_thread(bdi);
325 wake_up_process(default_backing_dev_info.wb.task);
326 }
327 spin_unlock_bh(&bdi->wb_lock);
328}
329
330/*
331 * This function is used when the first inode for this bdi is marked dirty. It
332 * wakes-up the corresponding bdi thread which should then take care of the
333 * periodic background write-out of dirty inodes. Since the write-out would
334 * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
335 * set up a timer which wakes the bdi thread up later.
336 *
337 * Note, we wouldn't bother setting up the timer, but this function is on the
338 * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
339 * by delaying the wake-up.
340 */
341void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
342{
343 unsigned long timeout;
344
345 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
346 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
347}
348
349/*
350 * Calculate the longest interval (jiffies) bdi threads are allowed to be
351 * inactive.
352 */
353static unsigned long bdi_longest_inactive(void)
354{
355 unsigned long interval;
356
357 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
358 return max(5UL * 60 * HZ, interval);
359}
360
361static int bdi_forker_thread(void *ptr)
391{ 362{
392 struct bdi_writeback *me = ptr; 363 struct bdi_writeback *me = ptr;
393 364
394 bdi_task_init(me->bdi, me); 365 current->flags |= PF_SWAPWRITE;
366 set_freezable();
367
368 /*
369 * Our parent may run at a different priority, just set us to normal
370 */
371 set_user_nice(current, 0);
395 372
396 for (;;) { 373 for (;;) {
397 struct backing_dev_info *bdi, *tmp; 374 struct task_struct *task = NULL;
398 struct bdi_writeback *wb; 375 struct backing_dev_info *bdi;
376 enum {
377 NO_ACTION, /* Nothing to do */
378 FORK_THREAD, /* Fork bdi thread */
379 KILL_THREAD, /* Kill inactive bdi thread */
380 } action = NO_ACTION;
399 381
400 /* 382 /*
401 * Temporary measure, we want to make sure we don't see 383 * Temporary measure, we want to make sure we don't see
402 * dirty data on the default backing_dev_info 384 * dirty data on the default backing_dev_info
403 */ 385 */
404 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) 386 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
387 del_timer(&me->wakeup_timer);
405 wb_do_writeback(me, 0); 388 wb_do_writeback(me, 0);
389 }
406 390
407 spin_lock_bh(&bdi_lock); 391 spin_lock_bh(&bdi_lock);
392 set_current_state(TASK_INTERRUPTIBLE);
408 393
409 /* 394 list_for_each_entry(bdi, &bdi_list, bdi_list) {
410 * Check if any existing bdi's have dirty data without 395 bool have_dirty_io;
411 * a thread registered. If so, set that up. 396
412 */ 397 if (!bdi_cap_writeback_dirty(bdi) ||
413 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { 398 bdi_cap_flush_forker(bdi))
414 if (bdi->wb.task)
415 continue;
416 if (list_empty(&bdi->work_list) &&
417 !bdi_has_dirty_io(bdi))
418 continue; 399 continue;
419 400
420 bdi_add_default_flusher_task(bdi); 401 WARN(!test_bit(BDI_registered, &bdi->state),
421 } 402 "bdi %p/%s is not registered!\n", bdi, bdi->name);
422 403
423 set_current_state(TASK_INTERRUPTIBLE); 404 have_dirty_io = !list_empty(&bdi->work_list) ||
405 wb_has_dirty_io(&bdi->wb);
424 406
425 if (list_empty(&bdi_pending_list)) { 407 /*
426 unsigned long wait; 408 * If the bdi has work to do, but the thread does not
409 * exist - create it.
410 */
411 if (!bdi->wb.task && have_dirty_io) {
412 /*
413 * Set the pending bit - if someone will try to
414 * unregister this bdi - it'll wait on this bit.
415 */
416 set_bit(BDI_pending, &bdi->state);
417 action = FORK_THREAD;
418 break;
419 }
420
421 spin_lock(&bdi->wb_lock);
427 422
428 spin_unlock_bh(&bdi_lock); 423 /*
429 wait = msecs_to_jiffies(dirty_writeback_interval * 10); 424 * If there is no work to do and the bdi thread was
430 if (wait) 425 * inactive long enough - kill it. The wb_lock is taken
431 schedule_timeout(wait); 426 * to make sure no-one adds more work to this bdi and
427 * wakes the bdi thread up.
428 */
429 if (bdi->wb.task && !have_dirty_io &&
430 time_after(jiffies, bdi->wb.last_active +
431 bdi_longest_inactive())) {
432 task = bdi->wb.task;
433 bdi->wb.task = NULL;
434 spin_unlock(&bdi->wb_lock);
435 set_bit(BDI_pending, &bdi->state);
436 action = KILL_THREAD;
437 break;
438 }
439 spin_unlock(&bdi->wb_lock);
440 }
441 spin_unlock_bh(&bdi_lock);
442
443 /* Keep working if default bdi still has things to do */
444 if (!list_empty(&me->bdi->work_list))
445 __set_current_state(TASK_RUNNING);
446
447 switch (action) {
448 case FORK_THREAD:
449 __set_current_state(TASK_RUNNING);
450 task = kthread_create(bdi_writeback_thread, &bdi->wb,
451 "flush-%s", dev_name(bdi->dev));
452 if (IS_ERR(task)) {
453 /*
454 * If thread creation fails, force writeout of
455 * the bdi from the thread.
456 */
457 bdi_flush_io(bdi);
458 } else {
459 /*
460 * The spinlock makes sure we do not lose
461 * wake-ups when racing with 'bdi_queue_work()'.
462 * And as soon as the bdi thread is visible, we
463 * can start it.
464 */
465 spin_lock_bh(&bdi->wb_lock);
466 bdi->wb.task = task;
467 spin_unlock_bh(&bdi->wb_lock);
468 wake_up_process(task);
469 }
470 break;
471
472 case KILL_THREAD:
473 __set_current_state(TASK_RUNNING);
474 kthread_stop(task);
475 break;
476
477 case NO_ACTION:
478 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
479 /*
480 * There are no dirty data. The only thing we
481 * should now care about is checking for
482 * inactive bdi threads and killing them. Thus,
483 * let's sleep for longer time, save energy and
484 * be friendly for battery-driven devices.
485 */
486 schedule_timeout(bdi_longest_inactive());
432 else 487 else
433 schedule(); 488 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
434 try_to_freeze(); 489 try_to_freeze();
490 /* Back to the main loop */
435 continue; 491 continue;
436 } 492 }
437 493
438 __set_current_state(TASK_RUNNING);
439
440 /* 494 /*
441 * This is our real job - check for pending entries in 495 * Clear pending bit and wakeup anybody waiting to tear us down.
442 * bdi_pending_list, and create the tasks that got added
443 */ 496 */
444 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, 497 clear_bit(BDI_pending, &bdi->state);
445 bdi_list); 498 smp_mb__after_clear_bit();
446 list_del_init(&bdi->bdi_list); 499 wake_up_bit(&bdi->state, BDI_pending);
447 spin_unlock_bh(&bdi_lock);
448
449 wb = &bdi->wb;
450 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
451 dev_name(bdi->dev));
452 /*
453 * If task creation fails, then readd the bdi to
454 * the pending list and force writeout of the bdi
455 * from this forker thread. That will free some memory
456 * and we can try again.
457 */
458 if (IS_ERR(wb->task)) {
459 wb->task = NULL;
460
461 /*
462 * Add this 'bdi' to the back, so we get
463 * a chance to flush other bdi's to free
464 * memory.
465 */
466 spin_lock_bh(&bdi_lock);
467 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
468 spin_unlock_bh(&bdi_lock);
469
470 bdi_flush_io(bdi);
471 }
472 } 500 }
473 501
474 return 0; 502 return 0;
475} 503}
476 504
477static void bdi_add_to_pending(struct rcu_head *head)
478{
479 struct backing_dev_info *bdi;
480
481 bdi = container_of(head, struct backing_dev_info, rcu_head);
482 INIT_LIST_HEAD(&bdi->bdi_list);
483
484 spin_lock(&bdi_lock);
485 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
486 spin_unlock(&bdi_lock);
487
488 /*
489 * We are now on the pending list, wake up bdi_forker_task()
490 * to finish the job and add us back to the active bdi_list
491 */
492 wake_up_process(default_backing_dev_info.wb.task);
493}
494
495/*
496 * Add the default flusher task that gets created for any bdi
497 * that has dirty data pending writeout
498 */
499void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
500{
501 if (!bdi_cap_writeback_dirty(bdi))
502 return;
503
504 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
505 printk(KERN_ERR "bdi %p/%s is not registered!\n",
506 bdi, bdi->name);
507 return;
508 }
509
510 /*
511 * Check with the helper whether to proceed adding a task. Will only
512 * abort if we two or more simultanous calls to
513 * bdi_add_default_flusher_task() occured, further additions will block
514 * waiting for previous additions to finish.
515 */
516 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
517 list_del_rcu(&bdi->bdi_list);
518
519 /*
520 * We must wait for the current RCU period to end before
521 * moving to the pending list. So schedule that operation
522 * from an RCU callback.
523 */
524 call_rcu(&bdi->rcu_head, bdi_add_to_pending);
525 }
526}
527
528/* 505/*
529 * Remove bdi from bdi_list, and ensure that it is no longer visible 506 * Remove bdi from bdi_list, and ensure that it is no longer visible
530 */ 507 */
@@ -541,23 +518,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
541 const char *fmt, ...) 518 const char *fmt, ...)
542{ 519{
543 va_list args; 520 va_list args;
544 int ret = 0;
545 struct device *dev; 521 struct device *dev;
546 522
547 if (bdi->dev) /* The driver needs to use separate queues per device */ 523 if (bdi->dev) /* The driver needs to use separate queues per device */
548 goto exit; 524 return 0;
549 525
550 va_start(args, fmt); 526 va_start(args, fmt);
551 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); 527 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
552 va_end(args); 528 va_end(args);
553 if (IS_ERR(dev)) { 529 if (IS_ERR(dev))
554 ret = PTR_ERR(dev); 530 return PTR_ERR(dev);
555 goto exit;
556 }
557
558 spin_lock_bh(&bdi_lock);
559 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
560 spin_unlock_bh(&bdi_lock);
561 531
562 bdi->dev = dev; 532 bdi->dev = dev;
563 533
@@ -569,21 +539,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
569 if (bdi_cap_flush_forker(bdi)) { 539 if (bdi_cap_flush_forker(bdi)) {
570 struct bdi_writeback *wb = &bdi->wb; 540 struct bdi_writeback *wb = &bdi->wb;
571 541
572 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", 542 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
573 dev_name(dev)); 543 dev_name(dev));
574 if (IS_ERR(wb->task)) { 544 if (IS_ERR(wb->task))
575 wb->task = NULL; 545 return PTR_ERR(wb->task);
576 ret = -ENOMEM;
577
578 bdi_remove_from_list(bdi);
579 goto exit;
580 }
581 } 546 }
582 547
583 bdi_debug_register(bdi, dev_name(dev)); 548 bdi_debug_register(bdi, dev_name(dev));
584 set_bit(BDI_registered, &bdi->state); 549 set_bit(BDI_registered, &bdi->state);
585exit: 550
586 return ret; 551 spin_lock_bh(&bdi_lock);
552 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
553 spin_unlock_bh(&bdi_lock);
554
555 trace_writeback_bdi_register(bdi);
556 return 0;
587} 557}
588EXPORT_SYMBOL(bdi_register); 558EXPORT_SYMBOL(bdi_register);
589 559
@@ -598,31 +568,29 @@ EXPORT_SYMBOL(bdi_register_dev);
598 */ 568 */
599static void bdi_wb_shutdown(struct backing_dev_info *bdi) 569static void bdi_wb_shutdown(struct backing_dev_info *bdi)
600{ 570{
601 struct bdi_writeback *wb;
602
603 if (!bdi_cap_writeback_dirty(bdi)) 571 if (!bdi_cap_writeback_dirty(bdi))
604 return; 572 return;
605 573
606 /* 574 /*
607 * If setup is pending, wait for that to complete first 575 * Make sure nobody finds us on the bdi_list anymore
608 */ 576 */
609 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 577 bdi_remove_from_list(bdi);
610 TASK_UNINTERRUPTIBLE);
611 578
612 /* 579 /*
613 * Make sure nobody finds us on the bdi_list anymore 580 * If setup is pending, wait for that to complete first
614 */ 581 */
615 bdi_remove_from_list(bdi); 582 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
583 TASK_UNINTERRUPTIBLE);
616 584
617 /* 585 /*
618 * Finally, kill the kernel threads. We don't need to be RCU 586 * Finally, kill the kernel thread. We don't need to be RCU
619 * safe anymore, since the bdi is gone from visibility. Force 587 * safe anymore, since the bdi is gone from visibility. Force
620 * unfreeze of the thread before calling kthread_stop(), otherwise 588 * unfreeze of the thread before calling kthread_stop(), otherwise
621 * it would never exet if it is currently stuck in the refrigerator. 589 * it would never exet if it is currently stuck in the refrigerator.
622 */ 590 */
623 list_for_each_entry(wb, &bdi->wb_list, list) { 591 if (bdi->wb.task) {
624 thaw_process(wb->task); 592 thaw_process(bdi->wb.task);
625 kthread_stop(wb->task); 593 kthread_stop(bdi->wb.task);
626 } 594 }
627} 595}
628 596
@@ -644,7 +612,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
644void bdi_unregister(struct backing_dev_info *bdi) 612void bdi_unregister(struct backing_dev_info *bdi)
645{ 613{
646 if (bdi->dev) { 614 if (bdi->dev) {
615 trace_writeback_bdi_unregister(bdi);
647 bdi_prune_sb(bdi); 616 bdi_prune_sb(bdi);
617 del_timer_sync(&bdi->wb.wakeup_timer);
648 618
649 if (!bdi_cap_flush_forker(bdi)) 619 if (!bdi_cap_flush_forker(bdi))
650 bdi_wb_shutdown(bdi); 620 bdi_wb_shutdown(bdi);
@@ -655,6 +625,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
655} 625}
656EXPORT_SYMBOL(bdi_unregister); 626EXPORT_SYMBOL(bdi_unregister);
657 627
628static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
629{
630 memset(wb, 0, sizeof(*wb));
631
632 wb->bdi = bdi;
633 wb->last_old_flush = jiffies;
634 INIT_LIST_HEAD(&wb->b_dirty);
635 INIT_LIST_HEAD(&wb->b_io);
636 INIT_LIST_HEAD(&wb->b_more_io);
637 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
638}
639
658int bdi_init(struct backing_dev_info *bdi) 640int bdi_init(struct backing_dev_info *bdi)
659{ 641{
660 int i, err; 642 int i, err;
@@ -665,9 +647,7 @@ int bdi_init(struct backing_dev_info *bdi)
665 bdi->max_ratio = 100; 647 bdi->max_ratio = 100;
666 bdi->max_prop_frac = PROP_FRAC_BASE; 648 bdi->max_prop_frac = PROP_FRAC_BASE;
667 spin_lock_init(&bdi->wb_lock); 649 spin_lock_init(&bdi->wb_lock);
668 INIT_RCU_HEAD(&bdi->rcu_head);
669 INIT_LIST_HEAD(&bdi->bdi_list); 650 INIT_LIST_HEAD(&bdi->bdi_list);
670 INIT_LIST_HEAD(&bdi->wb_list);
671 INIT_LIST_HEAD(&bdi->work_list); 651 INIT_LIST_HEAD(&bdi->work_list);
672 652
673 bdi_wb_init(&bdi->wb, bdi); 653 bdi_wb_init(&bdi->wb, bdi);
@@ -749,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
749 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
750 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
751 }; 731 };
732static atomic_t nr_bdi_congested[2];
752 733
753void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 734void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
754{ 735{
@@ -756,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
756 wait_queue_head_t *wqh = &congestion_wqh[sync]; 737 wait_queue_head_t *wqh = &congestion_wqh[sync];
757 738
758 bit = sync ? BDI_sync_congested : BDI_async_congested; 739 bit = sync ? BDI_sync_congested : BDI_async_congested;
759 clear_bit(bit, &bdi->state); 740 if (test_and_clear_bit(bit, &bdi->state))
741 atomic_dec(&nr_bdi_congested[sync]);
760 smp_mb__after_clear_bit(); 742 smp_mb__after_clear_bit();
761 if (waitqueue_active(wqh)) 743 if (waitqueue_active(wqh))
762 wake_up(wqh); 744 wake_up(wqh);
@@ -768,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
768 enum bdi_state bit; 750 enum bdi_state bit;
769 751
770 bit = sync ? BDI_sync_congested : BDI_async_congested; 752 bit = sync ? BDI_sync_congested : BDI_async_congested;
771 set_bit(bit, &bdi->state); 753 if (!test_and_set_bit(bit, &bdi->state))
754 atomic_inc(&nr_bdi_congested[sync]);
772} 755}
773EXPORT_SYMBOL(set_bdi_congested); 756EXPORT_SYMBOL(set_bdi_congested);
774 757
@@ -784,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
784long congestion_wait(int sync, long timeout) 767long congestion_wait(int sync, long timeout)
785{ 768{
786 long ret; 769 long ret;
770 unsigned long start = jiffies;
787 DEFINE_WAIT(wait); 771 DEFINE_WAIT(wait);
788 wait_queue_head_t *wqh = &congestion_wqh[sync]; 772 wait_queue_head_t *wqh = &congestion_wqh[sync];
789 773
790 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 774 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
791 ret = io_schedule_timeout(timeout); 775 ret = io_schedule_timeout(timeout);
792 finish_wait(wqh, &wait); 776 finish_wait(wqh, &wait);
777
778 trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
779 jiffies_to_usecs(jiffies - start));
780
793 return ret; 781 return ret;
794} 782}
795EXPORT_SYMBOL(congestion_wait); 783EXPORT_SYMBOL(congestion_wait);
796 784
785/**
786 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
787 * @zone: A zone to check if it is heavily congested
788 * @sync: SYNC or ASYNC IO
789 * @timeout: timeout in jiffies
790 *
791 * In the event of a congested backing_dev (any backing_dev) and the given
792 * @zone has experienced recent congestion, this waits for up to @timeout
793 * jiffies for either a BDI to exit congestion of the given @sync queue
794 * or a write to complete.
795 *
796 * In the absense of zone congestion, cond_resched() is called to yield
797 * the processor if necessary but otherwise does not sleep.
798 *
799 * The return value is 0 if the sleep is for the full timeout. Otherwise,
800 * it is the number of jiffies that were still remaining when the function
801 * returned. return_value == timeout implies the function did not sleep.
802 */
803long wait_iff_congested(struct zone *zone, int sync, long timeout)
804{
805 long ret;
806 unsigned long start = jiffies;
807 DEFINE_WAIT(wait);
808 wait_queue_head_t *wqh = &congestion_wqh[sync];
809
810 /*
811 * If there is no congestion, or heavy congestion is not being
812 * encountered in the current zone, yield if necessary instead
813 * of sleeping on the congestion queue
814 */
815 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
816 !zone_is_reclaim_congested(zone)) {
817 cond_resched();
818
819 /* In case we scheduled, work out time remaining */
820 ret = timeout - (jiffies - start);
821 if (ret < 0)
822 ret = 0;
823
824 goto out;
825 }
826
827 /* Sleep until uncongested or a write happens */
828 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
829 ret = io_schedule_timeout(timeout);
830 finish_wait(wqh, &wait);
831
832out:
833 trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
834 jiffies_to_usecs(jiffies - start));
835
836 return ret;
837}
838EXPORT_SYMBOL(wait_iff_congested);
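The backing-dev changes above introduce wait_iff_congested(), which sleeps only when some BDI is congested and the given zone has recently been flagged as congested; otherwise it merely yields the CPU. A minimal caller sketch, not part of the patch (the helper name and debug message are illustrative):

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/mmzone.h>

/* Illustrative reclaim-path throttle: back off for up to HZ/10 jiffies,
 * but only if writeback is congested *and* this zone was recently marked
 * congested; otherwise wait_iff_congested() just cond_resched()s. */
static void throttle_if_congested(struct zone *zone)
{
	long left = wait_iff_congested(zone, BLK_RW_ASYNC, HZ / 10);

	if (left == HZ / 10)
		pr_debug("no congestion, did not sleep\n");
}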
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a5499..13b0caa9793 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h>
18 19
19#include <asm/bug.h> 20#include <asm/bug.h>
20#include <asm/io.h> 21#include <asm/io.h>
@@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
434 unsigned long size) 435 unsigned long size)
435{ 436{
436#ifdef CONFIG_NO_BOOTMEM 437#ifdef CONFIG_NO_BOOTMEM
437 free_early(physaddr, physaddr + size); 438 kmemleak_free_part(__va(physaddr), size);
439 memblock_x86_free_range(physaddr, physaddr + size);
438#else 440#else
439 unsigned long start, end; 441 unsigned long start, end;
440 442
@@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
459void __init free_bootmem(unsigned long addr, unsigned long size) 461void __init free_bootmem(unsigned long addr, unsigned long size)
460{ 462{
461#ifdef CONFIG_NO_BOOTMEM 463#ifdef CONFIG_NO_BOOTMEM
462 free_early(addr, addr + size); 464 kmemleak_free_part(__va(addr), size);
465 memblock_x86_free_range(addr, addr + size);
463#else 466#else
464 unsigned long start, end; 467 unsigned long start, end;
465 468
@@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
526} 529}
527 530
528#ifndef CONFIG_NO_BOOTMEM 531#ifndef CONFIG_NO_BOOTMEM
532int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
533 int flags)
534{
535 return reserve_bootmem(phys, len, flags);
536}
537
529static unsigned long __init align_idx(struct bootmem_data *bdata, 538static unsigned long __init align_idx(struct bootmem_data *bdata,
530 unsigned long idx, unsigned long step) 539 unsigned long idx, unsigned long step)
531{ 540{
diff --git a/mm/bounce.c b/mm/bounce.c
index 13b6dad1eed..1481de68184 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
116 */ 116 */
117 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; 117 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
118 118
119 flush_dcache_page(tovec->bv_page);
120 bounce_copy_vec(tovec, vfrom); 119 bounce_copy_vec(tovec, vfrom);
120 flush_dcache_page(tovec->bv_page);
121 } 121 }
122} 122}
123 123
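The bounce.c fix reorders the dcache flush so it happens after the data has been copied into the destination page. A minimal sketch of the same copy-then-flush pattern, not taken from the patch (the helper name is made up; a lowmem page is assumed so page_address() is valid):

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy @len bytes into @page at @offset, then flush the kernel dcache so
 * user mappings of the page observe the new bytes on architectures with
 * aliasing data caches. Flushing before the copy (as the old code did)
 * leaves the freshly written cache lines unflushed. */
static void copy_then_flush(struct page *page, unsigned int offset,
			    const void *buf, size_t len)
{
	memcpy(page_address(page) + offset, buf, len);
	flush_dcache_page(page);
}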
diff --git a/mm/compaction.c b/mm/compaction.c
index 94cce51b0b3..4d709ee5901 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
214/* Similar to reclaim, but different enough that they don't share logic */ 214/* Similar to reclaim, but different enough that they don't share logic */
215static bool too_many_isolated(struct zone *zone) 215static bool too_many_isolated(struct zone *zone)
216{ 216{
217 217 unsigned long active, inactive, isolated;
218 unsigned long inactive, isolated;
219 218
220 inactive = zone_page_state(zone, NR_INACTIVE_FILE) + 219 inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
221 zone_page_state(zone, NR_INACTIVE_ANON); 220 zone_page_state(zone, NR_INACTIVE_ANON);
221 active = zone_page_state(zone, NR_ACTIVE_FILE) +
222 zone_page_state(zone, NR_ACTIVE_ANON);
222 isolated = zone_page_state(zone, NR_ISOLATED_FILE) + 223 isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
223 zone_page_state(zone, NR_ISOLATED_ANON); 224 zone_page_state(zone, NR_ISOLATED_ANON);
224 225
225 return isolated > inactive; 226 return isolated > (inactive + active) / 2;
226} 227}
227 228
228/* 229/*
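The compaction change only throttles isolation once the number of isolated pages exceeds half of all LRU pages (active plus inactive) in the zone, instead of comparing against the inactive count alone. A worked example with made-up numbers:

/* Illustrative numbers: inactive = 500, active = 9500, so the LRU holds
 * 10000 pages in total.
 *   old check: throttle once isolated >  500 (trips almost immediately
 *              whenever the inactive list happens to be nearly empty)
 *   new check: throttle once isolated > (500 + 9500) / 2 = 5000
 */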
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f5..4df2de77e06 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
311 size_t offset; 311 size_t offset;
312 void *retval; 312 void *retval;
313 313
314 might_sleep_if(mem_flags & __GFP_WAIT);
315
314 spin_lock_irqsave(&pool->lock, flags); 316 spin_lock_irqsave(&pool->lock, flags);
315 restart: 317 restart:
316 list_for_each_entry(page, &pool->page_list, page_list) { 318 list_for_each_entry(page, &pool->page_list, page_list) {
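The dmapool change adds a might_sleep_if() annotation, documenting that dma_pool_alloc() may block when __GFP_WAIT is part of mem_flags. A hedged usage sketch (the pool argument and function name are illustrative):

#include <linux/dmapool.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* GFP_KERNEL includes __GFP_WAIT, so this call may sleep and the new
 * might_sleep_if() will warn if it is made from atomic context; callers
 * in IRQ or spinlocked context must pass GFP_ATOMIC instead. */
static int dmapool_example(struct dma_pool *pool)
{
	dma_addr_t dma;
	void *cpu = dma_pool_alloc(pool, GFP_KERNEL, &dma);

	if (!cpu)
		return -ENOMEM;
	dma_pool_free(pool, cpu, dma);
	return 0;
}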
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9..75572b5f237 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page)
612 TASK_UNINTERRUPTIBLE); 612 TASK_UNINTERRUPTIBLE);
613} 613}
614 614
615int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
616 unsigned int flags)
617{
618 if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
619 __lock_page(page);
620 return 1;
621 } else {
622 up_read(&mm->mmap_sem);
623 wait_on_page_locked(page);
624 return 0;
625 }
626}
627
615/** 628/**
616 * find_get_page - find and get a page reference 629 * find_get_page - find and get a page reference
617 * @mapping: the address_space to search 630 * @mapping: the address_space to search
@@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1539 * waiting for the lock. 1552 * waiting for the lock.
1540 */ 1553 */
1541 do_async_mmap_readahead(vma, ra, file, page, offset); 1554 do_async_mmap_readahead(vma, ra, file, page, offset);
1542 lock_page(page);
1543
1544 /* Did it get truncated? */
1545 if (unlikely(page->mapping != mapping)) {
1546 unlock_page(page);
1547 put_page(page);
1548 goto no_cached_page;
1549 }
1550 } else { 1555 } else {
1551 /* No page in the page cache at all */ 1556 /* No page in the page cache at all */
1552 do_sync_mmap_readahead(vma, ra, file, offset); 1557 do_sync_mmap_readahead(vma, ra, file, offset);
1553 count_vm_event(PGMAJFAULT); 1558 count_vm_event(PGMAJFAULT);
1554 ret = VM_FAULT_MAJOR; 1559 ret = VM_FAULT_MAJOR;
1555retry_find: 1560retry_find:
1556 page = find_lock_page(mapping, offset); 1561 page = find_get_page(mapping, offset);
1557 if (!page) 1562 if (!page)
1558 goto no_cached_page; 1563 goto no_cached_page;
1559 } 1564 }
1560 1565
1566 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
1567 return ret | VM_FAULT_RETRY;
1568
1569 /* Did it get truncated? */
1570 if (unlikely(page->mapping != mapping)) {
1571 unlock_page(page);
1572 put_page(page);
1573 goto retry_find;
1574 }
1575 VM_BUG_ON(page->index != offset);
1576
1561 /* 1577 /*
1562 * We have a locked page in the page cache, now we need to check 1578 * We have a locked page in the page cache, now we need to check
1563 * that it's up-to-date. If not, it is going to be due to an error. 1579 * that it's up-to-date. If not, it is going to be due to an error.
@@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2177 } 2193 }
2178 2194
2179 if (written > 0) { 2195 if (written > 0) {
2180 loff_t end = pos + written; 2196 pos += written;
2181 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2197 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2182 i_size_write(inode, end); 2198 i_size_write(inode, pos);
2183 mark_inode_dirty(inode); 2199 mark_inode_dirty(inode);
2184 } 2200 }
2185 *ppos = end; 2201 *ppos = pos;
2186 } 2202 }
2187out: 2203out:
2188 return written; 2204 return written;
@@ -2238,14 +2254,12 @@ static ssize_t generic_perform_write(struct file *file,
2238 2254
2239 do { 2255 do {
2240 struct page *page; 2256 struct page *page;
2241 pgoff_t index; /* Pagecache index for current page */
2242 unsigned long offset; /* Offset into pagecache page */ 2257 unsigned long offset; /* Offset into pagecache page */
2243 unsigned long bytes; /* Bytes to write to page */ 2258 unsigned long bytes; /* Bytes to write to page */
2244 size_t copied; /* Bytes copied from user */ 2259 size_t copied; /* Bytes copied from user */
2245 void *fsdata; 2260 void *fsdata;
2246 2261
2247 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2262 offset = (pos & (PAGE_CACHE_SIZE - 1));
2248 index = pos >> PAGE_CACHE_SHIFT;
2249 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2263 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2250 iov_iter_count(i)); 2264 iov_iter_count(i));
2251 2265
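The filemap changes let filemap_fault() drop mmap_sem rather than block on the page lock when the caller passes FAULT_FLAG_ALLOW_RETRY, signalling this with VM_FAULT_RETRY. A hedged sketch of the consuming side, which is not part of this hunk: roughly how an arch fault handler of this era would opt in (the function name is illustrative, and the vma lookup that real code redoes under mmap_sem is elided):

#include <linux/mm.h>
#include <linux/sched.h>

/* If VM_FAULT_RETRY comes back, mmap_sem has already been released by
 * __lock_page_or_retry(), so just take it again and retry the fault once,
 * this time without allowing another retry. */
static void fault_with_retry(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long address)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY;
	int fault;

retry:
	down_read(&mm->mmap_sem);
	/* real code re-finds the vma here, under mmap_sem */
	fault = handle_mm_fault(mm, vma, address, flags);
	if ((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		goto retry;
	}
	up_read(&mm->mmap_sem);
}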
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dacf90a..ec520c7b28d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
125{ 125{
126 struct mm_struct *mm = current->mm; 126 struct mm_struct *mm = current->mm;
127 struct address_space *mapping; 127 struct address_space *mapping;
128 unsigned long end = start + size;
129 struct vm_area_struct *vma; 128 struct vm_area_struct *vma;
130 int err = -EINVAL; 129 int err = -EINVAL;
131 int has_write_lock = 0; 130 int has_write_lock = 0;
@@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
142 if (start + size <= start) 141 if (start + size <= start)
143 return err; 142 return err;
144 143
144 /* Does pgoff wrap? */
145 if (pgoff + (size >> PAGE_SHIFT) < pgoff)
146 return err;
147
145 /* Can we represent this offset inside this architecture's pte's? */ 148 /* Can we represent this offset inside this architecture's pte's? */
146#if PTE_FILE_MAX_BITS < BITS_PER_LONG 149#if PTE_FILE_MAX_BITS < BITS_PER_LONG
147 if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) 150 if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
168 if (!(vma->vm_flags & VM_CAN_NONLINEAR)) 171 if (!(vma->vm_flags & VM_CAN_NONLINEAR))
169 goto out; 172 goto out;
170 173
171 if (end <= start || start < vma->vm_start || end > vma->vm_end) 174 if (start < vma->vm_start || start + size > vma->vm_end)
172 goto out; 175 goto out;
173 176
174 /* Must set VM_NONLINEAR before any pages are populated. */ 177 /* Must set VM_NONLINEAR before any pages are populated. */
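The fremap fix rejects remap_file_pages() requests whose pgoff plus page count wraps around the top of the range. A small sketch of the same wrap test with an illustrative 32-bit example (the helper name is made up):

#include <linux/mm.h>

/* E.g. on 32-bit, pgoff = 0xfffff000 with size = 16MB (0x1000 pages)
 * wraps to 0, which is < pgoff, so the request must be rejected. */
static bool pgoff_wraps(unsigned long pgoff, unsigned long size)
{
	return pgoff + (size >> PAGE_SHIFT) < pgoff;
}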
diff --git a/mm/highmem.c b/mm/highmem.c
index 66baa20f78f..693394daa2e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,8 +26,14 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/kgdb.h>
29#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
30 31
32
33#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34DEFINE_PER_CPU(int, __kmap_atomic_idx);
35#endif
36
31/* 37/*
32 * Virtual_count is not a pure "count". 38 * Virtual_count is not a pure "count".
33 * 0 means that it is not mapped, and has not been mapped 39 * 0 means that it is not mapped, and has not been mapped
@@ -41,6 +47,9 @@
41unsigned long totalhigh_pages __read_mostly; 47unsigned long totalhigh_pages __read_mostly;
42EXPORT_SYMBOL(totalhigh_pages); 48EXPORT_SYMBOL(totalhigh_pages);
43 49
50
51EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
52
44unsigned int nr_free_highpages (void) 53unsigned int nr_free_highpages (void)
45{ 54{
46 pg_data_t *pgdat; 55 pg_data_t *pgdat;
@@ -421,55 +430,3 @@ void __init page_address_init(void)
421} 430}
422 431
423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 432#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
424
425#ifdef CONFIG_DEBUG_HIGHMEM
426
427void debug_kmap_atomic(enum km_type type)
428{
429 static int warn_count = 10;
430
431 if (unlikely(warn_count < 0))
432 return;
433
434 if (unlikely(in_interrupt())) {
435 if (in_nmi()) {
436 if (type != KM_NMI && type != KM_NMI_PTE) {
437 WARN_ON(1);
438 warn_count--;
439 }
440 } else if (in_irq()) {
441 if (type != KM_IRQ0 && type != KM_IRQ1 &&
442 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
443 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
444 WARN_ON(1);
445 warn_count--;
446 }
447 } else if (!irqs_disabled()) { /* softirq */
448 if (type != KM_IRQ0 && type != KM_IRQ1 &&
449 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
450 type != KM_SKB_SUNRPC_DATA &&
451 type != KM_SKB_DATA_SOFTIRQ &&
452 type != KM_BOUNCE_READ) {
453 WARN_ON(1);
454 warn_count--;
455 }
456 }
457 }
458
459 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
460 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
461 type == KM_IRQ_PTE || type == KM_NMI ||
462 type == KM_NMI_PTE ) {
463 if (!irqs_disabled()) {
464 WARN_ON(1);
465 warn_count--;
466 }
467 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
468 if (irq_count() == 0 && !irqs_disabled()) {
469 WARN_ON(1);
470 warn_count--;
471 }
472 }
473}
474
475#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009db..c4a3558589a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/sysfs.h> 19#include <linux/sysfs.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rmap.h>
22#include <linux/swap.h>
23#include <linux/swapops.h>
21 24
22#include <asm/page.h> 25#include <asm/page.h>
23#include <asm/pgtable.h> 26#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
220 (vma->vm_pgoff >> huge_page_order(h)); 223 (vma->vm_pgoff >> huge_page_order(h));
221} 224}
222 225
226pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
227 unsigned long address)
228{
229 return vma_hugecache_offset(hstate_vma(vma), vma, address);
230}
231
223/* 232/*
224 * Return the size of the pages allocated when backing a VMA. In the majority 233 * Return the size of the pages allocated when backing a VMA. In the majority
225 * cases this will be same size as used by the page table entries. 234 * cases this will be same size as used by the page table entries.
@@ -414,14 +423,14 @@ static void clear_huge_page(struct page *page,
414 } 423 }
415} 424}
416 425
417static void copy_gigantic_page(struct page *dst, struct page *src, 426static void copy_user_gigantic_page(struct page *dst, struct page *src,
418 unsigned long addr, struct vm_area_struct *vma) 427 unsigned long addr, struct vm_area_struct *vma)
419{ 428{
420 int i; 429 int i;
421 struct hstate *h = hstate_vma(vma); 430 struct hstate *h = hstate_vma(vma);
422 struct page *dst_base = dst; 431 struct page *dst_base = dst;
423 struct page *src_base = src; 432 struct page *src_base = src;
424 might_sleep(); 433
425 for (i = 0; i < pages_per_huge_page(h); ) { 434 for (i = 0; i < pages_per_huge_page(h); ) {
426 cond_resched(); 435 cond_resched();
427 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -431,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
431 src = mem_map_next(src, src_base, i); 440 src = mem_map_next(src, src_base, i);
432 } 441 }
433} 442}
434static void copy_huge_page(struct page *dst, struct page *src, 443
444static void copy_user_huge_page(struct page *dst, struct page *src,
435 unsigned long addr, struct vm_area_struct *vma) 445 unsigned long addr, struct vm_area_struct *vma)
436{ 446{
437 int i; 447 int i;
438 struct hstate *h = hstate_vma(vma); 448 struct hstate *h = hstate_vma(vma);
439 449
440 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
441 copy_gigantic_page(dst, src, addr, vma); 451 copy_user_gigantic_page(dst, src, addr, vma);
442 return; 452 return;
443 } 453 }
444 454
@@ -449,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
449 } 459 }
450} 460}
451 461
462static void copy_gigantic_page(struct page *dst, struct page *src)
463{
464 int i;
465 struct hstate *h = page_hstate(src);
466 struct page *dst_base = dst;
467 struct page *src_base = src;
468
469 for (i = 0; i < pages_per_huge_page(h); ) {
470 cond_resched();
471 copy_highpage(dst, src);
472
473 i++;
474 dst = mem_map_next(dst, dst_base, i);
475 src = mem_map_next(src, src_base, i);
476 }
477}
478
479void copy_huge_page(struct page *dst, struct page *src)
480{
481 int i;
482 struct hstate *h = page_hstate(src);
483
484 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
485 copy_gigantic_page(dst, src);
486 return;
487 }
488
489 might_sleep();
490 for (i = 0; i < pages_per_huge_page(h); i++) {
491 cond_resched();
492 copy_highpage(dst + i, src + i);
493 }
494}
495
452static void enqueue_huge_page(struct hstate *h, struct page *page) 496static void enqueue_huge_page(struct hstate *h, struct page *page)
453{ 497{
454 int nid = page_to_nid(page); 498 int nid = page_to_nid(page);
@@ -457,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
457 h->free_huge_pages_node[nid]++; 501 h->free_huge_pages_node[nid]++;
458} 502}
459 503
504static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
505{
506 struct page *page;
507
508 if (list_empty(&h->hugepage_freelists[nid]))
509 return NULL;
510 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
511 list_del(&page->lru);
512 set_page_refcounted(page);
513 h->free_huge_pages--;
514 h->free_huge_pages_node[nid]--;
515 return page;
516}
517
460static struct page *dequeue_huge_page_vma(struct hstate *h, 518static struct page *dequeue_huge_page_vma(struct hstate *h,
461 struct vm_area_struct *vma, 519 struct vm_area_struct *vma,
462 unsigned long address, int avoid_reserve) 520 unsigned long address, int avoid_reserve)
463{ 521{
464 int nid;
465 struct page *page = NULL; 522 struct page *page = NULL;
466 struct mempolicy *mpol; 523 struct mempolicy *mpol;
467 nodemask_t *nodemask; 524 nodemask_t *nodemask;
@@ -487,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
487 544
488 for_each_zone_zonelist_nodemask(zone, z, zonelist, 545 for_each_zone_zonelist_nodemask(zone, z, zonelist,
489 MAX_NR_ZONES - 1, nodemask) { 546 MAX_NR_ZONES - 1, nodemask) {
490 nid = zone_to_nid(zone); 547 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
491 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 548 page = dequeue_huge_page_node(h, zone_to_nid(zone));
492 !list_empty(&h->hugepage_freelists[nid])) { 549 if (page) {
493 page = list_entry(h->hugepage_freelists[nid].next, 550 if (!avoid_reserve)
494 struct page, lru); 551 decrement_hugepage_resv_vma(h, vma);
495 list_del(&page->lru); 552 break;
496 h->free_huge_pages--; 553 }
497 h->free_huge_pages_node[nid]--;
498
499 if (!avoid_reserve)
500 decrement_hugepage_resv_vma(h, vma);
501
502 break;
503 } 554 }
504 } 555 }
505err: 556err:
@@ -552,6 +603,7 @@ static void free_huge_page(struct page *page)
552 set_page_private(page, 0); 603 set_page_private(page, 0);
553 page->mapping = NULL; 604 page->mapping = NULL;
554 BUG_ON(page_count(page)); 605 BUG_ON(page_count(page));
606 BUG_ON(page_mapcount(page));
555 INIT_LIST_HEAD(&page->lru); 607 INIT_LIST_HEAD(&page->lru);
556 608
557 spin_lock(&hugetlb_lock); 609 spin_lock(&hugetlb_lock);
@@ -605,6 +657,8 @@ int PageHuge(struct page *page)
605 return dtor == free_huge_page; 657 return dtor == free_huge_page;
606} 658}
607 659
660EXPORT_SYMBOL_GPL(PageHuge);
661
608static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 662static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
609{ 663{
610 struct page *page; 664 struct page *page;
@@ -758,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
758 return ret; 812 return ret;
759} 813}
760 814
761static struct page *alloc_buddy_huge_page(struct hstate *h, 815static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
762 struct vm_area_struct *vma, unsigned long address)
763{ 816{
764 struct page *page; 817 struct page *page;
765 unsigned int nid; 818 unsigned int r_nid;
766 819
767 if (h->order >= MAX_ORDER) 820 if (h->order >= MAX_ORDER)
768 return NULL; 821 return NULL;
@@ -800,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
800 } 853 }
801 spin_unlock(&hugetlb_lock); 854 spin_unlock(&hugetlb_lock);
802 855
803 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 856 if (nid == NUMA_NO_NODE)
804 __GFP_REPEAT|__GFP_NOWARN, 857 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
805 huge_page_order(h)); 858 __GFP_REPEAT|__GFP_NOWARN,
859 huge_page_order(h));
860 else
861 page = alloc_pages_exact_node(nid,
862 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
863 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
806 864
807 if (page && arch_prepare_hugepage(page)) { 865 if (page && arch_prepare_hugepage(page)) {
808 __free_pages(page, huge_page_order(h)); 866 __free_pages(page, huge_page_order(h));
@@ -811,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
811 869
812 spin_lock(&hugetlb_lock); 870 spin_lock(&hugetlb_lock);
813 if (page) { 871 if (page) {
814 /* 872 r_nid = page_to_nid(page);
815 * This page is now managed by the hugetlb allocator and has
816 * no users -- drop the buddy allocator's reference.
817 */
818 put_page_testzero(page);
819 VM_BUG_ON(page_count(page));
820 nid = page_to_nid(page);
821 set_compound_page_dtor(page, free_huge_page); 873 set_compound_page_dtor(page, free_huge_page);
822 /* 874 /*
823 * We incremented the global counters already 875 * We incremented the global counters already
824 */ 876 */
825 h->nr_huge_pages_node[nid]++; 877 h->nr_huge_pages_node[r_nid]++;
826 h->surplus_huge_pages_node[nid]++; 878 h->surplus_huge_pages_node[r_nid]++;
827 __count_vm_event(HTLB_BUDDY_PGALLOC); 879 __count_vm_event(HTLB_BUDDY_PGALLOC);
828 } else { 880 } else {
829 h->nr_huge_pages--; 881 h->nr_huge_pages--;
@@ -836,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
836} 888}
837 889
838/* 890/*
891 * This allocation function is useful in the context where vma is irrelevant.
892 * E.g. soft-offlining uses this function because it only cares physical
893 * address of error page.
894 */
895struct page *alloc_huge_page_node(struct hstate *h, int nid)
896{
897 struct page *page;
898
899 spin_lock(&hugetlb_lock);
900 page = dequeue_huge_page_node(h, nid);
901 spin_unlock(&hugetlb_lock);
902
903 if (!page)
904 page = alloc_buddy_huge_page(h, nid);
905
906 return page;
907}
908
909/*
839 * Increase the hugetlb pool such that it can accomodate a reservation 910 * Increase the hugetlb pool such that it can accomodate a reservation
840 * of size 'delta'. 911 * of size 'delta'.
841 */ 912 */
@@ -859,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
859retry: 930retry:
860 spin_unlock(&hugetlb_lock); 931 spin_unlock(&hugetlb_lock);
861 for (i = 0; i < needed; i++) { 932 for (i = 0; i < needed; i++) {
862 page = alloc_buddy_huge_page(h, NULL, 0); 933 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
863 if (!page) { 934 if (!page)
864 /* 935 /*
865 * We were not able to allocate enough pages to 936 * We were not able to allocate enough pages to
866 * satisfy the entire reservation so we free what 937 * satisfy the entire reservation so we free what
867 * we've allocated so far. 938 * we've allocated so far.
868 */ 939 */
869 spin_lock(&hugetlb_lock);
870 needed = 0;
871 goto free; 940 goto free;
872 }
873 941
874 list_add(&page->lru, &surplus_list); 942 list_add(&page->lru, &surplus_list);
875 } 943 }
@@ -896,31 +964,31 @@ retry:
896 needed += allocated; 964 needed += allocated;
897 h->resv_huge_pages += delta; 965 h->resv_huge_pages += delta;
898 ret = 0; 966 ret = 0;
899free: 967
968 spin_unlock(&hugetlb_lock);
900 /* Free the needed pages to the hugetlb pool */ 969 /* Free the needed pages to the hugetlb pool */
901 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 970 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
902 if ((--needed) < 0) 971 if ((--needed) < 0)
903 break; 972 break;
904 list_del(&page->lru); 973 list_del(&page->lru);
974 /*
975 * This page is now managed by the hugetlb allocator and has
976 * no users -- drop the buddy allocator's reference.
977 */
978 put_page_testzero(page);
979 VM_BUG_ON(page_count(page));
905 enqueue_huge_page(h, page); 980 enqueue_huge_page(h, page);
906 } 981 }
907 982
908 /* Free unnecessary surplus pages to the buddy allocator */ 983 /* Free unnecessary surplus pages to the buddy allocator */
984free:
909 if (!list_empty(&surplus_list)) { 985 if (!list_empty(&surplus_list)) {
910 spin_unlock(&hugetlb_lock);
911 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 986 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
912 list_del(&page->lru); 987 list_del(&page->lru);
913 /* 988 put_page(page);
914 * The page has a reference count of zero already, so
915 * call free_huge_page directly instead of using
916 * put_page. This must be done with hugetlb_lock
917 * unlocked which is safe because free_huge_page takes
918 * hugetlb_lock before deciding how to free the page.
919 */
920 free_huge_page(page);
921 } 989 }
922 spin_lock(&hugetlb_lock);
923 } 990 }
991 spin_lock(&hugetlb_lock);
924 992
925 return ret; 993 return ret;
926} 994}
@@ -1040,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1040 spin_unlock(&hugetlb_lock); 1108 spin_unlock(&hugetlb_lock);
1041 1109
1042 if (!page) { 1110 if (!page) {
1043 page = alloc_buddy_huge_page(h, vma, addr); 1111 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1044 if (!page) { 1112 if (!page) {
1045 hugetlb_put_quota(inode->i_mapping, chg); 1113 hugetlb_put_quota(inode->i_mapping, chg);
1046 return ERR_PTR(-VM_FAULT_SIGBUS); 1114 return ERR_PTR(-VM_FAULT_SIGBUS);
1047 } 1115 }
1048 } 1116 }
1049 1117
1050 set_page_refcounted(page);
1051 set_page_private(page, (unsigned long) mapping); 1118 set_page_private(page, (unsigned long) mapping);
1052 1119
1053 vma_commit_reservation(h, vma, addr); 1120 vma_commit_reservation(h, vma, addr);
@@ -2129,6 +2196,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2129 entry = huge_ptep_get(src_pte); 2196 entry = huge_ptep_get(src_pte);
2130 ptepage = pte_page(entry); 2197 ptepage = pte_page(entry);
2131 get_page(ptepage); 2198 get_page(ptepage);
2199 page_dup_rmap(ptepage);
2132 set_huge_pte_at(dst, addr, dst_pte, entry); 2200 set_huge_pte_at(dst, addr, dst_pte, entry);
2133 } 2201 }
2134 spin_unlock(&src->page_table_lock); 2202 spin_unlock(&src->page_table_lock);
@@ -2140,6 +2208,32 @@ nomem:
2140 return -ENOMEM; 2208 return -ENOMEM;
2141} 2209}
2142 2210
2211static int is_hugetlb_entry_migration(pte_t pte)
2212{
2213 swp_entry_t swp;
2214
2215 if (huge_pte_none(pte) || pte_present(pte))
2216 return 0;
2217 swp = pte_to_swp_entry(pte);
2218 if (non_swap_entry(swp) && is_migration_entry(swp)) {
2219 return 1;
2220 } else
2221 return 0;
2222}
2223
2224static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2225{
2226 swp_entry_t swp;
2227
2228 if (huge_pte_none(pte) || pte_present(pte))
2229 return 0;
2230 swp = pte_to_swp_entry(pte);
2231 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
2232 return 1;
2233 } else
2234 return 0;
2235}
2236
2143void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2237void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2144 unsigned long end, struct page *ref_page) 2238 unsigned long end, struct page *ref_page)
2145{ 2239{
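The two helpers added above classify a hugetlb PTE that is neither none nor present by decoding its swap entry: a migration entry means the page is being migrated, a hwpoison entry means it was poisoned. Below is a small standalone sketch of that decision flow using invented plain-C stand-ins for pte_t and swp_entry_t; it mirrors only the control flow, not the real bit encoding.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for pte_t/swp_entry_t, not the kernel encoding. */
enum pte_state { PTE_NONE, PTE_PRESENT, PTE_SWAP };
enum swap_kind { SWAP_NORMAL, SWAP_MIGRATION, SWAP_HWPOISON };

struct fake_pte {
        enum pte_state state;
        enum swap_kind swap;    /* valid only when state == PTE_SWAP */
};

/* Mirrors is_hugetlb_entry_migration(): only a non-none, non-present
 * entry can be a migration entry. */
static bool entry_is_migration(struct fake_pte pte)
{
        if (pte.state == PTE_NONE || pte.state == PTE_PRESENT)
                return false;
        return pte.swap == SWAP_MIGRATION;
}

/* Mirrors is_hugetlb_entry_hwpoisoned(). */
static bool entry_is_hwpoisoned(struct fake_pte pte)
{
        if (pte.state == PTE_NONE || pte.state == PTE_PRESENT)
                return false;
        return pte.swap == SWAP_HWPOISON;
}

int main(void)
{
        struct fake_pte poisoned = { PTE_SWAP, SWAP_HWPOISON };

        printf("migration=%d hwpoisoned=%d\n",
               entry_is_migration(poisoned), entry_is_hwpoisoned(poisoned));
        return 0;
}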
@@ -2198,6 +2292,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2198 if (huge_pte_none(pte)) 2292 if (huge_pte_none(pte))
2199 continue; 2293 continue;
2200 2294
2295 /*
 2296 * HWPoisoned hugepage is already unmapped and its reference dropped
2297 */
2298 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2299 continue;
2300
2201 page = pte_page(pte); 2301 page = pte_page(pte);
2202 if (pte_dirty(pte)) 2302 if (pte_dirty(pte))
2203 set_page_dirty(page); 2303 set_page_dirty(page);
@@ -2207,6 +2307,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2207 flush_tlb_range(vma, start, end); 2307 flush_tlb_range(vma, start, end);
2208 mmu_notifier_invalidate_range_end(mm, start, end); 2308 mmu_notifier_invalidate_range_end(mm, start, end);
2209 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2309 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2310 page_remove_rmap(page);
2210 list_del(&page->lru); 2311 list_del(&page->lru);
2211 put_page(page); 2312 put_page(page);
2212 } 2313 }
@@ -2272,6 +2373,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2272 return 1; 2373 return 1;
2273} 2374}
2274 2375
2376/*
2377 * Hugetlb_cow() should be called with page lock of the original hugepage held.
2378 */
2275static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2379static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2276 unsigned long address, pte_t *ptep, pte_t pte, 2380 unsigned long address, pte_t *ptep, pte_t pte,
2277 struct page *pagecache_page) 2381 struct page *pagecache_page)
@@ -2286,8 +2390,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2286retry_avoidcopy: 2390retry_avoidcopy:
2287 /* If no-one else is actually using this page, avoid the copy 2391 /* If no-one else is actually using this page, avoid the copy
2288 * and just make the page writable */ 2392 * and just make the page writable */
2289 avoidcopy = (page_count(old_page) == 1); 2393 avoidcopy = (page_mapcount(old_page) == 1);
2290 if (avoidcopy) { 2394 if (avoidcopy) {
2395 if (PageAnon(old_page))
2396 page_move_anon_rmap(old_page, vma, address);
2291 set_huge_ptep_writable(vma, address, ptep); 2397 set_huge_ptep_writable(vma, address, ptep);
2292 return 0; 2398 return 0;
2293 } 2399 }
@@ -2338,7 +2444,17 @@ retry_avoidcopy:
2338 return -PTR_ERR(new_page); 2444 return -PTR_ERR(new_page);
2339 } 2445 }
2340 2446
2341 copy_huge_page(new_page, old_page, address, vma); 2447 /*
2448 * When the original hugepage is shared one, it does not have
2449 * anon_vma prepared.
2450 */
2451 if (unlikely(anon_vma_prepare(vma))) {
2452 /* Caller expects lock to be held */
2453 spin_lock(&mm->page_table_lock);
2454 return VM_FAULT_OOM;
2455 }
2456
2457 copy_user_huge_page(new_page, old_page, address, vma);
2342 __SetPageUptodate(new_page); 2458 __SetPageUptodate(new_page);
2343 2459
2344 /* 2460 /*
@@ -2349,11 +2465,19 @@ retry_avoidcopy:
2349 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2465 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2350 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2466 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2351 /* Break COW */ 2467 /* Break COW */
2468 mmu_notifier_invalidate_range_start(mm,
2469 address & huge_page_mask(h),
2470 (address & huge_page_mask(h)) + huge_page_size(h));
2352 huge_ptep_clear_flush(vma, address, ptep); 2471 huge_ptep_clear_flush(vma, address, ptep);
2353 set_huge_pte_at(mm, address, ptep, 2472 set_huge_pte_at(mm, address, ptep,
2354 make_huge_pte(vma, new_page, 1)); 2473 make_huge_pte(vma, new_page, 1));
2474 page_remove_rmap(old_page);
2475 hugepage_add_new_anon_rmap(new_page, vma, address);
2355 /* Make the old page be freed below */ 2476 /* Make the old page be freed below */
2356 new_page = old_page; 2477 new_page = old_page;
2478 mmu_notifier_invalidate_range_end(mm,
2479 address & huge_page_mask(h),
2480 (address & huge_page_mask(h)) + huge_page_size(h));
2357 } 2481 }
2358 page_cache_release(new_page); 2482 page_cache_release(new_page);
2359 page_cache_release(old_page); 2483 page_cache_release(old_page);
@@ -2452,10 +2576,27 @@ retry:
2452 spin_lock(&inode->i_lock); 2576 spin_lock(&inode->i_lock);
2453 inode->i_blocks += blocks_per_huge_page(h); 2577 inode->i_blocks += blocks_per_huge_page(h);
2454 spin_unlock(&inode->i_lock); 2578 spin_unlock(&inode->i_lock);
2579 page_dup_rmap(page);
2455 } else { 2580 } else {
2456 lock_page(page); 2581 lock_page(page);
2457 page->mapping = HUGETLB_POISON; 2582 if (unlikely(anon_vma_prepare(vma))) {
2583 ret = VM_FAULT_OOM;
2584 goto backout_unlocked;
2585 }
2586 hugepage_add_new_anon_rmap(page, vma, address);
2458 } 2587 }
2588 } else {
2589 /*
 2590 * If a memory error occurs between mmap() and fault, some processes
 2591 * don't have a hwpoisoned swap entry for the errored virtual address.
2592 * So we need to block hugepage fault by PG_hwpoison bit check.
2593 */
2594 if (unlikely(PageHWPoison(page))) {
2595 ret = VM_FAULT_HWPOISON |
2596 VM_FAULT_SET_HINDEX(h - hstates);
2597 goto backout_unlocked;
2598 }
2599 page_dup_rmap(page);
2459 } 2600 }
2460 2601
2461 /* 2602 /*
@@ -2507,10 +2648,22 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2507 pte_t *ptep; 2648 pte_t *ptep;
2508 pte_t entry; 2649 pte_t entry;
2509 int ret; 2650 int ret;
2651 struct page *page = NULL;
2510 struct page *pagecache_page = NULL; 2652 struct page *pagecache_page = NULL;
2511 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2653 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2512 struct hstate *h = hstate_vma(vma); 2654 struct hstate *h = hstate_vma(vma);
2513 2655
2656 ptep = huge_pte_offset(mm, address);
2657 if (ptep) {
2658 entry = huge_ptep_get(ptep);
2659 if (unlikely(is_hugetlb_entry_migration(entry))) {
2660 migration_entry_wait(mm, (pmd_t *)ptep, address);
2661 return 0;
2662 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2663 return VM_FAULT_HWPOISON_LARGE |
2664 VM_FAULT_SET_HINDEX(h - hstates);
2665 }
2666
2514 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2667 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2515 if (!ptep) 2668 if (!ptep)
2516 return VM_FAULT_OOM; 2669 return VM_FAULT_OOM;
@@ -2548,6 +2701,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2548 vma, address); 2701 vma, address);
2549 } 2702 }
2550 2703
2704 /*
2705 * hugetlb_cow() requires page locks of pte_page(entry) and
2706 * pagecache_page, so here we need take the former one
2707 * when page != pagecache_page or !pagecache_page.
2708 * Note that locking order is always pagecache_page -> page,
2709 * so no worry about deadlock.
2710 */
2711 page = pte_page(entry);
2712 if (page != pagecache_page)
2713 lock_page(page);
2714
2551 spin_lock(&mm->page_table_lock); 2715 spin_lock(&mm->page_table_lock);
2552 /* Check for a racing update before calling hugetlb_cow */ 2716 /* Check for a racing update before calling hugetlb_cow */
2553 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2717 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2574,6 +2738,7 @@ out_page_table_lock:
2574 unlock_page(pagecache_page); 2738 unlock_page(pagecache_page);
2575 put_page(pagecache_page); 2739 put_page(pagecache_page);
2576 } 2740 }
2741 unlock_page(page);
2577 2742
2578out_mutex: 2743out_mutex:
2579 mutex_unlock(&hugetlb_instantiation_mutex); 2744 mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2785,3 +2950,42 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2785 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2950 hugetlb_put_quota(inode->i_mapping, (chg - freed));
2786 hugetlb_acct_memory(h, -(chg - freed)); 2951 hugetlb_acct_memory(h, -(chg - freed));
2787} 2952}
2953
2954#ifdef CONFIG_MEMORY_FAILURE
2955
2956/* Should be called in hugetlb_lock */
2957static int is_hugepage_on_freelist(struct page *hpage)
2958{
2959 struct page *page;
2960 struct page *tmp;
2961 struct hstate *h = page_hstate(hpage);
2962 int nid = page_to_nid(hpage);
2963
2964 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
2965 if (page == hpage)
2966 return 1;
2967 return 0;
2968}
2969
2970/*
2971 * This function is called from memory failure code.
2972 * Assume the caller holds page lock of the head page.
2973 */
2974int dequeue_hwpoisoned_huge_page(struct page *hpage)
2975{
2976 struct hstate *h = page_hstate(hpage);
2977 int nid = page_to_nid(hpage);
2978 int ret = -EBUSY;
2979
2980 spin_lock(&hugetlb_lock);
2981 if (is_hugepage_on_freelist(hpage)) {
2982 list_del(&hpage->lru);
2983 set_page_refcounted(hpage);
2984 h->free_huge_pages--;
2985 h->free_huge_pages_node[nid]--;
2986 ret = 0;
2987 }
2988 spin_unlock(&hugetlb_lock);
2989 return ret;
2990}
2991#endif
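dequeue_hwpoisoned_huge_page() above only unlinks the poisoned page if it is actually found on the per-node free list, and does both the search and the counter fix-up under hugetlb_lock. A minimal userspace model of that search-then-unlink-under-a-lock pattern follows, with a pthread mutex standing in for hugetlb_lock and a hand-rolled singly linked list standing in for the hstate free list; all names are invented for illustration.

#include <pthread.h>
#include <stdio.h>

/* Invented stand-ins for the hstate free list and hugetlb_lock. */
struct fake_page {
        int id;
        struct fake_page *next;
};

static struct fake_page *free_list;     /* models h->hugepage_freelists[nid] */
static int free_count;                  /* models h->free_huge_pages */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

/* Remove @victim from the free list if (and only if) it is on it.
 * Returns 0 on success, -1 if the page was not free (e.g. still in use). */
static int dequeue_poisoned(struct fake_page *victim)
{
        struct fake_page **pp;
        int ret = -1;

        pthread_mutex_lock(&pool_lock);
        for (pp = &free_list; *pp; pp = &(*pp)->next) {
                if (*pp == victim) {
                        *pp = victim->next;     /* unlink, like list_del() */
                        victim->next = NULL;
                        free_count--;           /* fix up the pool counters */
                        ret = 0;
                        break;
                }
        }
        pthread_mutex_unlock(&pool_lock);
        return ret;
}

int main(void)
{
        struct fake_page a = { 1, NULL }, b = { 2, &a };
        int ret;

        free_list = &b;
        free_count = 2;
        ret = dequeue_poisoned(&b);
        printf("first dequeue: %d, %d pages left free\n", ret, free_count);
        printf("second dequeue: %d (page no longer free)\n", dequeue_poisoned(&b));
        return 0;
}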
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1..0948f1072d6 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h> 6#include <linux/swap.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/hugetlb.h>
8#include "internal.h" 9#include "internal.h"
9 10
10static struct dentry *hwpoison_dir; 11static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
13{ 14{
14 unsigned long pfn = val; 15 unsigned long pfn = val;
15 struct page *p; 16 struct page *p;
17 struct page *hpage;
16 int err; 18 int err;
17 19
18 if (!capable(CAP_SYS_ADMIN)) 20 if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
24 return -ENXIO; 26 return -ENXIO;
25 27
26 p = pfn_to_page(pfn); 28 p = pfn_to_page(pfn);
29 hpage = compound_head(p);
27 /* 30 /*
28 * This implies unable to support free buddy pages. 31 * This implies unable to support free buddy pages.
29 */ 32 */
30 if (!get_page_unless_zero(p)) 33 if (!get_page_unless_zero(hpage))
31 return 0; 34 return 0;
32 35
33 if (!PageLRU(p)) 36 if (!PageLRU(p) && !PageHuge(p))
34 shake_page(p, 0); 37 shake_page(p, 0);
35 /* 38 /*
36 * This implies unable to support non-LRU pages. 39 * This implies unable to support non-LRU pages.
37 */ 40 */
38 if (!PageLRU(p)) 41 if (!PageLRU(p) && !PageHuge(p))
39 return 0; 42 return 0;
40 43
41 /* 44 /*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
44 * We temporarily take page lock for try_get_mem_cgroup_from_page(). 47 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock. 48 * __memory_failure() will redo the check reliably inside page lock.
46 */ 49 */
47 lock_page(p); 50 lock_page(hpage);
48 err = hwpoison_filter(p); 51 err = hwpoison_filter(hpage);
49 unlock_page(p); 52 unlock_page(hpage);
50 if (err) 53 if (err)
51 return 0; 54 return 0;
52 55
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da966..1d29cdfe8eb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
7 7
8#include <asm/atomic.h> 8#include <asm/atomic.h>
9#include <asm/pgtable.h> 9#include <asm/pgtable.h>
10#include <asm/mmu.h>
11
12#ifndef INIT_MM_CONTEXT
13#define INIT_MM_CONTEXT(name)
14#endif
10 15
11struct mm_struct init_mm = { 16struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT, 17 .mm_rb = RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL, 24 .cpu_vm_mask = CPU_MASK_ALL,
25 INIT_MM_CONTEXT(init_mm)
20}; 26};
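The init-mm.c hunk relies on a common idiom: generic code supplies an empty default for INIT_MM_CONTEXT() that an architecture may override (here via <asm/mmu.h>) before the static initializer is expanded. The self-contained fragment below illustrates that overridable-default-macro pattern with made-up names; it is not the kernel header.

#include <stdio.h>

/* An "architecture" header could define WIDGET_ARCH_INIT(name) before this
 * point; otherwise the generic code falls back to an empty expansion,
 * exactly like the INIT_MM_CONTEXT() default added above. */
#ifndef WIDGET_ARCH_INIT
#define WIDGET_ARCH_INIT(name)
#endif

struct widget {
        int id;
        int arch_flags;         /* only some "architectures" would set this */
};

/* The trailing macro expands to nothing unless overridden, so the
 * designated initializer stays valid either way. */
static struct widget init_widget = {
        .id = 1,
        WIDGET_ARCH_INIT(init_widget)
};

int main(void)
{
        printf("id=%d arch_flags=%d\n", init_widget.id, init_widget.arch_flags);
        return 0;
}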
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc..dedb0aff673 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
62 */ 62 */
63static inline unsigned long page_order(struct page *page) 63static inline unsigned long page_order(struct page *page)
64{ 64{
65 VM_BUG_ON(!PageBuddy(page)); 65 /* PageBuddy() must be checked by the caller */
66 return page_private(page); 66 return page_private(page);
67} 67}
68 68
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2c0d032ac89..bd9bc214091 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -211,6 +211,9 @@ static signed long jiffies_scan_wait;
211static int kmemleak_stack_scan = 1; 211static int kmemleak_stack_scan = 1;
212/* protects the memory scanning, parameters and debug/kmemleak file access */ 212/* protects the memory scanning, parameters and debug/kmemleak file access */
213static DEFINE_MUTEX(scan_mutex); 213static DEFINE_MUTEX(scan_mutex);
214/* setting kmemleak=on will set this var, skipping the disable */
215static int kmemleak_skip_disable;
216
214 217
215/* 218/*
216 * Early object allocation/freeing logging. Kmemleak is initialized after the 219 * Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -398,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
398 object = prio_tree_entry(node, struct kmemleak_object, 401 object = prio_tree_entry(node, struct kmemleak_object,
399 tree_node); 402 tree_node);
400 if (!alias && object->pointer != ptr) { 403 if (!alias && object->pointer != ptr) {
401 kmemleak_warn("Found object by alias"); 404 pr_warning("Found object by alias at 0x%08lx\n", ptr);
405 dump_stack();
406 dump_object_info(object);
402 object = NULL; 407 object = NULL;
403 } 408 }
404 } else 409 } else
@@ -695,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color)
695} 700}
696 701
697/* 702/*
698 * Make a object permanently as gray-colored so that it can no longer be 703 * Mark an object permanently as gray-colored so that it can no longer be
699 * reported as a leak. This is used in general to mark a false positive. 704 * reported as a leak. This is used in general to mark a false positive.
700 */ 705 */
701static void make_gray_object(unsigned long ptr) 706static void make_gray_object(unsigned long ptr)
@@ -838,10 +843,19 @@ out:
838 rcu_read_unlock(); 843 rcu_read_unlock();
839} 844}
840 845
841/* 846/**
842 * Memory allocation function callback. This function is called from the 847 * kmemleak_alloc - register a newly allocated object
843 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, 848 * @ptr: pointer to beginning of the object
844 * vmalloc etc.). 849 * @size: size of the object
850 * @min_count: minimum number of references to this object. If during memory
851 * scanning a number of references less than @min_count is found,
852 * the object is reported as a memory leak. If @min_count is 0,
853 * the object is never reported as a leak. If @min_count is -1,
854 * the object is ignored (not scanned and not reported as a leak)
855 * @gfp: kmalloc() flags used for kmemleak internal memory allocations
856 *
857 * This function is called from the kernel allocators when a new object
858 * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
845 */ 859 */
846void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, 860void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
847 gfp_t gfp) 861 gfp_t gfp)
@@ -855,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
855} 869}
856EXPORT_SYMBOL_GPL(kmemleak_alloc); 870EXPORT_SYMBOL_GPL(kmemleak_alloc);
857 871
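The new kernel-doc for kmemleak_alloc() spells out the @min_count semantics: -1 means the object is ignored outright, 0 means it is scanned but never reported, and a positive value is the reference threshold below which a scan flags the object. The tiny helper below is purely an illustration of that rule as documented; kmemleak's real colouring logic lives elsewhere and is more involved.

#include <stdio.h>

/* Illustrative only: classify an object the way the @min_count doc above
 * describes, given how many references a scan found. */
static const char *kmemleak_verdict(int min_count, int refs_found)
{
        if (min_count == -1)
                return "ignored (never scanned or reported)";
        if (min_count == 0)
                return "scanned, never reported";
        return refs_found < min_count ? "reported as a leak" : "ok";
}

int main(void)
{
        printf("%s\n", kmemleak_verdict(1, 0));         /* reported as a leak */
        printf("%s\n", kmemleak_verdict(1, 2));         /* ok */
        printf("%s\n", kmemleak_verdict(0, 0));         /* scanned, never reported */
        printf("%s\n", kmemleak_verdict(-1, 0));        /* ignored */
        return 0;
}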
858/* 872/**
859 * Memory freeing function callback. This function is called from the kernel 873 * kmemleak_free - unregister a previously registered object
860 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 874 * @ptr: pointer to beginning of the object
875 *
876 * This function is called from the kernel allocators when an object (memory
877 * block) is freed (kmem_cache_free, kfree, vfree etc.).
861 */ 878 */
862void __ref kmemleak_free(const void *ptr) 879void __ref kmemleak_free(const void *ptr)
863{ 880{
@@ -870,9 +887,14 @@ void __ref kmemleak_free(const void *ptr)
870} 887}
871EXPORT_SYMBOL_GPL(kmemleak_free); 888EXPORT_SYMBOL_GPL(kmemleak_free);
872 889
873/* 890/**
874 * Partial memory freeing function callback. This function is usually called 891 * kmemleak_free_part - partially unregister a previously registered object
875 * from bootmem allocator when (part of) a memory block is freed. 892 * @ptr: pointer to the beginning or inside the object. This also
893 * represents the start of the range to be freed
894 * @size: size to be unregistered
895 *
896 * This function is called when only a part of a memory block is freed
897 * (usually from the bootmem allocator).
876 */ 898 */
877void __ref kmemleak_free_part(const void *ptr, size_t size) 899void __ref kmemleak_free_part(const void *ptr, size_t size)
878{ 900{
@@ -885,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
885} 907}
886EXPORT_SYMBOL_GPL(kmemleak_free_part); 908EXPORT_SYMBOL_GPL(kmemleak_free_part);
887 909
888/* 910/**
889 * Mark an already allocated memory block as a false positive. This will cause 911 * kmemleak_not_leak - mark an allocated object as false positive
890 * the block to no longer be reported as leak and always be scanned. 912 * @ptr: pointer to beginning of the object
913 *
914 * Calling this function on an object will cause the memory block to no longer
915 * be reported as leak and always be scanned.
891 */ 916 */
892void __ref kmemleak_not_leak(const void *ptr) 917void __ref kmemleak_not_leak(const void *ptr)
893{ 918{
@@ -900,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr)
900} 925}
901EXPORT_SYMBOL(kmemleak_not_leak); 926EXPORT_SYMBOL(kmemleak_not_leak);
902 927
903/* 928/**
904 * Ignore a memory block. This is usually done when it is known that the 929 * kmemleak_ignore - ignore an allocated object
905 * corresponding block is not a leak and does not contain any references to 930 * @ptr: pointer to beginning of the object
906 * other allocated memory blocks. 931 *
932 * Calling this function on an object will cause the memory block to be
933 * ignored (not scanned and not reported as a leak). This is usually done when
934 * it is known that the corresponding block is not a leak and does not contain
935 * any references to other allocated memory blocks.
907 */ 936 */
908void __ref kmemleak_ignore(const void *ptr) 937void __ref kmemleak_ignore(const void *ptr)
909{ 938{
@@ -916,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr)
916} 945}
917EXPORT_SYMBOL(kmemleak_ignore); 946EXPORT_SYMBOL(kmemleak_ignore);
918 947
919/* 948/**
920 * Limit the range to be scanned in an allocated memory block. 949 * kmemleak_scan_area - limit the range to be scanned in an allocated object
950 * @ptr: pointer to beginning or inside the object. This also
951 * represents the start of the scan area
952 * @size: size of the scan area
953 * @gfp: kmalloc() flags used for kmemleak internal memory allocations
954 *
955 * This function is used when it is known that only certain parts of an object
956 * contain references to other objects. Kmemleak will only scan these areas
 957 * reducing the number of false negatives.
921 */ 958 */
922void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) 959void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
923{ 960{
@@ -930,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
930} 967}
931EXPORT_SYMBOL(kmemleak_scan_area); 968EXPORT_SYMBOL(kmemleak_scan_area);
932 969
933/* 970/**
934 * Inform kmemleak not to scan the given memory block. 971 * kmemleak_no_scan - do not scan an allocated object
972 * @ptr: pointer to beginning of the object
973 *
974 * This function notifies kmemleak not to scan the given memory block. Useful
975 * in situations where it is known that the given object does not contain any
976 * references to other objects. Kmemleak will not scan such objects reducing
977 * the number of false negatives.
935 */ 978 */
936void __ref kmemleak_no_scan(const void *ptr) 979void __ref kmemleak_no_scan(const void *ptr)
937{ 980{
@@ -1602,7 +1645,9 @@ static int kmemleak_boot_config(char *str)
1602 return -EINVAL; 1645 return -EINVAL;
1603 if (strcmp(str, "off") == 0) 1646 if (strcmp(str, "off") == 0)
1604 kmemleak_disable(); 1647 kmemleak_disable();
1605 else if (strcmp(str, "on") != 0) 1648 else if (strcmp(str, "on") == 0)
1649 kmemleak_skip_disable = 1;
1650 else
1606 return -EINVAL; 1651 return -EINVAL;
1607 return 0; 1652 return 0;
1608} 1653}
@@ -1616,6 +1661,13 @@ void __init kmemleak_init(void)
1616 int i; 1661 int i;
1617 unsigned long flags; 1662 unsigned long flags;
1618 1663
1664#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
1665 if (!kmemleak_skip_disable) {
1666 kmemleak_disable();
1667 return;
1668 }
1669#endif
1670
1619 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); 1671 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
1620 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); 1672 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
1621 1673
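With CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF, the changes above keep kmemleak disabled unless the user boots with kmemleak=on, which sets kmemleak_skip_disable before kmemleak_init() runs. The sketch below models that boot-parameter handshake in plain C (a string parse plus a skip flag); the flag and function names echo the patch, everything else is simplified.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

#define DEFAULT_OFF true        /* models CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y */

static bool skip_disable;       /* models kmemleak_skip_disable */
static bool enabled = true;

static void leak_disable(void) { enabled = false; }

/* Models kmemleak_boot_config(): "off" disables, "on" skips the
 * default-off disable, anything else is an error. */
static int boot_config(const char *str)
{
        if (!str)
                return -1;
        if (strcmp(str, "off") == 0)
                leak_disable();
        else if (strcmp(str, "on") == 0)
                skip_disable = true;
        else
                return -1;
        return 0;
}

/* Models the new check at the top of kmemleak_init(). */
static void leak_init(void)
{
        if (DEFAULT_OFF && !skip_disable) {
                leak_disable();
                return;
        }
        /* ... normal initialisation would continue here ... */
}

int main(void)
{
        boot_config("on");      /* as if "kmemleak=on" were on the command line */
        leak_init();
        printf("kmemleak enabled: %d\n", enabled);
        return 0;
}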
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7..65ab5c7067d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include "internal.h" 39#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
153static struct rb_root root_stable_tree = RB_ROOT; 154static struct rb_root root_stable_tree = RB_ROOT;
154static struct rb_root root_unstable_tree = RB_ROOT; 155static struct rb_root root_unstable_tree = RB_ROOT;
155 156
156#define MM_SLOTS_HASH_HEADS 1024 157#define MM_SLOTS_HASH_SHIFT 10
157static struct hlist_head *mm_slots_hash; 158#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
159static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
158 160
159static struct mm_slot ksm_mm_head = { 161static struct mm_slot ksm_mm_head = {
160 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 162 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
269 kmem_cache_free(mm_slot_cache, mm_slot); 271 kmem_cache_free(mm_slot_cache, mm_slot);
270} 272}
271 273
272static int __init mm_slots_hash_init(void)
273{
274 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
275 GFP_KERNEL);
276 if (!mm_slots_hash)
277 return -ENOMEM;
278 return 0;
279}
280
281static void __init mm_slots_hash_free(void)
282{
283 kfree(mm_slots_hash);
284}
285
286static struct mm_slot *get_mm_slot(struct mm_struct *mm) 274static struct mm_slot *get_mm_slot(struct mm_struct *mm)
287{ 275{
288 struct mm_slot *mm_slot; 276 struct mm_slot *mm_slot;
289 struct hlist_head *bucket; 277 struct hlist_head *bucket;
290 struct hlist_node *node; 278 struct hlist_node *node;
291 279
292 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 280 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
293 % MM_SLOTS_HASH_HEADS];
294 hlist_for_each_entry(mm_slot, node, bucket, link) { 281 hlist_for_each_entry(mm_slot, node, bucket, link) {
295 if (mm == mm_slot->mm) 282 if (mm == mm_slot->mm)
296 return mm_slot; 283 return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
303{ 290{
304 struct hlist_head *bucket; 291 struct hlist_head *bucket;
305 292
306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 293 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
307 % MM_SLOTS_HASH_HEADS];
308 mm_slot->mm = mm; 294 mm_slot->mm = mm;
309 hlist_add_head(&mm_slot->link, bucket); 295 hlist_add_head(&mm_slot->link, bucket);
310} 296}
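The KSM hunks above drop the kzalloc'ed bucket array and the ad-hoc divide-by-sizeof hash in favour of a statically sized table indexed with hash_ptr(). The standalone sketch below models the same idea: a fixed 2^SHIFT array of chained buckets keyed by pointer, using a Fibonacci-style multiplicative hash in the spirit of hash_ptr(); the constant and helper names here are illustrative, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define SLOTS_HASH_SHIFT 10
#define SLOTS_HASH_HEADS (1 << SLOTS_HASH_SHIFT)

struct slot {
        const void *key;        /* models mm_slot->mm */
        struct slot *next;      /* models the hlist chaining */
};

/* Static bucket array, like the new mm_slots_hash[]: no runtime
 * allocation and no error path needed at init time. */
static struct slot *slots_hash[SLOTS_HASH_HEADS];

/* Multiplicative pointer hash in the spirit of hash_ptr(); the 64-bit
 * golden-ratio constant is used here for illustration. */
static unsigned int hash_pointer(const void *p, unsigned int bits)
{
        uint64_t v = (uint64_t)(uintptr_t)p * 0x9E3779B97F4A7C15ULL;

        return (unsigned int)(v >> (64 - bits));
}

static void insert_slot(struct slot *s, const void *key)
{
        struct slot **bucket = &slots_hash[hash_pointer(key, SLOTS_HASH_SHIFT)];

        s->key = key;
        s->next = *bucket;      /* add at head, like hlist_add_head() */
        *bucket = s;
}

static struct slot *get_slot(const void *key)
{
        struct slot *s = slots_hash[hash_pointer(key, SLOTS_HASH_SHIFT)];

        for (; s; s = s->next)
                if (s->key == key)
                        return s;
        return NULL;
}

int main(void)
{
        int obj;
        struct slot s;

        insert_slot(&s, &obj);
        printf("found: %d\n", get_slot(&obj) == &s);
        return 0;
}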
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma) 304 struct anon_vma *anon_vma)
319{ 305{
320 rmap_item->anon_vma = anon_vma; 306 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->external_refcount); 307 get_anon_vma(anon_vma);
322} 308}
323 309
324static void drop_anon_vma(struct rmap_item *rmap_item) 310static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
325{ 311{
326 struct anon_vma *anon_vma = rmap_item->anon_vma; 312 struct anon_vma *anon_vma = rmap_item->anon_vma;
327 313
328 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 314 drop_anon_vma(anon_vma);
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334} 315}
335 316
336/* 317/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
415 * It is not an accident that whenever we want to break COW 396 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma. 397 * to undo, we also need to drop a reference to the anon_vma.
417 */ 398 */
418 drop_anon_vma(rmap_item); 399 ksm_drop_anon_vma(rmap_item);
419 400
420 down_read(&mm->mmap_sem); 401 down_read(&mm->mmap_sem);
421 if (ksm_test_exit(mm)) 402 if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
470 ksm_pages_sharing--; 451 ksm_pages_sharing--;
471 else 452 else
472 ksm_pages_shared--; 453 ksm_pages_shared--;
473 drop_anon_vma(rmap_item); 454 ksm_drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK; 455 rmap_item->address &= PAGE_MASK;
475 cond_resched(); 456 cond_resched();
476 } 457 }
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
558 else 539 else
559 ksm_pages_shared--; 540 ksm_pages_shared--;
560 541
561 drop_anon_vma(rmap_item); 542 ksm_drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK; 543 rmap_item->address &= PAGE_MASK;
563 544
564 } else if (rmap_item->address & UNSTABLE_FLAG) { 545 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -731,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
731 if (!ptep) 712 if (!ptep)
732 goto out; 713 goto out;
733 714
734 if (pte_write(*ptep)) { 715 if (pte_write(*ptep) || pte_dirty(*ptep)) {
735 pte_t entry; 716 pte_t entry;
736 717
737 swapped = PageSwapCache(page); 718 swapped = PageSwapCache(page);
@@ -754,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
754 set_pte_at(mm, addr, ptep, entry); 735 set_pte_at(mm, addr, ptep, entry);
755 goto out_unlock; 736 goto out_unlock;
756 } 737 }
757 entry = pte_wrprotect(entry); 738 if (pte_dirty(entry))
739 set_page_dirty(page);
740 entry = pte_mkclean(pte_wrprotect(entry));
758 set_pte_at_notify(mm, addr, ptep, entry); 741 set_pte_at_notify(mm, addr, ptep, entry);
759 } 742 }
760 *orig_pte = *ptep; 743 *orig_pte = *ptep;
@@ -1523,8 +1506,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
1523{ 1506{
1524 struct page *new_page; 1507 struct page *new_page;
1525 1508
1526 unlock_page(page); /* any racers will COW it, not modify it */
1527
1528 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1509 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1529 if (new_page) { 1510 if (new_page) {
1530 copy_user_highpage(new_page, page, address, vma); 1511 copy_user_highpage(new_page, page, address, vma);
@@ -1540,7 +1521,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
1540 add_page_to_unevictable_list(new_page); 1521 add_page_to_unevictable_list(new_page);
1541 } 1522 }
1542 1523
1543 page_cache_release(page);
1544 return new_page; 1524 return new_page;
1545} 1525}
1546 1526
@@ -1566,7 +1546,7 @@ again:
1566 struct anon_vma_chain *vmac; 1546 struct anon_vma_chain *vmac;
1567 struct vm_area_struct *vma; 1547 struct vm_area_struct *vma;
1568 1548
1569 spin_lock(&anon_vma->lock); 1549 anon_vma_lock(anon_vma);
1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1550 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma; 1551 vma = vmac->vma;
1572 if (rmap_item->address < vma->vm_start || 1552 if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1569,7 @@ again:
1589 if (!search_new_forks || !mapcount) 1569 if (!search_new_forks || !mapcount)
1590 break; 1570 break;
1591 } 1571 }
1592 spin_unlock(&anon_vma->lock); 1572 anon_vma_unlock(anon_vma);
1593 if (!mapcount) 1573 if (!mapcount)
1594 goto out; 1574 goto out;
1595 } 1575 }
@@ -1619,7 +1599,7 @@ again:
1619 struct anon_vma_chain *vmac; 1599 struct anon_vma_chain *vmac;
1620 struct vm_area_struct *vma; 1600 struct vm_area_struct *vma;
1621 1601
1622 spin_lock(&anon_vma->lock); 1602 anon_vma_lock(anon_vma);
1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1603 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma; 1604 vma = vmac->vma;
1625 if (rmap_item->address < vma->vm_start || 1605 if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1617,11 @@ again:
1637 ret = try_to_unmap_one(page, vma, 1617 ret = try_to_unmap_one(page, vma,
1638 rmap_item->address, flags); 1618 rmap_item->address, flags);
1639 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1619 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1640 spin_unlock(&anon_vma->lock); 1620 anon_vma_unlock(anon_vma);
1641 goto out; 1621 goto out;
1642 } 1622 }
1643 } 1623 }
1644 spin_unlock(&anon_vma->lock); 1624 anon_vma_unlock(anon_vma);
1645 } 1625 }
1646 if (!search_new_forks++) 1626 if (!search_new_forks++)
1647 goto again; 1627 goto again;
@@ -1671,7 +1651,7 @@ again:
1671 struct anon_vma_chain *vmac; 1651 struct anon_vma_chain *vmac;
1672 struct vm_area_struct *vma; 1652 struct vm_area_struct *vma;
1673 1653
1674 spin_lock(&anon_vma->lock); 1654 anon_vma_lock(anon_vma);
1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1655 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma; 1656 vma = vmac->vma;
1677 if (rmap_item->address < vma->vm_start || 1657 if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1668,11 @@ again:
1688 1668
1689 ret = rmap_one(page, vma, rmap_item->address, arg); 1669 ret = rmap_one(page, vma, rmap_item->address, arg);
1690 if (ret != SWAP_AGAIN) { 1670 if (ret != SWAP_AGAIN) {
1691 spin_unlock(&anon_vma->lock); 1671 anon_vma_unlock(anon_vma);
1692 goto out; 1672 goto out;
1693 } 1673 }
1694 } 1674 }
1695 spin_unlock(&anon_vma->lock); 1675 anon_vma_unlock(anon_vma);
1696 } 1676 }
1697 if (!search_new_forks++) 1677 if (!search_new_forks++)
1698 goto again; 1678 goto again;
@@ -1943,15 +1923,11 @@ static int __init ksm_init(void)
1943 if (err) 1923 if (err)
1944 goto out; 1924 goto out;
1945 1925
1946 err = mm_slots_hash_init();
1947 if (err)
1948 goto out_free1;
1949
1950 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 1926 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1951 if (IS_ERR(ksm_thread)) { 1927 if (IS_ERR(ksm_thread)) {
1952 printk(KERN_ERR "ksm: creating kthread failed\n"); 1928 printk(KERN_ERR "ksm: creating kthread failed\n");
1953 err = PTR_ERR(ksm_thread); 1929 err = PTR_ERR(ksm_thread);
1954 goto out_free2; 1930 goto out_free;
1955 } 1931 }
1956 1932
1957#ifdef CONFIG_SYSFS 1933#ifdef CONFIG_SYSFS
@@ -1959,7 +1935,7 @@ static int __init ksm_init(void)
1959 if (err) { 1935 if (err) {
1960 printk(KERN_ERR "ksm: register sysfs failed\n"); 1936 printk(KERN_ERR "ksm: register sysfs failed\n");
1961 kthread_stop(ksm_thread); 1937 kthread_stop(ksm_thread);
1962 goto out_free2; 1938 goto out_free;
1963 } 1939 }
1964#else 1940#else
1965 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 1941 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1951,7 @@ static int __init ksm_init(void)
1975#endif 1951#endif
1976 return 0; 1952 return 0;
1977 1953
1978out_free2: 1954out_free:
1979 mm_slots_hash_free();
1980out_free1:
1981 ksm_slab_free(); 1955 ksm_slab_free();
1982out: 1956out:
1983 return err; 1957 return err;
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd..e2b6f5634e0 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * Access kernel memory without faulting. 2 * Access kernel memory without faulting.
3 */ 3 */
4#include <linux/uaccess.h>
5#include <linux/module.h> 4#include <linux/module.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/uaccess.h>
7 7
8/** 8/**
9 * probe_kernel_read(): safely attempt to read from a location 9 * probe_kernel_read(): safely attempt to read from a location
diff --git a/mm/memblock.c b/mm/memblock.c
index 3024eb30fc2..400dc62697d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,237 +11,423 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/poison.h>
18#include <linux/pfn.h>
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
16#include <linux/memblock.h> 21#include <linux/memblock.h>
17 22
18#define MEMBLOCK_ALLOC_ANYWHERE 0 23struct memblock memblock __initdata_memblock;
19 24
20struct memblock memblock; 25int memblock_debug __initdata_memblock;
26int memblock_can_resize __initdata_memblock;
27static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
28static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
21 29
22static int memblock_debug; 30/* inline so we don't get a warning when pr_debug is compiled out */
31static inline const char *memblock_type_name(struct memblock_type *type)
32{
33 if (type == &memblock.memory)
34 return "memory";
35 else if (type == &memblock.reserved)
36 return "reserved";
37 else
38 return "unknown";
39}
23 40
24static int __init early_memblock(char *p) 41/*
42 * Address comparison utilities
43 */
44
45static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
25{ 46{
26 if (p && strstr(p, "debug")) 47 return addr & ~(size - 1);
27 memblock_debug = 1; 48}
49
50static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
51{
52 return (addr + (size - 1)) & ~(size - 1);
53}
54
55static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
56 phys_addr_t base2, phys_addr_t size2)
57{
58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
59}
60
61static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
62 phys_addr_t base2, phys_addr_t size2)
63{
64 if (base2 == base1 + size1)
65 return 1;
66 else if (base1 == base2 + size2)
67 return -1;
68
28 return 0; 69 return 0;
29} 70}
30early_param("memblock", early_memblock);
31 71
32static void memblock_dump(struct memblock_region *region, char *name) 72static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
73 unsigned long r1, unsigned long r2)
33{ 74{
34 unsigned long long base, size; 75 phys_addr_t base1 = type->regions[r1].base;
35 int i; 76 phys_addr_t size1 = type->regions[r1].size;
77 phys_addr_t base2 = type->regions[r2].base;
78 phys_addr_t size2 = type->regions[r2].size;
36 79
37 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); 80 return memblock_addrs_adjacent(base1, size1, base2, size2);
81}
38 82
39 for (i = 0; i < region->cnt; i++) { 83long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
40 base = region->region[i].base; 84{
41 size = region->region[i].size; 85 unsigned long i;
42 86
43 pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", 87 for (i = 0; i < type->cnt; i++) {
44 name, i, base, base + size - 1, size); 88 phys_addr_t rgnbase = type->regions[i].base;
89 phys_addr_t rgnsize = type->regions[i].size;
90 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
91 break;
45 } 92 }
93
94 return (i < type->cnt) ? i : -1;
46} 95}
47 96
48void memblock_dump_all(void) 97/*
98 * Find, allocate, deallocate or reserve unreserved regions. All allocations
99 * are top-down.
100 */
101
102static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
103 phys_addr_t size, phys_addr_t align)
49{ 104{
50 if (!memblock_debug) 105 phys_addr_t base, res_base;
51 return; 106 long j;
52 107
53 pr_info("MEMBLOCK configuration:\n"); 108 /* In case, huge size is requested */
54 pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); 109 if (end < size)
55 pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); 110 return MEMBLOCK_ERROR;
56 111
57 memblock_dump(&memblock.memory, "memory"); 112 base = memblock_align_down((end - size), align);
58 memblock_dump(&memblock.reserved, "reserved"); 113
114 /* Prevent allocations returning 0 as it's also used to
115 * indicate an allocation failure
116 */
117 if (start == 0)
118 start = PAGE_SIZE;
119
120 while (start <= base) {
121 j = memblock_overlaps_region(&memblock.reserved, base, size);
122 if (j < 0)
123 return base;
124 res_base = memblock.reserved.regions[j].base;
125 if (res_base < size)
126 break;
127 base = memblock_align_down(res_base - size, align);
128 }
129
130 return MEMBLOCK_ERROR;
59} 131}
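memblock_find_region() above searches top-down: it starts just below the end of the candidate range and, each time the window collides with a reserved region, drops to just below that region until it finds an aligned free window or runs out of range. The standalone sketch below reproduces that walk over a plain array of reserved (base, size) pairs; the types, the overlap helper and the MEMBLOCK_ERROR stand-in are simplified assumptions.

#include <stdio.h>
#include <stdint.h>

#define FIND_ERROR ((uint64_t)~0ULL)    /* stand-in for MEMBLOCK_ERROR */

struct range { uint64_t base, size; };

static uint64_t align_down(uint64_t addr, uint64_t align)
{
        return addr & ~(align - 1);
}

/* Index of the first reserved range overlapping [base, base+size),
 * or -1 if none does, like memblock_overlaps_region(). */
static long first_overlap(const struct range *resv, int nr,
                          uint64_t base, uint64_t size)
{
        for (int i = 0; i < nr; i++)
                if (base < resv[i].base + resv[i].size &&
                    resv[i].base < base + size)
                        return i;
        return -1;
}

/* Top-down search for @size bytes aligned to @align inside [start, end),
 * skipping the reserved ranges: mirrors memblock_find_region(). */
static uint64_t find_region(uint64_t start, uint64_t end,
                            uint64_t size, uint64_t align,
                            const struct range *resv, int nr_resv)
{
        uint64_t base, res_base;
        long j;

        if (end < size)                 /* huge size requested */
                return FIND_ERROR;
        base = align_down(end - size, align);
        if (start == 0)                 /* never hand out address 0 */
                start = 4096;

        while (start <= base) {
                j = first_overlap(resv, nr_resv, base, size);
                if (j < 0)
                        return base;    /* free window found */
                res_base = resv[j].base;
                if (res_base < size)
                        break;
                base = align_down(res_base - size, align);
        }
        return FIND_ERROR;
}

int main(void)
{
        struct range resv[] = { { 0x8000, 0x4000 } };
        uint64_t got = find_region(0x1000, 0xD000, 0x2000, 0x1000, resv, 1);

        /* prints 0x6000: the search stepped below the reserved block at 0x8000 */
        printf("allocated at 0x%llx\n", (unsigned long long)got);
        return 0;
}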
60 132
61static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, 133static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
62 u64 size2) 134 phys_addr_t align, phys_addr_t start, phys_addr_t end)
63{ 135{
64 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 136 long i;
137
138 BUG_ON(0 == size);
139
140 size = memblock_align_up(size, align);
141
142 /* Pump up max_addr */
143 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
144 end = memblock.current_limit;
145
146 /* We do a top-down search, this tends to limit memory
147 * fragmentation by keeping early boot allocs near the
148 * top of memory
149 */
150 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
151 phys_addr_t memblockbase = memblock.memory.regions[i].base;
152 phys_addr_t memblocksize = memblock.memory.regions[i].size;
153 phys_addr_t bottom, top, found;
154
155 if (memblocksize < size)
156 continue;
157 if ((memblockbase + memblocksize) <= start)
158 break;
159 bottom = max(memblockbase, start);
160 top = min(memblockbase + memblocksize, end);
161 if (bottom >= top)
162 continue;
163 found = memblock_find_region(bottom, top, size, align);
164 if (found != MEMBLOCK_ERROR)
165 return found;
166 }
167 return MEMBLOCK_ERROR;
65} 168}
66 169
67static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) 170/*
171 * Find a free area with specified alignment in a specific range.
172 */
173u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
68{ 174{
69 if (base2 == base1 + size1) 175 return memblock_find_base(size, align, start, end);
70 return 1; 176}
71 else if (base1 == base2 + size2)
72 return -1;
73 177
74 return 0; 178/*
179 * Free memblock.reserved.regions
180 */
181int __init_memblock memblock_free_reserved_regions(void)
182{
183 if (memblock.reserved.regions == memblock_reserved_init_regions)
184 return 0;
185
186 return memblock_free(__pa(memblock.reserved.regions),
187 sizeof(struct memblock_region) * memblock.reserved.max);
75} 188}
76 189
77static long memblock_regions_adjacent(struct memblock_region *rgn, 190/*
78 unsigned long r1, unsigned long r2) 191 * Reserve memblock.reserved.regions
192 */
193int __init_memblock memblock_reserve_reserved_regions(void)
79{ 194{
80 u64 base1 = rgn->region[r1].base; 195 if (memblock.reserved.regions == memblock_reserved_init_regions)
81 u64 size1 = rgn->region[r1].size; 196 return 0;
82 u64 base2 = rgn->region[r2].base;
83 u64 size2 = rgn->region[r2].size;
84 197
85 return memblock_addrs_adjacent(base1, size1, base2, size2); 198 return memblock_reserve(__pa(memblock.reserved.regions),
199 sizeof(struct memblock_region) * memblock.reserved.max);
86} 200}
87 201
88static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) 202static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
89{ 203{
90 unsigned long i; 204 unsigned long i;
91 205
92 for (i = r; i < rgn->cnt - 1; i++) { 206 for (i = r; i < type->cnt - 1; i++) {
93 rgn->region[i].base = rgn->region[i + 1].base; 207 type->regions[i].base = type->regions[i + 1].base;
94 rgn->region[i].size = rgn->region[i + 1].size; 208 type->regions[i].size = type->regions[i + 1].size;
95 } 209 }
96 rgn->cnt--; 210 type->cnt--;
97} 211}
98 212
99/* Assumption: base addr of region 1 < base addr of region 2 */ 213/* Assumption: base addr of region 1 < base addr of region 2 */
100static void memblock_coalesce_regions(struct memblock_region *rgn, 214static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
101 unsigned long r1, unsigned long r2) 215 unsigned long r1, unsigned long r2)
102{ 216{
103 rgn->region[r1].size += rgn->region[r2].size; 217 type->regions[r1].size += type->regions[r2].size;
104 memblock_remove_region(rgn, r2); 218 memblock_remove_region(type, r2);
105} 219}
106 220
107void __init memblock_init(void) 221/* Defined below but needed now */
222static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
223
224static int __init_memblock memblock_double_array(struct memblock_type *type)
108{ 225{
109 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 226 struct memblock_region *new_array, *old_array;
110 * This simplifies the memblock_add() code below... 227 phys_addr_t old_size, new_size, addr;
228 int use_slab = slab_is_available();
229
230 /* We don't allow resizing until we know about the reserved regions
231 * of memory that aren't suitable for allocation
111 */ 232 */
112 memblock.memory.region[0].base = 0; 233 if (!memblock_can_resize)
113 memblock.memory.region[0].size = 0; 234 return -1;
114 memblock.memory.cnt = 1;
115 235
116 /* Ditto. */ 236 /* Calculate new doubled size */
117 memblock.reserved.region[0].base = 0; 237 old_size = type->max * sizeof(struct memblock_region);
118 memblock.reserved.region[0].size = 0; 238 new_size = old_size << 1;
119 memblock.reserved.cnt = 1; 239
120} 240 /* Try to find some space for it.
241 *
242 * WARNING: We assume that either slab_is_available() and we use it or
243 * we use MEMBLOCK for allocations. That means that this is unsafe to use
244 * when bootmem is currently active (unless bootmem itself is implemented
245 * on top of MEMBLOCK which isn't the case yet)
246 *
247 * This should however not be an issue for now, as we currently only
248 * call into MEMBLOCK while it's still active, or much later when slab is
249 * active for memory hotplug operations
250 */
251 if (use_slab) {
252 new_array = kmalloc(new_size, GFP_KERNEL);
253 addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
254 } else
255 addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
256 if (addr == MEMBLOCK_ERROR) {
257 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
258 memblock_type_name(type), type->max, type->max * 2);
259 return -1;
260 }
261 new_array = __va(addr);
121 262
122void __init memblock_analyze(void) 263 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
123{ 264 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
124 int i; 265
266 /* Found space, we now need to move the array over before
267 * we add the reserved region since it may be our reserved
268 * array itself that is full.
269 */
270 memcpy(new_array, type->regions, old_size);
271 memset(new_array + type->max, 0, old_size);
272 old_array = type->regions;
273 type->regions = new_array;
274 type->max <<= 1;
275
276 /* If we use SLAB that's it, we are done */
277 if (use_slab)
278 return 0;
125 279
126 memblock.memory.size = 0; 280 /* Add the new reserved region now. Should not fail ! */
281 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
127 282
128 for (i = 0; i < memblock.memory.cnt; i++) 283 /* If the array wasn't our static init one, then free it. We only do
129 memblock.memory.size += memblock.memory.region[i].size; 284 * that before SLAB is available as later on, we don't know whether
285 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
286 * anyways
287 */
288 if (old_array != memblock_memory_init_regions &&
289 old_array != memblock_reserved_init_regions)
290 memblock_free(__pa(old_array), old_size);
291
292 return 0;
293}
294
295extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
296 phys_addr_t addr2, phys_addr_t size2)
297{
298 return 1;
130} 299}
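memblock_double_array() grows the region array by finding or allocating a buffer twice the size, copying the old entries, zeroing the new half, switching the pointer, and freeing the old buffer only if it was not the static bootstrap array. The compact model below keeps that shape with malloc standing in for both the slab and the memblock allocation paths; it is a sketch, not the boot-time code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct region { unsigned long base, size; };

#define INIT_REGIONS 4

/* Static bootstrap storage, like memblock_*_init_regions[]. */
static struct region init_regions[INIT_REGIONS];

struct region_array {
        struct region *regions;
        unsigned long cnt, max;
};

static struct region_array table = { init_regions, 0, INIT_REGIONS };

/* Double the backing array; returns 0 on success, -1 on failure.
 * Models memblock_double_array() with malloc standing in for both
 * allocation paths. */
static int double_array(struct region_array *type)
{
        size_t old_size = type->max * sizeof(struct region);
        struct region *new_array = malloc(old_size * 2);

        if (!new_array)
                return -1;

        memcpy(new_array, type->regions, old_size);     /* move entries over */
        memset((char *)new_array + old_size, 0, old_size);

        /* Only free the old buffer if it wasn't the static bootstrap array. */
        if (type->regions != init_regions)
                free(type->regions);

        type->regions = new_array;
        type->max *= 2;
        return 0;
}

int main(void)
{
        table.cnt = table.max;          /* pretend the array just filled up */
        if (double_array(&table) == 0)
                printf("grown to %lu slots\n", table.max);
        return 0;
}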
131 300
132static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) 301static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
133{ 302{
134 unsigned long coalesced = 0; 303 unsigned long coalesced = 0;
135 long adjacent, i; 304 long adjacent, i;
136 305
137 if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { 306 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
138 rgn->region[0].base = base; 307 type->regions[0].base = base;
139 rgn->region[0].size = size; 308 type->regions[0].size = size;
140 return 0; 309 return 0;
141 } 310 }
142 311
143 /* First try and coalesce this MEMBLOCK with another. */ 312 /* First try and coalesce this MEMBLOCK with another. */
144 for (i = 0; i < rgn->cnt; i++) { 313 for (i = 0; i < type->cnt; i++) {
145 u64 rgnbase = rgn->region[i].base; 314 phys_addr_t rgnbase = type->regions[i].base;
146 u64 rgnsize = rgn->region[i].size; 315 phys_addr_t rgnsize = type->regions[i].size;
147 316
148 if ((rgnbase == base) && (rgnsize == size)) 317 if ((rgnbase == base) && (rgnsize == size))
149 /* Already have this region, so we're done */ 318 /* Already have this region, so we're done */
150 return 0; 319 return 0;
151 320
152 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); 321 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
322 /* Check if arch allows coalescing */
323 if (adjacent != 0 && type == &memblock.memory &&
324 !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
325 break;
153 if (adjacent > 0) { 326 if (adjacent > 0) {
154 rgn->region[i].base -= size; 327 type->regions[i].base -= size;
155 rgn->region[i].size += size; 328 type->regions[i].size += size;
156 coalesced++; 329 coalesced++;
157 break; 330 break;
158 } else if (adjacent < 0) { 331 } else if (adjacent < 0) {
159 rgn->region[i].size += size; 332 type->regions[i].size += size;
160 coalesced++; 333 coalesced++;
161 break; 334 break;
162 } 335 }
163 } 336 }
164 337
165 if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { 338 /* If we plugged a hole, we may want to also coalesce with the
166 memblock_coalesce_regions(rgn, i, i+1); 339 * next region
340 */
341 if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
342 ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
343 type->regions[i].size,
344 type->regions[i+1].base,
345 type->regions[i+1].size)))) {
346 memblock_coalesce_regions(type, i, i+1);
167 coalesced++; 347 coalesced++;
168 } 348 }
169 349
170 if (coalesced) 350 if (coalesced)
171 return coalesced; 351 return coalesced;
172 if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) 352
353 /* If we are out of space, we fail. It's too late to resize the array
354 * but then this shouldn't have happened in the first place.
355 */
356 if (WARN_ON(type->cnt >= type->max))
173 return -1; 357 return -1;
174 358
175 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ 359 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
176 for (i = rgn->cnt - 1; i >= 0; i--) { 360 for (i = type->cnt - 1; i >= 0; i--) {
177 if (base < rgn->region[i].base) { 361 if (base < type->regions[i].base) {
178 rgn->region[i+1].base = rgn->region[i].base; 362 type->regions[i+1].base = type->regions[i].base;
179 rgn->region[i+1].size = rgn->region[i].size; 363 type->regions[i+1].size = type->regions[i].size;
180 } else { 364 } else {
181 rgn->region[i+1].base = base; 365 type->regions[i+1].base = base;
182 rgn->region[i+1].size = size; 366 type->regions[i+1].size = size;
183 break; 367 break;
184 } 368 }
185 } 369 }
186 370
187 if (base < rgn->region[0].base) { 371 if (base < type->regions[0].base) {
188 rgn->region[0].base = base; 372 type->regions[0].base = base;
189 rgn->region[0].size = size; 373 type->regions[0].size = size;
374 }
375 type->cnt++;
376
377 /* The array is full ? Try to resize it. If that fails, we undo
378 * our allocation and return an error
379 */
380 if (type->cnt == type->max && memblock_double_array(type)) {
381 type->cnt--;
382 return -1;
190 } 383 }
191 rgn->cnt++;
192 384
193 return 0; 385 return 0;
194} 386}
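memblock_add_region() keeps the region list sorted by base and first tries to merge the new range with an adjacent existing region before falling back to an insertion that shifts higher entries up. The sketch below models that coalescing insert over a fixed array of (base, size) pairs; it omits the resize-on-full path and the arch coalescing hook, and every name in it is invented.

#include <stdio.h>

struct region { unsigned long base, size; };

#define MAX_REGIONS 8

static struct region regions[MAX_REGIONS];
static int region_cnt;

/* Insert [base, base+size) keeping the array sorted, merging with an
 * adjacent region when possible: a simplified memblock_add_region(). */
static int add_region(unsigned long base, unsigned long size)
{
        int i;

        /* First try to coalesce with an existing, adjacent region. */
        for (i = 0; i < region_cnt; i++) {
                if (base + size == regions[i].base) {   /* new range just below */
                        regions[i].base -= size;
                        regions[i].size += size;
                        return 0;
                }
                if (regions[i].base + regions[i].size == base) { /* just above */
                        regions[i].size += size;
                        return 0;
                }
        }

        if (region_cnt >= MAX_REGIONS)
                return -1;                      /* no resizing in this model */

        /* Shift larger entries up and drop the new one into place. */
        for (i = region_cnt - 1; i >= 0 && regions[i].base > base; i--)
                regions[i + 1] = regions[i];
        regions[i + 1].base = base;
        regions[i + 1].size = size;
        region_cnt++;
        return 0;
}

int main(void)
{
        add_region(0x2000, 0x1000);
        add_region(0x0000, 0x1000);
        add_region(0x1000, 0x1000);     /* adjacent to both; this model merges
                                         * with one neighbour only */
        for (int i = 0; i < region_cnt; i++)
                printf("[0x%lx, +0x%lx)\n", regions[i].base, regions[i].size);
        return 0;
}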
195 387
196long memblock_add(u64 base, u64 size) 388long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
197{ 389{
198 struct memblock_region *_rgn = &memblock.memory; 390 return memblock_add_region(&memblock.memory, base, size);
199
200 /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
201 if (base == 0)
202 memblock.rmo_size = size;
203
204 return memblock_add_region(_rgn, base, size);
205 391
206} 392}
207 393
208static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) 394static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
209{ 395{
210 u64 rgnbegin, rgnend; 396 phys_addr_t rgnbegin, rgnend;
211 u64 end = base + size; 397 phys_addr_t end = base + size;
212 int i; 398 int i;
213 399
214 rgnbegin = rgnend = 0; /* suppress gcc warnings */ 400 rgnbegin = rgnend = 0; /* suppress gcc warnings */
215 401
216 /* Find the region where (base, size) belongs to */ 402 /* Find the region where (base, size) belongs to */
217 for (i=0; i < rgn->cnt; i++) { 403 for (i=0; i < type->cnt; i++) {
218 rgnbegin = rgn->region[i].base; 404 rgnbegin = type->regions[i].base;
219 rgnend = rgnbegin + rgn->region[i].size; 405 rgnend = rgnbegin + type->regions[i].size;
220 406
221 if ((rgnbegin <= base) && (end <= rgnend)) 407 if ((rgnbegin <= base) && (end <= rgnend))
222 break; 408 break;
223 } 409 }
224 410
225 /* Didn't find the region */ 411 /* Didn't find the region */
226 if (i == rgn->cnt) 412 if (i == type->cnt)
227 return -1; 413 return -1;
228 414
229 /* Check to see if we are removing entire region */ 415 /* Check to see if we are removing entire region */
230 if ((rgnbegin == base) && (rgnend == end)) { 416 if ((rgnbegin == base) && (rgnend == end)) {
231 memblock_remove_region(rgn, i); 417 memblock_remove_region(type, i);
232 return 0; 418 return 0;
233 } 419 }
234 420
235 /* Check to see if region is matching at the front */ 421 /* Check to see if region is matching at the front */
236 if (rgnbegin == base) { 422 if (rgnbegin == base) {
237 rgn->region[i].base = end; 423 type->regions[i].base = end;
238 rgn->region[i].size -= size; 424 type->regions[i].size -= size;
239 return 0; 425 return 0;
240 } 426 }
241 427
242 /* Check to see if the region is matching at the end */ 428 /* Check to see if the region is matching at the end */
243 if (rgnend == end) { 429 if (rgnend == end) {
244 rgn->region[i].size -= size; 430 type->regions[i].size -= size;
245 return 0; 431 return 0;
246 } 432 }
247 433
@@ -249,208 +435,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
249 * We need to split the entry - adjust the current one to the 435 * We need to split the entry - adjust the current one to the
250 * beginning of the hole and add the region after the hole. 436 * beginning of the hole and add the region after the hole.
251 */ 437 */
252 rgn->region[i].size = base - rgn->region[i].base; 438 type->regions[i].size = base - type->regions[i].base;
253 return memblock_add_region(rgn, end, rgnend - end); 439 return memblock_add_region(type, end, rgnend - end);
254} 440}
255 441
256long memblock_remove(u64 base, u64 size) 442long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
257{ 443{
258 return __memblock_remove(&memblock.memory, base, size); 444 return __memblock_remove(&memblock.memory, base, size);
259} 445}
260 446
261long __init memblock_free(u64 base, u64 size) 447long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
262{ 448{
263 return __memblock_remove(&memblock.reserved, base, size); 449 return __memblock_remove(&memblock.reserved, base, size);
264} 450}
265 451
266long __init memblock_reserve(u64 base, u64 size) 452long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
267{ 453{
268 struct memblock_region *_rgn = &memblock.reserved; 454 struct memblock_type *_rgn = &memblock.reserved;
269 455
270 BUG_ON(0 == size); 456 BUG_ON(0 == size);
271 457
272 return memblock_add_region(_rgn, base, size); 458 return memblock_add_region(_rgn, base, size);
273} 459}
274 460
275long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) 461phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
276{ 462{
277 unsigned long i; 463 phys_addr_t found;
278 464
279 for (i = 0; i < rgn->cnt; i++) { 465 /* We align the size to limit fragmentation. Without this, a lot of
280 u64 rgnbase = rgn->region[i].base; 466 * small allocs quickly eat up the whole reserve array on sparc
281 u64 rgnsize = rgn->region[i].size; 467 */
282 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) 468 size = memblock_align_up(size, align);
283 break; 469
284 } 470 found = memblock_find_base(size, align, 0, max_addr);
471 if (found != MEMBLOCK_ERROR &&
472 memblock_add_region(&memblock.reserved, found, size) >= 0)
473 return found;
285 474
286 return (i < rgn->cnt) ? i : -1; 475 return 0;
287} 476}
288 477
289static u64 memblock_align_down(u64 addr, u64 size) 478phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
290{ 479{
291 return addr & ~(size - 1); 480 phys_addr_t alloc;
481
482 alloc = __memblock_alloc_base(size, align, max_addr);
483
484 if (alloc == 0)
485 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
486 (unsigned long long) size, (unsigned long long) max_addr);
487
488 return alloc;
292} 489}
293 490
294static u64 memblock_align_up(u64 addr, u64 size) 491phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
295{ 492{
296 return (addr + (size - 1)) & ~(size - 1); 493 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
297} 494}
298 495
299static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, 496
300 u64 size, u64 align) 497/*
498 * Additional node-local allocators. Search for node memory is bottom up
499 * and walks memblock regions within that node bottom-up as well, but allocation
500 * within an memblock region is top-down. XXX I plan to fix that at some stage
501 *
502 * WARNING: Only available after early_node_map[] has been populated,
503 * on some architectures, that is after all the calls to add_active_range()
504 * have been done to populate it.
505 */
506
507phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
301{ 508{
302 u64 base, res_base; 509#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
303 long j; 510 /*
 511 * This code originates from sparc, which really wants us to walk by addresses
 512 * and return the nid. This is not very convenient for early_pfn_map[] users
513 * as the map isn't sorted yet, and it really wants to be walked by nid.
514 *
515 * For now, I implement the inefficient method below which walks the early
516 * map multiple times. Eventually we may want to use an ARCH config option
 517 * to implement a completely different method for both cases.
518 */
519 unsigned long start_pfn, end_pfn;
520 int i;
304 521
305 base = memblock_align_down((end - size), align); 522 for (i = 0; i < MAX_NUMNODES; i++) {
306 while (start <= base) { 523 get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
307 j = memblock_overlaps_region(&memblock.reserved, base, size); 524 if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
308 if (j < 0) { 525 continue;
309 /* this area isn't reserved, take it */ 526 *nid = i;
310 if (memblock_add_region(&memblock.reserved, base, size) < 0) 527 return min(end, PFN_PHYS(end_pfn));
311 base = ~(u64)0;
312 return base;
313 }
314 res_base = memblock.reserved.region[j].base;
315 if (res_base < size)
316 break;
317 base = memblock_align_down(res_base - size, align);
318 } 528 }
529#endif
530 *nid = 0;
319 531
320 return ~(u64)0; 532 return end;
321} 533}
322 534
323static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, 535static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
324 u64 (*nid_range)(u64, u64, int *), 536 phys_addr_t size,
325 u64 size, u64 align, int nid) 537 phys_addr_t align, int nid)
326{ 538{
327 u64 start, end; 539 phys_addr_t start, end;
328 540
329 start = mp->base; 541 start = mp->base;
330 end = start + mp->size; 542 end = start + mp->size;
331 543
332 start = memblock_align_up(start, align); 544 start = memblock_align_up(start, align);
333 while (start < end) { 545 while (start < end) {
334 u64 this_end; 546 phys_addr_t this_end;
335 int this_nid; 547 int this_nid;
336 548
337 this_end = nid_range(start, end, &this_nid); 549 this_end = memblock_nid_range(start, end, &this_nid);
338 if (this_nid == nid) { 550 if (this_nid == nid) {
339 u64 ret = memblock_alloc_nid_unreserved(start, this_end, 551 phys_addr_t ret = memblock_find_region(start, this_end, size, align);
340 size, align); 552 if (ret != MEMBLOCK_ERROR &&
341 if (ret != ~(u64)0) 553 memblock_add_region(&memblock.reserved, ret, size) >= 0)
342 return ret; 554 return ret;
343 } 555 }
344 start = this_end; 556 start = this_end;
345 } 557 }
346 558
347 return ~(u64)0; 559 return MEMBLOCK_ERROR;
348} 560}
349 561
350u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, 562phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
351 u64 (*nid_range)(u64 start, u64 end, int *nid))
352{ 563{
353 struct memblock_region *mem = &memblock.memory; 564 struct memblock_type *mem = &memblock.memory;
354 int i; 565 int i;
355 566
356 BUG_ON(0 == size); 567 BUG_ON(0 == size);
357 568
569 /* We align the size to limit fragmentation. Without this, a lot of
570 * small allocs quickly eat up the whole reserve array on sparc
571 */
358 size = memblock_align_up(size, align); 572 size = memblock_align_up(size, align);
359 573
574 /* We do a bottom-up search for a region with the right
575 * nid since that's easier considering how memblock_nid_range()
576 * works
577 */
360 for (i = 0; i < mem->cnt; i++) { 578 for (i = 0; i < mem->cnt; i++) {
361 u64 ret = memblock_alloc_nid_region(&mem->region[i], 579 phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
362 nid_range,
363 size, align, nid); 580 size, align, nid);
364 if (ret != ~(u64)0) 581 if (ret != MEMBLOCK_ERROR)
365 return ret; 582 return ret;
366 } 583 }
367 584
368 return memblock_alloc(size, align); 585 return 0;
369}
370
371u64 __init memblock_alloc(u64 size, u64 align)
372{
373 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
374} 586}
375 587
376u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) 588phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
377{ 589{
378 u64 alloc; 590 phys_addr_t res = memblock_alloc_nid(size, align, nid);
379 591
380 alloc = __memblock_alloc_base(size, align, max_addr); 592 if (res)
381 593 return res;
382 if (alloc == 0) 594 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
383 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
384 (unsigned long long) size, (unsigned long long) max_addr);
385
386 return alloc;
387} 595}
388 596
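The new memblock_alloc_try_nid() above is a thin fallback wrapper: prefer a node-local allocation, and only if that yields nothing retry without the node constraint. A hedged sketch of that fallback pattern with stubbed allocators (all names are illustrative, not real memblock calls):

	#include <stdint.h>
	#include <stdio.h>

	/* Stub "allocators": pretend node-local allocation fails for node 1. */
	static uint64_t alloc_on_node(uint64_t size, int nid)
	{
		(void)size;
		return (nid == 0) ? 0x100000 : 0;	/* 0 means failure, as above */
	}

	static uint64_t alloc_anywhere(uint64_t size)
	{
		(void)size;
		return 0x200000;
	}

	/* Try the preferred node first, then fall back to any node. */
	static uint64_t alloc_try_nid(uint64_t size, int nid)
	{
		uint64_t res = alloc_on_node(size, nid);

		if (res)
			return res;
		return alloc_anywhere(size);
	}

	int main(void)
	{
		printf("node 0: 0x%llx\n", (unsigned long long)alloc_try_nid(0x1000, 0));
		printf("node 1: 0x%llx\n", (unsigned long long)alloc_try_nid(0x1000, 1));
		return 0;
	}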
389u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
390{
391 long i, j;
392 u64 base = 0;
393 u64 res_base;
394
395 BUG_ON(0 == size);
396
397 size = memblock_align_up(size, align);
398
399 /* On some platforms, make sure we allocate lowmem */
400 /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
401 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
402 max_addr = MEMBLOCK_REAL_LIMIT;
403
404 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
405 u64 memblockbase = memblock.memory.region[i].base;
406 u64 memblocksize = memblock.memory.region[i].size;
407
408 if (memblocksize < size)
409 continue;
410 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
411 base = memblock_align_down(memblockbase + memblocksize - size, align);
412 else if (memblockbase < max_addr) {
413 base = min(memblockbase + memblocksize, max_addr);
414 base = memblock_align_down(base - size, align);
415 } else
416 continue;
417 597
418 while (base && memblockbase <= base) { 598/*
419 j = memblock_overlaps_region(&memblock.reserved, base, size); 599 * Remaining API functions
420 if (j < 0) { 600 */
421 /* this area isn't reserved, take it */
422 if (memblock_add_region(&memblock.reserved, base, size) < 0)
423 return 0;
424 return base;
425 }
426 res_base = memblock.reserved.region[j].base;
427 if (res_base < size)
428 break;
429 base = memblock_align_down(res_base - size, align);
430 }
431 }
432 return 0;
433}
434 601
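The loop being removed above captures the core top-down allocation strategy: start from the highest aligned address a memory region allows, and whenever the candidate overlaps a reserved range, drop the candidate just below that reservation and retry. A standalone sketch of that idea, with a simple region array standing in for the real memblock structures (assumed, illustrative names only):

	#include <stdint.h>
	#include <stdio.h>

	struct range { uint64_t base, size; };

	/* A few already-reserved ranges. */
	static struct range reserved[] = { { 0x9000, 0x1000 }, { 0xe000, 0x2000 } };
	static const int nr_reserved = 2;

	static uint64_t align_down(uint64_t a, uint64_t align) { return a & ~(align - 1); }

	/* Return index of a reserved range overlapping [base, base+size), or -1. */
	static int overlaps_reserved(uint64_t base, uint64_t size)
	{
		for (int i = 0; i < nr_reserved; i++) {
			uint64_t rb = reserved[i].base, rs = reserved[i].size;
			if (base < rb + rs && rb < base + size)
				return i;
		}
		return -1;
	}

	/* Top-down search inside one memory region [start, end). */
	static uint64_t find_top_down(uint64_t start, uint64_t end, uint64_t size, uint64_t align)
	{
		uint64_t base = align_down(end - size, align);

		while (base >= start) {
			int j = overlaps_reserved(base, size);
			if (j < 0)
				return base;		/* free spot found */
			if (reserved[j].base < size)
				break;			/* no room below this reservation */
			base = align_down(reserved[j].base - size, align);
		}
		return UINT64_MAX;			/* nothing found */
	}

	int main(void)
	{
		/* Ask for 0x2000 bytes in [0x8000, 0x10000): lands below the 0xe000 reservation. */
		uint64_t got = find_top_down(0x8000, 0x10000, 0x2000, 0x1000);
		printf("allocated at 0x%llx\n", (unsigned long long)got);
		return 0;
	}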
435/* You must call memblock_analyze() before this. */ 602/* You must call memblock_analyze() before this. */
436u64 __init memblock_phys_mem_size(void) 603phys_addr_t __init memblock_phys_mem_size(void)
437{ 604{
438 return memblock.memory.size; 605 return memblock.memory_size;
439} 606}
440 607
441u64 memblock_end_of_DRAM(void) 608phys_addr_t __init_memblock memblock_end_of_DRAM(void)
442{ 609{
443 int idx = memblock.memory.cnt - 1; 610 int idx = memblock.memory.cnt - 1;
444 611
445 return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); 612 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
446} 613}
447 614
448/* You must call memblock_analyze() after this. */ 615/* You must call memblock_analyze() after this. */
449void __init memblock_enforce_memory_limit(u64 memory_limit) 616void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
450{ 617{
451 unsigned long i; 618 unsigned long i;
452 u64 limit; 619 phys_addr_t limit;
453 struct memblock_property *p; 620 struct memblock_region *p;
454 621
455 if (!memory_limit) 622 if (!memory_limit)
456 return; 623 return;
@@ -458,24 +625,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
458 /* Truncate the memblock regions to satisfy the memory limit. */ 625 /* Truncate the memblock regions to satisfy the memory limit. */
459 limit = memory_limit; 626 limit = memory_limit;
460 for (i = 0; i < memblock.memory.cnt; i++) { 627 for (i = 0; i < memblock.memory.cnt; i++) {
461 if (limit > memblock.memory.region[i].size) { 628 if (limit > memblock.memory.regions[i].size) {
462 limit -= memblock.memory.region[i].size; 629 limit -= memblock.memory.regions[i].size;
463 continue; 630 continue;
464 } 631 }
465 632
466 memblock.memory.region[i].size = limit; 633 memblock.memory.regions[i].size = limit;
467 memblock.memory.cnt = i + 1; 634 memblock.memory.cnt = i + 1;
468 break; 635 break;
469 } 636 }
470 637
471 if (memblock.memory.region[0].size < memblock.rmo_size)
472 memblock.rmo_size = memblock.memory.region[0].size;
473
474 memory_limit = memblock_end_of_DRAM(); 638 memory_limit = memblock_end_of_DRAM();
475 639
476 /* And truncate any reserves above the limit also. */ 640 /* And truncate any reserves above the limit also. */
477 for (i = 0; i < memblock.reserved.cnt; i++) { 641 for (i = 0; i < memblock.reserved.cnt; i++) {
478 p = &memblock.reserved.region[i]; 642 p = &memblock.reserved.regions[i];
479 643
480 if (p->base > memory_limit) 644 if (p->base > memory_limit)
481 p->size = 0; 645 p->size = 0;
@@ -489,53 +653,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
489 } 653 }
490} 654}
491 655
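The truncation loop in memblock_enforce_memory_limit() walks the region list, subtracting whole regions from the remaining limit until one region has to be clipped, after which the array count is cut at that point. A small userspace sketch of that walk (illustrative types, not the kernel structures):

	#include <stdint.h>
	#include <stdio.h>

	struct region { uint64_t base, size; };

	/* Clip a region array so the total size does not exceed 'limit'. */
	static int enforce_limit(struct region *r, int cnt, uint64_t limit)
	{
		for (int i = 0; i < cnt; i++) {
			if (limit > r[i].size) {
				limit -= r[i].size;	/* region fits entirely */
				continue;
			}
			r[i].size = limit;		/* clip this region... */
			return i + 1;			/* ...and drop everything after it */
		}
		return cnt;
	}

	int main(void)
	{
		struct region mem[] = { { 0x0, 0x4000 }, { 0x10000, 0x4000 }, { 0x20000, 0x4000 } };
		int cnt = enforce_limit(mem, 3, 0x6000);

		for (int i = 0; i < cnt; i++)
			printf("region %d: base=0x%llx size=0x%llx\n", i,
			       (unsigned long long)mem[i].base, (unsigned long long)mem[i].size);
		return 0;
	}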
492int __init memblock_is_reserved(u64 addr) 656static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
657{
658 unsigned int left = 0, right = type->cnt;
659
660 do {
661 unsigned int mid = (right + left) / 2;
662
663 if (addr < type->regions[mid].base)
664 right = mid;
665 else if (addr >= (type->regions[mid].base +
666 type->regions[mid].size))
667 left = mid + 1;
668 else
669 return mid;
670 } while (left < right);
671 return -1;
672}
673
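The new memblock_search() is a plain binary search over regions that are kept sorted by base and non-overlapping: move the right bound down when the address is below the probe's base, move the left bound up when it is at or past the probe's end. A standalone sketch under those same assumptions:

	#include <stdint.h>
	#include <stdio.h>

	struct region { uint64_t base, size; };

	/* Regions must be sorted by base and non-overlapping for this to work. */
	static int region_search(const struct region *r, unsigned int cnt, uint64_t addr)
	{
		unsigned int left = 0, right = cnt;

		while (left < right) {
			unsigned int mid = left + (right - left) / 2;

			if (addr < r[mid].base)
				right = mid;			/* look in the lower half */
			else if (addr >= r[mid].base + r[mid].size)
				left = mid + 1;			/* look in the upper half */
			else
				return (int)mid;		/* addr falls inside r[mid] */
		}
		return -1;					/* not covered by any region */
	}

	int main(void)
	{
		const struct region mem[] = { { 0x0, 0x1000 }, { 0x4000, 0x2000 }, { 0x8000, 0x1000 } };

		printf("%d\n", region_search(mem, 3, 0x4800));	/* 1 */
		printf("%d\n", region_search(mem, 3, 0x2000));	/* -1 */
		return 0;
	}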
674int __init memblock_is_reserved(phys_addr_t addr)
675{
676 return memblock_search(&memblock.reserved, addr) != -1;
677}
678
679int __init_memblock memblock_is_memory(phys_addr_t addr)
680{
681 return memblock_search(&memblock.memory, addr) != -1;
682}
683
684int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
685{
686 int idx = memblock_search(&memblock.reserved, base);
687
688 if (idx == -1)
689 return 0;
690 return memblock.reserved.regions[idx].base <= base &&
691 (memblock.reserved.regions[idx].base +
692 memblock.reserved.regions[idx].size) >= (base + size);
693}
694
695int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
696{
697 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
698}
699
700
701void __init_memblock memblock_set_current_limit(phys_addr_t limit)
493{ 702{
703 memblock.current_limit = limit;
704}
705
706static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
707{
708 unsigned long long base, size;
494 int i; 709 int i;
495 710
496 for (i = 0; i < memblock.reserved.cnt; i++) { 711 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
497 u64 upper = memblock.reserved.region[i].base + 712
498 memblock.reserved.region[i].size - 1; 713 for (i = 0; i < region->cnt; i++) {
499 if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) 714 base = region->regions[i].base;
500 return 1; 715 size = region->regions[i].size;
716
717 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
718 name, i, base, base + size - 1, size);
501 } 719 }
502 return 0;
503} 720}
504 721
505int memblock_is_region_reserved(u64 base, u64 size) 722void __init_memblock memblock_dump_all(void)
506{ 723{
507 return memblock_overlaps_region(&memblock.reserved, base, size); 724 if (!memblock_debug)
725 return;
726
727 pr_info("MEMBLOCK configuration:\n");
728 pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
729
730 memblock_dump(&memblock.memory, "memory");
731 memblock_dump(&memblock.reserved, "reserved");
508} 732}
509 733
510/* 734void __init memblock_analyze(void)
511 * Given a <base, len>, find which memory regions belong to this range.
512 * Adjust the request and return a contiguous chunk.
513 */
514int memblock_find(struct memblock_property *res)
515{ 735{
516 int i; 736 int i;
517 u64 rstart, rend;
518 737
519 rstart = res->base; 738 /* Check marker in the unused last array entry */
520 rend = rstart + res->size - 1; 739 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
740 != (phys_addr_t)RED_INACTIVE);
741 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
742 != (phys_addr_t)RED_INACTIVE);
743
744 memblock.memory_size = 0;
745
746 for (i = 0; i < memblock.memory.cnt; i++)
747 memblock.memory_size += memblock.memory.regions[i].size;
748
749 /* We allow resizing from there */
750 memblock_can_resize = 1;
751}
752
753void __init memblock_init(void)
754{
755 static int init_done __initdata = 0;
756
757 if (init_done)
758 return;
759 init_done = 1;
760
761 /* Hookup the initial arrays */
762 memblock.memory.regions = memblock_memory_init_regions;
763 memblock.memory.max = INIT_MEMBLOCK_REGIONS;
764 memblock.reserved.regions = memblock_reserved_init_regions;
765 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
766
767 /* Write a marker in the unused last array entry */
768 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
769 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
770
771 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
772 * This simplifies the memblock_add() code below...
773 */
774 memblock.memory.regions[0].base = 0;
775 memblock.memory.regions[0].size = 0;
776 memblock.memory.cnt = 1;
777
778 /* Ditto. */
779 memblock.reserved.regions[0].base = 0;
780 memblock.reserved.regions[0].size = 0;
781 memblock.reserved.cnt = 1;
782
783 memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
784}
785
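memblock_init() above hooks static arrays into the region lists, writes a red-zone marker into the one-past-the-end slot so memblock_analyze() can later detect overflow, and seeds each list with a single zero-size entry so the first real add only has to extend or merge, never special-case an empty list. A toy sketch of that setup (userspace C, invented names and a plain sentinel value in place of the kernel's poison constant):

	#include <stdint.h>
	#include <stdio.h>

	#define INIT_REGIONS	8
	#define RED_MARKER	0x09F911029D74E35BULL	/* stand-in for the real poison value */

	struct region { uint64_t base, size; };

	struct region_list {
		struct region *regions;
		unsigned int cnt, max;
	};

	/* One spare slot at the end carries the overflow marker. */
	static struct region static_regions[INIT_REGIONS + 1];
	static struct region_list memory;

	static void region_list_init(void)
	{
		memory.regions = static_regions;
		memory.max = INIT_REGIONS;

		/* Marker in the unused last entry; checked again after bootstrap. */
		memory.regions[INIT_REGIONS].base = RED_MARKER;

		/* Dummy zero-size entry: add/merge code never sees an empty list. */
		memory.regions[0].base = 0;
		memory.regions[0].size = 0;
		memory.cnt = 1;
	}

	int main(void)
	{
		region_list_init();
		printf("cnt=%u, marker intact=%d\n", memory.cnt,
		       memory.regions[INIT_REGIONS].base == RED_MARKER);
		return 0;
	}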
786static int __init early_memblock(char *p)
787{
788 if (p && strstr(p, "debug"))
789 memblock_debug = 1;
790 return 0;
791}
792early_param("memblock", early_memblock);
793
794#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
795
796static int memblock_debug_show(struct seq_file *m, void *private)
797{
798 struct memblock_type *type = m->private;
799 struct memblock_region *reg;
800 int i;
801
802 for (i = 0; i < type->cnt; i++) {
803 reg = &type->regions[i];
804 seq_printf(m, "%4d: ", i);
805 if (sizeof(phys_addr_t) == 4)
806 seq_printf(m, "0x%08lx..0x%08lx\n",
807 (unsigned long)reg->base,
808 (unsigned long)(reg->base + reg->size - 1));
809 else
810 seq_printf(m, "0x%016llx..0x%016llx\n",
811 (unsigned long long)reg->base,
812 (unsigned long long)(reg->base + reg->size - 1));
521 813
522 for (i = 0; i < memblock.memory.cnt; i++) {
523 u64 start = memblock.memory.region[i].base;
524 u64 end = start + memblock.memory.region[i].size - 1;
525
526 if (start > rend)
527 return -1;
528
529 if ((end >= rstart) && (start < rend)) {
530 /* adjust the request */
531 if (rstart < start)
532 rstart = start;
533 if (rend > end)
534 rend = end;
535 res->base = rstart;
536 res->size = rend - rstart + 1;
537 return 0;
538 }
539 } 814 }
540 return -1; 815 return 0;
816}
817
818static int memblock_debug_open(struct inode *inode, struct file *file)
819{
820 return single_open(file, memblock_debug_show, inode->i_private);
541} 821}
822
823static const struct file_operations memblock_debug_fops = {
824 .open = memblock_debug_open,
825 .read = seq_read,
826 .llseek = seq_lseek,
827 .release = single_release,
828};
829
830static int __init memblock_init_debugfs(void)
831{
832 struct dentry *root = debugfs_create_dir("memblock", NULL);
833 if (!root)
834 return -ENXIO;
835 debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
836 debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
837
838 return 0;
839}
840__initcall(memblock_init_debugfs);
841
842#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af..9a99cfaf0a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,10 +47,13 @@
47#include <linux/mm_inline.h> 47#include <linux/mm_inline.h>
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/oom.h>
50#include "internal.h" 51#include "internal.h"
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
53 54
55#include <trace/events/vmscan.h>
56
54struct cgroup_subsys mem_cgroup_subsys __read_mostly; 57struct cgroup_subsys mem_cgroup_subsys __read_mostly;
55#define MEM_CGROUP_RECLAIM_RETRIES 5 58#define MEM_CGROUP_RECLAIM_RETRIES 5
56struct mem_cgroup *root_mem_cgroup __read_mostly; 59struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -86,7 +89,10 @@ enum mem_cgroup_stat_index {
86 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
87 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 92 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
93 /* incremented at every pagein/pageout */
94 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
95 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
90 96
91 MEM_CGROUP_STAT_NSTATS, 97 MEM_CGROUP_STAT_NSTATS,
92}; 98};
@@ -211,8 +217,6 @@ struct mem_cgroup {
211 */ 217 */
212 spinlock_t reclaim_param_lock; 218 spinlock_t reclaim_param_lock;
213 219
214 int prev_priority; /* for recording reclaim priority */
215
216 /* 220 /*
217 * While reclaiming in a hierarchy, we cache the last child we 221 * While reclaiming in a hierarchy, we cache the last child we
218 * reclaimed from. 222 * reclaimed from.
@@ -253,6 +257,12 @@ struct mem_cgroup {
253 * percpu counter. 257 * percpu counter.
254 */ 258 */
255 struct mem_cgroup_stat_cpu *stat; 259 struct mem_cgroup_stat_cpu *stat;
260 /*
261 * used when a cpu is offlined or other synchronizations
262 * See mem_cgroup_read_stat().
263 */
264 struct mem_cgroup_stat_cpu nocpu_base;
265 spinlock_t pcp_counter_lock;
256}; 266};
257 267
258/* Stuffs for move charges at task migration. */ 268/* Stuffs for move charges at task migration. */
@@ -268,6 +278,7 @@ enum move_type {
268 278
269/* "mc" and its members are protected by cgroup_mutex */ 279/* "mc" and its members are protected by cgroup_mutex */
270static struct move_charge_struct { 280static struct move_charge_struct {
281 spinlock_t lock; /* for from, to, moving_task */
271 struct mem_cgroup *from; 282 struct mem_cgroup *from;
272 struct mem_cgroup *to; 283 struct mem_cgroup *to;
273 unsigned long precharge; 284 unsigned long precharge;
@@ -276,6 +287,7 @@ static struct move_charge_struct {
276 struct task_struct *moving_task; /* a task moving charges */ 287 struct task_struct *moving_task; /* a task moving charges */
277 wait_queue_head_t waitq; /* a waitq for other context */ 288 wait_queue_head_t waitq; /* a waitq for other context */
278} mc = { 289} mc = {
290 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
279 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 291 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
280}; 292};
281 293
@@ -527,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
527 return mz; 539 return mz;
528} 540}
529 541
542/*
543 * Implementation Note: reading percpu statistics for memcg.
544 *
 545 * Both vmstat[] and percpu_counter use thresholds and periodic
 546 * synchronization to implement a "quick" read. There is a trade-off between
 547 * reading cost and precision of the value, so we may eventually implement
 548 * a periodic synchronization of the counters in memcg as well.
 549 *
 550 * But this _read() function is used for the user interface now. The user
 551 * accounts memory usage per memory cgroup and _always_ requires an exact
 552 * value for that accounting. Even with a quick-and-fuzzy read, we would
 553 * still have to visit all online cpus and sum them up. So, for now, the extra
 554 * synchronization is not implemented (it is only done for cpu hotplug).
 555 *
 556 * If there are kernel-internal users which can make do with a not-exact
 557 * value, and reading all cpu values becomes a performance bottleneck in some
 558 * common workload, thresholds and synchronization as in vmstat[] should be
 559 * implemented.
560 */
530static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 561static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
531 enum mem_cgroup_stat_index idx) 562 enum mem_cgroup_stat_index idx)
532{ 563{
533 int cpu; 564 int cpu;
534 s64 val = 0; 565 s64 val = 0;
535 566
536 for_each_possible_cpu(cpu) 567 get_online_cpus();
568 for_each_online_cpu(cpu)
537 val += per_cpu(mem->stat->count[idx], cpu); 569 val += per_cpu(mem->stat->count[idx], cpu);
570#ifdef CONFIG_HOTPLUG_CPU
571 spin_lock(&mem->pcp_counter_lock);
572 val += mem->nocpu_base.count[idx];
573 spin_unlock(&mem->pcp_counter_lock);
574#endif
575 put_online_cpus();
538 return val; 576 return val;
539} 577}
540 578
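The reworked mem_cgroup_read_stat() sums the online per-cpu counters and then folds in a per-memcg base that accumulates values drained from offlined cpus, taken under a spinlock. A userspace sketch of that read path using plain arrays and a mutex in place of the percpu machinery (all names illustrative):

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdbool.h>

	#define NR_CPUS 4

	struct counter {
		int64_t percpu[NR_CPUS];	/* stands in for the real percpu counters */
		bool online[NR_CPUS];
		int64_t nocpu_base;		/* carries values drained from dead cpus */
		pthread_mutex_t lock;
	};

	static int64_t counter_read(struct counter *c)
	{
		int64_t val = 0;

		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			if (c->online[cpu])
				val += c->percpu[cpu];

		pthread_mutex_lock(&c->lock);
		val += c->nocpu_base;		/* whatever dead cpus left behind */
		pthread_mutex_unlock(&c->lock);
		return val;
	}

	int main(void)
	{
		struct counter c = {
			.percpu = { 10, 20, 0, 5 },
			.online = { true, true, false, true },
			.nocpu_base = 7,	/* drained from the offlined cpu 2 */
			.lock = PTHREAD_MUTEX_INITIALIZER,
		};

		printf("total = %lld\n", (long long)counter_read(&c));	/* 42 */
		return 0;
	}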
@@ -656,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
656 return mem; 694 return mem;
657} 695}
658 696
659/* 697/* The caller has to guarantee "mem" exists before calling this */
660 * Call callback function against all cgroup under hierarchy tree. 698static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
661 */
662static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
663 int (*func)(struct mem_cgroup *, void *))
664{ 699{
665 int found, ret, nextid;
666 struct cgroup_subsys_state *css; 700 struct cgroup_subsys_state *css;
667 struct mem_cgroup *mem; 701 int found;
668
669 if (!root->use_hierarchy)
670 return (*func)(root, data);
671 702
672 nextid = 1; 703 if (!mem) /* ROOT cgroup has the smallest ID */
673 do { 704 return root_mem_cgroup; /*css_put/get against root is ignored*/
674 ret = 0; 705 if (!mem->use_hierarchy) {
706 if (css_tryget(&mem->css))
707 return mem;
708 return NULL;
709 }
710 rcu_read_lock();
711 /*
712 * searching a memory cgroup which has the smallest ID under given
713 * ROOT cgroup. (ID >= 1)
714 */
715 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
716 if (css && css_tryget(css))
717 mem = container_of(css, struct mem_cgroup, css);
718 else
675 mem = NULL; 719 mem = NULL;
720 rcu_read_unlock();
721 return mem;
722}
723
724static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
725 struct mem_cgroup *root,
726 bool cond)
727{
728 int nextid = css_id(&iter->css) + 1;
729 int found;
730 int hierarchy_used;
731 struct cgroup_subsys_state *css;
676 732
733 hierarchy_used = iter->use_hierarchy;
734
735 css_put(&iter->css);
736 /* If no ROOT, walk all, ignore hierarchy */
737 if (!cond || (root && !hierarchy_used))
738 return NULL;
739
740 if (!root)
741 root = root_mem_cgroup;
742
743 do {
744 iter = NULL;
677 rcu_read_lock(); 745 rcu_read_lock();
678 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 746
679 &found); 747 css = css_get_next(&mem_cgroup_subsys, nextid,
748 &root->css, &found);
680 if (css && css_tryget(css)) 749 if (css && css_tryget(css))
681 mem = container_of(css, struct mem_cgroup, css); 750 iter = container_of(css, struct mem_cgroup, css);
682 rcu_read_unlock(); 751 rcu_read_unlock();
683 752 /* If css is NULL, no more cgroups will be found */
684 if (mem) {
685 ret = (*func)(mem, data);
686 css_put(&mem->css);
687 }
688 nextid = found + 1; 753 nextid = found + 1;
689 } while (!ret && css); 754 } while (css && !iter);
690 755
691 return ret; 756 return iter;
692} 757}
758/*
 759 * for_each_mem_cgroup_tree() for visiting all cgroups under a tree. Please
 760 * be careful that breaking out of the loop is not allowed: we hold a reference count.
 761 * Instead, set "cond" to false and "continue" to exit the loop.
762 */
763#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
764 for (iter = mem_cgroup_start_loop(root);\
765 iter != NULL;\
766 iter = mem_cgroup_get_next(iter, root, cond))
767
768#define for_each_mem_cgroup_tree(iter, root) \
769 for_each_mem_cgroup_tree_cond(iter, root, true)
770
771#define for_each_mem_cgroup_all(iter) \
772 for_each_mem_cgroup_tree_cond(iter, NULL, true)
773
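The cond-controlled macros above exist because each step of the walk holds a reference on the current cgroup, and a bare "break" would leak it; the step function always drops the previous reference before deciding whether to continue. A toy model of that shape with a refcounted list instead of css-backed memcgs (everything here is invented for illustration):

	#include <stdio.h>
	#include <stdbool.h>

	/* Toy refcounted nodes standing in for css-backed memcgs. */
	struct node {
		const char *name;
		int refcnt;
		struct node *next;
	};

	static struct node *node_get(struct node *n) { if (n) n->refcnt++; return n; }
	static void node_put(struct node *n) { if (n) n->refcnt--; }

	/* Start of the walk: take a reference on the first node. */
	static struct node *walk_start(struct node *root)
	{
		return node_get(root);
	}

	/*
	 * Advance the walk: always drop the reference on the current node first,
	 * and stop (return NULL) when the caller cleared 'cond' instead of breaking.
	 */
	static struct node *walk_next(struct node *iter, bool cond)
	{
		struct node *next = iter->next;

		node_put(iter);
		if (!cond)
			return NULL;
		return node_get(next);
	}

	#define for_each_node_cond(iter, root, cond) \
		for ((iter) = walk_start(root); (iter) != NULL; (iter) = walk_next((iter), (cond)))

	int main(void)
	{
		struct node c = { "c", 0, NULL }, b = { "b", 0, &c }, a = { "a", 0, &b };
		struct node *iter;
		bool cond = true;

		for_each_node_cond(iter, &a, cond) {
			printf("%s refcnt=%d\n", iter->name, iter->refcnt);
			if (iter == &b)
				cond = false;	/* "continue" with cond cleared, never break */
		}
		printf("leaked refs: %d %d %d\n", a.refcnt, b.refcnt, c.refcnt);	/* all back to 0 */
		return 0;
	}

With a plain break in the loop body, the reference taken on the current node would never be dropped, which is exactly the situation the comment warns about.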
693 774
694static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 775static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
695{ 776{
@@ -836,12 +917,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
836{ 917{
837 int ret; 918 int ret;
838 struct mem_cgroup *curr = NULL; 919 struct mem_cgroup *curr = NULL;
920 struct task_struct *p;
839 921
840 task_lock(task); 922 p = find_lock_task_mm(task);
841 rcu_read_lock(); 923 if (!p)
842 curr = try_get_mem_cgroup_from_mm(task->mm); 924 return 0;
843 rcu_read_unlock(); 925 curr = try_get_mem_cgroup_from_mm(p->mm);
844 task_unlock(task); 926 task_unlock(p);
845 if (!curr) 927 if (!curr)
846 return 0; 928 return 0;
847 /* 929 /*
@@ -858,35 +940,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
858 return ret; 940 return ret;
859} 941}
860 942
861/*
862 * prev_priority control...this will be used in memory reclaim path.
863 */
864int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
865{
866 int prev_priority;
867
868 spin_lock(&mem->reclaim_param_lock);
869 prev_priority = mem->prev_priority;
870 spin_unlock(&mem->reclaim_param_lock);
871
872 return prev_priority;
873}
874
875void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
876{
877 spin_lock(&mem->reclaim_param_lock);
878 if (priority < mem->prev_priority)
879 mem->prev_priority = priority;
880 spin_unlock(&mem->reclaim_param_lock);
881}
882
883void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
884{
885 spin_lock(&mem->reclaim_param_lock);
886 mem->prev_priority = priority;
887 spin_unlock(&mem->reclaim_param_lock);
888}
889
890static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 943static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
891{ 944{
892 unsigned long active; 945 unsigned long active;
@@ -944,7 +997,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
944 struct zone *zone, 997 struct zone *zone,
945 enum lru_list lru) 998 enum lru_list lru)
946{ 999{
947 int nid = zone->zone_pgdat->node_id; 1000 int nid = zone_to_nid(zone);
948 int zid = zone_idx(zone); 1001 int zid = zone_idx(zone);
949 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1002 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
950 1003
@@ -954,7 +1007,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
954struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1007struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
955 struct zone *zone) 1008 struct zone *zone)
956{ 1009{
957 int nid = zone->zone_pgdat->node_id; 1010 int nid = zone_to_nid(zone);
958 int zid = zone_idx(zone); 1011 int zid = zone_idx(zone);
959 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1012 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
960 1013
@@ -999,7 +1052,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
999 LIST_HEAD(pc_list); 1052 LIST_HEAD(pc_list);
1000 struct list_head *src; 1053 struct list_head *src;
1001 struct page_cgroup *pc, *tmp; 1054 struct page_cgroup *pc, *tmp;
1002 int nid = z->zone_pgdat->node_id; 1055 int nid = zone_to_nid(z);
1003 int zid = zone_idx(z); 1056 int zid = zone_idx(z);
1004 struct mem_cgroup_per_zone *mz; 1057 struct mem_cgroup_per_zone *mz;
1005 int lru = LRU_FILE * file + active; 1058 int lru = LRU_FILE * file + active;
@@ -1038,6 +1091,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1038 } 1091 }
1039 1092
1040 *scanned = scan; 1093 *scanned = scan;
1094
1095 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1096 0, 0, 0, mode);
1097
1041 return nr_taken; 1098 return nr_taken;
1042} 1099}
1043 1100
@@ -1072,11 +1129,90 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1072 return swappiness; 1129 return swappiness;
1073} 1130}
1074 1131
1075static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 1132static void mem_cgroup_start_move(struct mem_cgroup *mem)
1076{ 1133{
1077 int *val = data; 1134 int cpu;
1078 (*val)++; 1135
1079 return 0; 1136 get_online_cpus();
1137 spin_lock(&mem->pcp_counter_lock);
1138 for_each_online_cpu(cpu)
1139 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1140 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1141 spin_unlock(&mem->pcp_counter_lock);
1142 put_online_cpus();
1143
1144 synchronize_rcu();
1145}
1146
1147static void mem_cgroup_end_move(struct mem_cgroup *mem)
1148{
1149 int cpu;
1150
1151 if (!mem)
1152 return;
1153 get_online_cpus();
1154 spin_lock(&mem->pcp_counter_lock);
1155 for_each_online_cpu(cpu)
1156 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1157 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1158 spin_unlock(&mem->pcp_counter_lock);
1159 put_online_cpus();
1160}
1161/*
 1162 * Two routines for checking whether "mem" is under move_account() or not.
 1163 *
 1164 * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
 1165 * to avoid races in accounting. If true,
 1166 * pc->mem_cgroup may be overwritten.
 1167 *
 1168 * mem_cgroup_under_move() - checks whether a cgroup is mc.from, mc.to, or
 1169 * under the hierarchy of moving cgroups. This is used for
 1170 * waiting at high memory pressure caused by "move".
1171 */
1172
1173static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1174{
1175 VM_BUG_ON(!rcu_read_lock_held());
1176 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1177}
1178
1179static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1180{
1181 struct mem_cgroup *from;
1182 struct mem_cgroup *to;
1183 bool ret = false;
1184 /*
1185 * Unlike task_move routines, we access mc.to, mc.from not under
1186 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1187 */
1188 spin_lock(&mc.lock);
1189 from = mc.from;
1190 to = mc.to;
1191 if (!from)
1192 goto unlock;
1193 if (from == mem || to == mem
1194 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1195 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1196 ret = true;
1197unlock:
1198 spin_unlock(&mc.lock);
1199 return ret;
1200}
1201
1202static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1203{
1204 if (mc.moving_task && current != mc.moving_task) {
1205 if (mem_cgroup_under_move(mem)) {
1206 DEFINE_WAIT(wait);
1207 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1208 /* moving charge context might have finished. */
1209 if (mc.moving_task)
1210 schedule();
1211 finish_wait(&mc.waitq, &wait);
1212 return true;
1213 }
1214 }
1215 return false;
1080} 1216}
1081 1217
1082/** 1218/**
@@ -1153,11 +1289,32 @@ done:
1153static int mem_cgroup_count_children(struct mem_cgroup *mem) 1289static int mem_cgroup_count_children(struct mem_cgroup *mem)
1154{ 1290{
1155 int num = 0; 1291 int num = 0;
1156 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1292 struct mem_cgroup *iter;
1293
1294 for_each_mem_cgroup_tree(iter, mem)
1295 num++;
1157 return num; 1296 return num;
1158} 1297}
1159 1298
1160/* 1299/*
1300 * Return the memory (and swap, if configured) limit for a memcg.
1301 */
1302u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1303{
1304 u64 limit;
1305 u64 memsw;
1306
1307 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1308 total_swap_pages;
1309 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1310 /*
1311 * If memsw is finite and limits the amount of swap space available
1312 * to this memcg, return that limit.
1313 */
1314 return min(limit, memsw);
1315}
1316
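mem_cgroup_get_limit() picks the tighter of two bounds: the memory limit plus all of swap, or the mem+swap limit itself when that is finite. A one-function sketch of the arithmetic (values in pages, all numbers invented):

	#include <stdint.h>
	#include <stdio.h>

	/* Tighter of (memory limit + total swap) and the mem+swap limit. */
	static uint64_t memcg_limit(uint64_t res_limit, uint64_t total_swap, uint64_t memsw_limit)
	{
		uint64_t limit = res_limit + total_swap;

		return limit < memsw_limit ? limit : memsw_limit;
	}

	int main(void)
	{
		/* 512 MiB memory limit, 1 GiB of swap, 768 MiB mem+swap limit (4 KiB pages). */
		printf("%llu pages\n", (unsigned long long)memcg_limit(131072, 262144, 196608));
		return 0;
	}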
1317/*
1161 * Visit the first child (need not be the first child as per the ordering 1318 * Visit the first child (need not be the first child as per the ordering
1162 * of the cgroup list, since we track last_scanned_child) of @mem and use 1319 * of the cgroup list, since we track last_scanned_child) of @mem and use
1163 * that to reclaim free pages from. 1320 * that to reclaim free pages from.
@@ -1262,8 +1419,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1262 /* we use swappiness of local cgroup */ 1419 /* we use swappiness of local cgroup */
1263 if (check_soft) 1420 if (check_soft)
1264 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1421 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1265 noswap, get_swappiness(victim), zone, 1422 noswap, get_swappiness(victim), zone);
1266 zone->zone_pgdat->node_id);
1267 else 1423 else
1268 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1424 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1269 noswap, get_swappiness(victim)); 1425 noswap, get_swappiness(victim));
@@ -1285,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1285 return total; 1441 return total;
1286} 1442}
1287 1443
1288static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1289{
1290 int *val = (int *)data;
1291 int x;
1292 /*
1293 * Logically, we can stop scanning immediately when we find
1294 * a memcg is already locked. But condidering unlock ops and
1295 * creation/removal of memcg, scan-all is simple operation.
1296 */
1297 x = atomic_inc_return(&mem->oom_lock);
1298 *val = max(x, *val);
1299 return 0;
1300}
1301/* 1444/*
1302 * Check OOM-Killer is already running under our hierarchy. 1445 * Check OOM-Killer is already running under our hierarchy.
1303 * If someone is running, return false. 1446 * If someone is running, return false.
1304 */ 1447 */
1305static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1448static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1306{ 1449{
1307 int lock_count = 0; 1450 int x, lock_count = 0;
1451 struct mem_cgroup *iter;
1308 1452
1309 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); 1453 for_each_mem_cgroup_tree(iter, mem) {
1454 x = atomic_inc_return(&iter->oom_lock);
1455 lock_count = max(x, lock_count);
1456 }
1310 1457
1311 if (lock_count == 1) 1458 if (lock_count == 1)
1312 return true; 1459 return true;
1313 return false; 1460 return false;
1314} 1461}
1315 1462
1316static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) 1463static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1317{ 1464{
1465 struct mem_cgroup *iter;
1466
1318 /* 1467 /*
1319 * When a new child is created while the hierarchy is under oom, 1468 * When a new child is created while the hierarchy is under oom,
1320 * mem_cgroup_oom_lock() may not be called. We have to use 1469 * mem_cgroup_oom_lock() may not be called. We have to use
1321 * atomic_add_unless() here. 1470 * atomic_add_unless() here.
1322 */ 1471 */
1323 atomic_add_unless(&mem->oom_lock, -1, 0); 1472 for_each_mem_cgroup_tree(iter, mem)
1473 atomic_add_unless(&iter->oom_lock, -1, 0);
1324 return 0; 1474 return 0;
1325} 1475}
1326 1476
1327static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1328{
1329 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1330}
1331 1477
1332static DEFINE_MUTEX(memcg_oom_mutex); 1478static DEFINE_MUTEX(memcg_oom_mutex);
1333static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1479static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1370,7 +1516,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1370 1516
1371static void memcg_oom_recover(struct mem_cgroup *mem) 1517static void memcg_oom_recover(struct mem_cgroup *mem)
1372{ 1518{
1373 if (atomic_read(&mem->oom_lock)) 1519 if (mem && atomic_read(&mem->oom_lock))
1374 memcg_wakeup_oom(mem); 1520 memcg_wakeup_oom(mem);
1375} 1521}
1376 1522
@@ -1425,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1425/* 1571/*
1426 * Currently used to update mapped file statistics, but the routine can be 1572 * Currently used to update mapped file statistics, but the routine can be
1427 * generalized to update other statistics as well. 1573 * generalized to update other statistics as well.
1574 *
1575 * Notes: Race condition
1576 *
 1577 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
 1578 * it tends to be costly. Under some conditions we don't need
 1579 * to do so _always_.
 1580 *
 1581 * Considering "charge", lock_page_cgroup() is not required because all
 1582 * file-stat operations happen after a page is attached to the radix-tree. There
 1583 * is no race with "charge".
 1584 *
 1585 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 1586 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
 1587 * if there is a race with "uncharge". The statistics themselves are properly handled
 1588 * by flags.
 1589 *
 1590 * Considering "move", this is the only case where we see a race. To make the race
 1591 * window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect the
 1592 * possibility of a race condition. If there is one, we take a lock.
1428 */ 1593 */
1429void mem_cgroup_update_file_mapped(struct page *page, int val) 1594
1595static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1430{ 1596{
1431 struct mem_cgroup *mem; 1597 struct mem_cgroup *mem;
1432 struct page_cgroup *pc; 1598 struct page_cgroup *pc = lookup_page_cgroup(page);
1599 bool need_unlock = false;
1433 1600
1434 pc = lookup_page_cgroup(page);
1435 if (unlikely(!pc)) 1601 if (unlikely(!pc))
1436 return; 1602 return;
1437 1603
1438 lock_page_cgroup(pc); 1604 rcu_read_lock();
1439 mem = pc->mem_cgroup; 1605 mem = pc->mem_cgroup;
1440 if (!mem || !PageCgroupUsed(pc)) 1606 if (unlikely(!mem || !PageCgroupUsed(pc)))
1441 goto done; 1607 goto out;
1608 /* pc->mem_cgroup is unstable ? */
1609 if (unlikely(mem_cgroup_stealed(mem))) {
1610 /* take a lock against to access pc->mem_cgroup */
1611 lock_page_cgroup(pc);
1612 need_unlock = true;
1613 mem = pc->mem_cgroup;
1614 if (!mem || !PageCgroupUsed(pc))
1615 goto out;
1616 }
1442 1617
1443 /* 1618 this_cpu_add(mem->stat->count[idx], val);
1444 * Preemption is already disabled. We can use __this_cpu_xxx 1619
1445 */ 1620 switch (idx) {
1446 if (val > 0) { 1621 case MEM_CGROUP_STAT_FILE_MAPPED:
1447 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1622 if (val > 0)
1448 SetPageCgroupFileMapped(pc); 1623 SetPageCgroupFileMapped(pc);
1449 } else { 1624 else if (!page_mapped(page))
1450 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1625 ClearPageCgroupFileMapped(pc);
1451 ClearPageCgroupFileMapped(pc); 1626 break;
1627 default:
1628 BUG();
1452 } 1629 }
1453 1630
1454done: 1631out:
1455 unlock_page_cgroup(pc); 1632 if (unlikely(need_unlock))
1633 unlock_page_cgroup(pc);
1634 rcu_read_unlock();
1635 return;
1636}
1637
1638void mem_cgroup_update_file_mapped(struct page *page, int val)
1639{
1640 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1456} 1641}
1457 1642
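The new mem_cgroup_update_file_stat() applies the policy spelled out in the comment block above: under rcu, read the owner pointer optimistically and only fall back to the page-level lock when a concurrent move is flagged. A simplified userspace sketch of that optimistic-with-fallback shape, using a mutex and an atomic "moving" flag in place of the kernel primitives (all names invented):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	struct group { long file_mapped; };

	struct page_info {
		struct group *owner;		/* may be rewritten while a move is in flight */
		pthread_mutex_t lock;
	};

	static atomic_int move_in_progress;	/* stands in for MEM_CGROUP_ON_MOVE */

	static void update_file_mapped(struct page_info *pi, int val)
	{
		struct group *g = pi->owner;	/* optimistic, lockless read */
		int locked = 0;

		if (atomic_load(&move_in_progress) > 0) {
			/* owner may change under us: retake it under the lock */
			pthread_mutex_lock(&pi->lock);
			locked = 1;
			g = pi->owner;
		}

		if (g)
			g->file_mapped += val;

		if (locked)
			pthread_mutex_unlock(&pi->lock);
	}

	int main(void)
	{
		struct group grp = { 0 };
		struct page_info pi = { &grp, PTHREAD_MUTEX_INITIALIZER };

		update_file_mapped(&pi, 1);		/* fast path */
		atomic_store(&move_in_progress, 1);
		update_file_mapped(&pi, 1);		/* slow path under the lock */
		printf("file_mapped = %ld\n", grp.file_mapped);
		return 0;
	}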
1458/* 1643/*
@@ -1568,30 +1753,137 @@ static void drain_all_stock_sync(void)
1568 atomic_dec(&memcg_drain_count); 1753 atomic_dec(&memcg_drain_count);
1569} 1754}
1570 1755
1571static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 1756/*
 1757 * This function drains the percpu counter values from a DEAD cpu and
 1758 * moves them to the local cpu. Note that this function can be preempted.
1759 */
1760static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1761{
1762 int i;
1763
1764 spin_lock(&mem->pcp_counter_lock);
1765 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1766 s64 x = per_cpu(mem->stat->count[i], cpu);
1767
1768 per_cpu(mem->stat->count[i], cpu) = 0;
1769 mem->nocpu_base.count[i] += x;
1770 }
1771 /* need to clear ON_MOVE value, works as a kind of lock. */
1772 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1773 spin_unlock(&mem->pcp_counter_lock);
1774}
1775
1776static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
1777{
1778 int idx = MEM_CGROUP_ON_MOVE;
1779
1780 spin_lock(&mem->pcp_counter_lock);
1781 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
1782 spin_unlock(&mem->pcp_counter_lock);
1783}
1784
1785static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1572 unsigned long action, 1786 unsigned long action,
1573 void *hcpu) 1787 void *hcpu)
1574{ 1788{
1575 int cpu = (unsigned long)hcpu; 1789 int cpu = (unsigned long)hcpu;
1576 struct memcg_stock_pcp *stock; 1790 struct memcg_stock_pcp *stock;
1791 struct mem_cgroup *iter;
1792
1793 if ((action == CPU_ONLINE)) {
1794 for_each_mem_cgroup_all(iter)
1795 synchronize_mem_cgroup_on_move(iter, cpu);
1796 return NOTIFY_OK;
1797 }
1577 1798
1578 if (action != CPU_DEAD) 1799 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
1579 return NOTIFY_OK; 1800 return NOTIFY_OK;
1801
1802 for_each_mem_cgroup_all(iter)
1803 mem_cgroup_drain_pcp_counter(iter, cpu);
1804
1580 stock = &per_cpu(memcg_stock, cpu); 1805 stock = &per_cpu(memcg_stock, cpu);
1581 drain_stock(stock); 1806 drain_stock(stock);
1582 return NOTIFY_OK; 1807 return NOTIFY_OK;
1583} 1808}
1584 1809
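mem_cgroup_drain_pcp_counter() above folds a dead cpu's per-cpu counts into the shared nocpu_base under the counter spinlock, so later reads (see mem_cgroup_read_stat() earlier in this diff) still account for them. A small sketch of that drain step using the same array-plus-mutex model as the earlier example (illustrative only):

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	#define NR_CPUS 4
	#define NR_STATS 3

	struct stats {
		int64_t percpu[NR_CPUS][NR_STATS];
		int64_t nocpu_base[NR_STATS];
		pthread_mutex_t lock;
	};

	/* Fold a dead cpu's counters into the shared base and zero them. */
	static void drain_cpu(struct stats *s, int cpu)
	{
		pthread_mutex_lock(&s->lock);
		for (int i = 0; i < NR_STATS; i++) {
			s->nocpu_base[i] += s->percpu[cpu][i];
			s->percpu[cpu][i] = 0;
		}
		pthread_mutex_unlock(&s->lock);
	}

	int main(void)
	{
		struct stats s = { .lock = PTHREAD_MUTEX_INITIALIZER };

		s.percpu[2][0] = 5;
		s.percpu[2][1] = -3;
		drain_cpu(&s, 2);		/* as if cpu 2 just went offline */
		printf("base: %lld %lld %lld\n", (long long)s.nocpu_base[0],
		       (long long)s.nocpu_base[1], (long long)s.nocpu_base[2]);
		return 0;
	}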
1810
1811/* See __mem_cgroup_try_charge() for details */
1812enum {
1813 CHARGE_OK, /* success */
1814 CHARGE_RETRY, /* need to retry but retry is not bad */
1815 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
1816 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
1817 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1818};
1819
1820static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1821 int csize, bool oom_check)
1822{
1823 struct mem_cgroup *mem_over_limit;
1824 struct res_counter *fail_res;
1825 unsigned long flags = 0;
1826 int ret;
1827
1828 ret = res_counter_charge(&mem->res, csize, &fail_res);
1829
1830 if (likely(!ret)) {
1831 if (!do_swap_account)
1832 return CHARGE_OK;
1833 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1834 if (likely(!ret))
1835 return CHARGE_OK;
1836
1837 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1838 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1839 } else
1840 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1841
1842 if (csize > PAGE_SIZE) /* change csize and retry */
1843 return CHARGE_RETRY;
1844
1845 if (!(gfp_mask & __GFP_WAIT))
1846 return CHARGE_WOULDBLOCK;
1847
1848 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1849 gfp_mask, flags);
1850 /*
1851 * try_to_free_mem_cgroup_pages() might not give us a full
1852 * picture of reclaim. Some pages are reclaimed and might be
1853 * moved to swap cache or just unmapped from the cgroup.
1854 * Check the limit again to see if the reclaim reduced the
1855 * current usage of the cgroup before giving up
1856 */
1857 if (ret || mem_cgroup_check_under_limit(mem_over_limit))
1858 return CHARGE_RETRY;
1859
1860 /*
1861 * At task move, charge accounts can be doubly counted. So, it's
1862 * better to wait until the end of task_move if something is going on.
1863 */
1864 if (mem_cgroup_wait_acct_move(mem_over_limit))
1865 return CHARGE_RETRY;
1866
 1867 /* If we don't need to call the oom-killer at all, return immediately */
1868 if (!oom_check)
1869 return CHARGE_NOMEM;
1870 /* check OOM */
1871 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1872 return CHARGE_OOM_DIE;
1873
1874 return CHARGE_RETRY;
1875}
1876
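__mem_cgroup_do_charge() condenses the old open-coded retry loop into a small state machine: try the charge once and report OK, RETRY, WOULDBLOCK or NOMEM (plus OOM_DIE in the real code), letting the caller decide whether to shrink the batch, sleep, or give up. A toy version of that return-code shape, with a fake resource counter standing in for res_counter (hypothetical throughout):

	#include <stdbool.h>
	#include <stdio.h>

	enum charge_result { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM, CHARGE_WOULDBLOCK };

	struct counter { long usage, limit; };

	static bool counter_charge(struct counter *c, long amount)
	{
		if (c->usage + amount > c->limit)
			return false;
		c->usage += amount;
		return true;
	}

	/* One attempt; the caller loops on CHARGE_RETRY with a smaller batch. */
	static enum charge_result do_charge(struct counter *c, long batch, long page,
					    bool can_wait, bool reclaimed_something)
	{
		if (counter_charge(c, batch))
			return CHARGE_OK;
		if (batch > page)
			return CHARGE_RETRY;		/* retry with batch shrunk to one page */
		if (!can_wait)
			return CHARGE_WOULDBLOCK;	/* caller may not sleep/reclaim */
		if (reclaimed_something)
			return CHARGE_RETRY;		/* reclaim made room, try again */
		return CHARGE_NOMEM;			/* give up (or hand off to OOM handling) */
	}

	int main(void)
	{
		struct counter c = { .usage = 28, .limit = 32 };
		long batch = 8, page = 1;
		enum charge_result r;

		while ((r = do_charge(&c, batch, page, true, false)) == CHARGE_RETRY)
			batch = page;			/* mimic csize falling back to PAGE_SIZE */

		printf("result=%d usage=%ld\n", r, c.usage);
		return 0;
	}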
1585/* 1877/*
1586 * Unlike exported interface, "oom" parameter is added. if oom==true, 1878 * Unlike exported interface, "oom" parameter is added. if oom==true,
1587 * oom-killer can be invoked. 1879 * oom-killer can be invoked.
1588 */ 1880 */
1589static int __mem_cgroup_try_charge(struct mm_struct *mm, 1881static int __mem_cgroup_try_charge(struct mm_struct *mm,
1590 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1882 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1591{ 1883{
1592 struct mem_cgroup *mem, *mem_over_limit; 1884 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1593 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1885 struct mem_cgroup *mem = NULL;
1594 struct res_counter *fail_res; 1886 int ret;
1595 int csize = CHARGE_SIZE; 1887 int csize = CHARGE_SIZE;
1596 1888
1597 /* 1889 /*
@@ -1609,126 +1901,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1609 * thread group leader migrates. It's possible that mm is not 1901 * thread group leader migrates. It's possible that mm is not
1610 * set, if so charge the init_mm (happens for pagecache usage). 1902 * set, if so charge the init_mm (happens for pagecache usage).
1611 */ 1903 */
1612 mem = *memcg; 1904 if (!*memcg && !mm)
1613 if (likely(!mem)) { 1905 goto bypass;
1614 mem = try_get_mem_cgroup_from_mm(mm); 1906again:
1615 *memcg = mem; 1907 if (*memcg) { /* css should be a valid one */
1616 } else { 1908 mem = *memcg;
1617 css_get(&mem->css); 1909 VM_BUG_ON(css_is_removed(&mem->css));
1618 } 1910 if (mem_cgroup_is_root(mem))
1619 if (unlikely(!mem)) 1911 goto done;
1620 return 0;
1621
1622 VM_BUG_ON(css_is_removed(&mem->css));
1623 if (mem_cgroup_is_root(mem))
1624 goto done;
1625
1626 while (1) {
1627 int ret = 0;
1628 unsigned long flags = 0;
1629
1630 if (consume_stock(mem)) 1912 if (consume_stock(mem))
1631 goto done; 1913 goto done;
1914 css_get(&mem->css);
1915 } else {
1916 struct task_struct *p;
1632 1917
1633 ret = res_counter_charge(&mem->res, csize, &fail_res); 1918 rcu_read_lock();
1634 if (likely(!ret)) { 1919 p = rcu_dereference(mm->owner);
1635 if (!do_swap_account) 1920 VM_BUG_ON(!p);
1636 break;
1637 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1638 if (likely(!ret))
1639 break;
1640 /* mem+swap counter fails */
1641 res_counter_uncharge(&mem->res, csize);
1642 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1643 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1644 memsw);
1645 } else
1646 /* mem counter fails */
1647 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1648 res);
1649
1650 /* reduce request size and retry */
1651 if (csize > PAGE_SIZE) {
1652 csize = PAGE_SIZE;
1653 continue;
1654 }
1655 if (!(gfp_mask & __GFP_WAIT))
1656 goto nomem;
1657
1658 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1659 gfp_mask, flags);
1660 if (ret)
1661 continue;
1662
1663 /* 1921 /*
1664 * try_to_free_mem_cgroup_pages() might not give us a full 1922 * because we don't have task_lock(), "p" can exit while
1665 * picture of reclaim. Some pages are reclaimed and might be 1923 * we're here. In that case, "mem" can point to root
1666 * moved to swap cache or just unmapped from the cgroup. 1924 * cgroup but never be NULL. (and task_struct itself is freed
1667 * Check the limit again to see if the reclaim reduced the 1925 * by RCU, cgroup itself is RCU safe.) Then, we have small
1668 * current usage of the cgroup before giving up 1926 * risk here to get wrong cgroup. But such kind of mis-account
1669 * 1927 * by race always happens because we don't have cgroup_mutex().
1928 * It's overkill and we allow that small race, here.
1670 */ 1929 */
1671 if (mem_cgroup_check_under_limit(mem_over_limit)) 1930 mem = mem_cgroup_from_task(p);
1672 continue; 1931 VM_BUG_ON(!mem);
1673 1932 if (mem_cgroup_is_root(mem)) {
1674 /* try to avoid oom while someone is moving charge */ 1933 rcu_read_unlock();
1675 if (mc.moving_task && current != mc.moving_task) { 1934 goto done;
1676 struct mem_cgroup *from, *to; 1935 }
1677 bool do_continue = false; 1936 if (consume_stock(mem)) {
1678 /* 1937 /*
1679 * There is a small race that "from" or "to" can be 1938 * It seems dagerous to access memcg without css_get().
1680 * freed by rmdir, so we use css_tryget(). 1939 * But considering how consume_stok works, it's not
1940 * necessary. If consume_stock success, some charges
1941 * from this memcg are cached on this cpu. So, we
1942 * don't need to call css_get()/css_tryget() before
1943 * calling consume_stock().
1681 */ 1944 */
1682 from = mc.from; 1945 rcu_read_unlock();
1683 to = mc.to; 1946 goto done;
1684 if (from && css_tryget(&from->css)) { 1947 }
1685 if (mem_over_limit->use_hierarchy) 1948 /* after here, we may be blocked. we need to get refcnt */
1686 do_continue = css_is_ancestor( 1949 if (!css_tryget(&mem->css)) {
1687 &from->css, 1950 rcu_read_unlock();
1688 &mem_over_limit->css); 1951 goto again;
1689 else
1690 do_continue = (from == mem_over_limit);
1691 css_put(&from->css);
1692 }
1693 if (!do_continue && to && css_tryget(&to->css)) {
1694 if (mem_over_limit->use_hierarchy)
1695 do_continue = css_is_ancestor(
1696 &to->css,
1697 &mem_over_limit->css);
1698 else
1699 do_continue = (to == mem_over_limit);
1700 css_put(&to->css);
1701 }
1702 if (do_continue) {
1703 DEFINE_WAIT(wait);
1704 prepare_to_wait(&mc.waitq, &wait,
1705 TASK_INTERRUPTIBLE);
1706 /* moving charge context might have finished. */
1707 if (mc.moving_task)
1708 schedule();
1709 finish_wait(&mc.waitq, &wait);
1710 continue;
1711 }
1712 } 1952 }
1953 rcu_read_unlock();
1954 }
1955
1956 do {
1957 bool oom_check;
1958
1959 /* If killed, bypass charge */
1960 if (fatal_signal_pending(current)) {
1961 css_put(&mem->css);
1962 goto bypass;
1963 }
1964
1965 oom_check = false;
1966 if (oom && !nr_oom_retries) {
1967 oom_check = true;
1968 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1969 }
1970
1971 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
1713 1972
1714 if (!nr_retries--) { 1973 switch (ret) {
1715 if (!oom) 1974 case CHARGE_OK:
1975 break;
1976 case CHARGE_RETRY: /* not in OOM situation but retry */
1977 csize = PAGE_SIZE;
1978 css_put(&mem->css);
1979 mem = NULL;
1980 goto again;
1981 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
1982 css_put(&mem->css);
1983 goto nomem;
1984 case CHARGE_NOMEM: /* OOM routine works */
1985 if (!oom) {
1986 css_put(&mem->css);
1716 goto nomem; 1987 goto nomem;
1717 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1718 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1719 continue;
1720 } 1988 }
1721 /* When we reach here, current task is dying .*/ 1989 /* If oom, we never return -ENOMEM */
1990 nr_oom_retries--;
1991 break;
1992 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
1722 css_put(&mem->css); 1993 css_put(&mem->css);
1723 goto bypass; 1994 goto bypass;
1724 } 1995 }
1725 } 1996 } while (ret != CHARGE_OK);
1997
1726 if (csize > PAGE_SIZE) 1998 if (csize > PAGE_SIZE)
1727 refill_stock(mem, csize - PAGE_SIZE); 1999 refill_stock(mem, csize - PAGE_SIZE);
2000 css_put(&mem->css);
1728done: 2001done:
2002 *memcg = mem;
1729 return 0; 2003 return 0;
1730nomem: 2004nomem:
1731 css_put(&mem->css); 2005 *memcg = NULL;
1732 return -ENOMEM; 2006 return -ENOMEM;
1733bypass: 2007bypass:
1734 *memcg = NULL; 2008 *memcg = NULL;
@@ -1747,11 +2021,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1747 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 2021 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1748 if (do_swap_account) 2022 if (do_swap_account)
1749 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 2023 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1750 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1751 WARN_ON_ONCE(count > INT_MAX);
1752 __css_put(&mem->css, (int)count);
1753 } 2024 }
1754 /* we don't need css_put for root */
1755} 2025}
1756 2026
1757static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 2027static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1979,10 +2249,9 @@ out:
1979 * < 0 if the cgroup is over its limit 2249 * < 0 if the cgroup is over its limit
1980 */ 2250 */
1981static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2251static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1982 gfp_t gfp_mask, enum charge_type ctype, 2252 gfp_t gfp_mask, enum charge_type ctype)
1983 struct mem_cgroup *memcg)
1984{ 2253{
1985 struct mem_cgroup *mem; 2254 struct mem_cgroup *mem = NULL;
1986 struct page_cgroup *pc; 2255 struct page_cgroup *pc;
1987 int ret; 2256 int ret;
1988 2257
@@ -1992,7 +2261,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1992 return 0; 2261 return 0;
1993 prefetchw(pc); 2262 prefetchw(pc);
1994 2263
1995 mem = memcg;
1996 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2264 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1997 if (ret || !mem) 2265 if (ret || !mem)
1998 return ret; 2266 return ret;
@@ -2020,7 +2288,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2020 if (unlikely(!mm)) 2288 if (unlikely(!mm))
2021 mm = &init_mm; 2289 mm = &init_mm;
2022 return mem_cgroup_charge_common(page, mm, gfp_mask, 2290 return mem_cgroup_charge_common(page, mm, gfp_mask,
2023 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 2291 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2024} 2292}
2025 2293
2026static void 2294static void
@@ -2030,7 +2298,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2030int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2298int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2031 gfp_t gfp_mask) 2299 gfp_t gfp_mask)
2032{ 2300{
2033 struct mem_cgroup *mem = NULL;
2034 int ret; 2301 int ret;
2035 2302
2036 if (mem_cgroup_disabled()) 2303 if (mem_cgroup_disabled())
@@ -2051,7 +2318,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2051 if (!(gfp_mask & __GFP_WAIT)) { 2318 if (!(gfp_mask & __GFP_WAIT)) {
2052 struct page_cgroup *pc; 2319 struct page_cgroup *pc;
2053 2320
2054
2055 pc = lookup_page_cgroup(page); 2321 pc = lookup_page_cgroup(page);
2056 if (!pc) 2322 if (!pc)
2057 return 0; 2323 return 0;
@@ -2063,22 +2329,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2063 unlock_page_cgroup(pc); 2329 unlock_page_cgroup(pc);
2064 } 2330 }
2065 2331
2066 if (unlikely(!mm && !mem)) 2332 if (unlikely(!mm))
2067 mm = &init_mm; 2333 mm = &init_mm;
2068 2334
2069 if (page_is_file_cache(page)) 2335 if (page_is_file_cache(page))
2070 return mem_cgroup_charge_common(page, mm, gfp_mask, 2336 return mem_cgroup_charge_common(page, mm, gfp_mask,
2071 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 2337 MEM_CGROUP_CHARGE_TYPE_CACHE);
2072 2338
2073 /* shmem */ 2339 /* shmem */
2074 if (PageSwapCache(page)) { 2340 if (PageSwapCache(page)) {
2341 struct mem_cgroup *mem = NULL;
2342
2075 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2343 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2076 if (!ret) 2344 if (!ret)
2077 __mem_cgroup_commit_charge_swapin(page, mem, 2345 __mem_cgroup_commit_charge_swapin(page, mem,
2078 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2346 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2079 } else 2347 } else
2080 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2348 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2081 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 2349 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2082 2350
2083 return ret; 2351 return ret;
2084} 2352}
@@ -2114,7 +2382,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2114 goto charge_cur_mm; 2382 goto charge_cur_mm;
2115 *ptr = mem; 2383 *ptr = mem;
2116 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2384 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2117 /* drop extra refcnt from tryget */
2118 css_put(&mem->css); 2385 css_put(&mem->css);
2119 return ret; 2386 return ret;
2120charge_cur_mm: 2387charge_cur_mm:
@@ -2245,7 +2512,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2245{ 2512{
2246 struct page_cgroup *pc; 2513 struct page_cgroup *pc;
2247 struct mem_cgroup *mem = NULL; 2514 struct mem_cgroup *mem = NULL;
2248 struct mem_cgroup_per_zone *mz;
2249 2515
2250 if (mem_cgroup_disabled()) 2516 if (mem_cgroup_disabled())
2251 return NULL; 2517 return NULL;
@@ -2285,10 +2551,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2285 break; 2551 break;
2286 } 2552 }
2287 2553
2288 if (!mem_cgroup_is_root(mem))
2289 __do_uncharge(mem, ctype);
2290 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2291 mem_cgroup_swap_statistics(mem, true);
2292 mem_cgroup_charge_statistics(mem, pc, false); 2554 mem_cgroup_charge_statistics(mem, pc, false);
2293 2555
2294 ClearPageCgroupUsed(pc); 2556 ClearPageCgroupUsed(pc);
@@ -2299,13 +2561,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2299 * special functions. 2561 * special functions.
2300 */ 2562 */
2301 2563
2302 mz = page_cgroup_zoneinfo(pc);
2303 unlock_page_cgroup(pc); 2564 unlock_page_cgroup(pc);
2304 2565 /*
2566 * even after unlock, we have mem->res.usage here and this memcg
2567 * will never be freed.
2568 */
2305 memcg_check_events(mem, page); 2569 memcg_check_events(mem, page);
2306 /* at swapout, this memcg will be accessed to record to swap */ 2570 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2307 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2571 mem_cgroup_swap_statistics(mem, true);
2308 css_put(&mem->css); 2572 mem_cgroup_get(mem);
2573 }
2574 if (!mem_cgroup_is_root(mem))
2575 __do_uncharge(mem, ctype);
2309 2576
2310 return mem; 2577 return mem;
2311 2578
@@ -2392,13 +2659,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2392 2659
2393 memcg = __mem_cgroup_uncharge_common(page, ctype); 2660 memcg = __mem_cgroup_uncharge_common(page, ctype);
2394 2661
2395 /* record memcg information */ 2662 /*
2396 if (do_swap_account && swapout && memcg) { 2663 * record memcg information, if swapout && memcg != NULL,
2664 * mem_cgroup_get() was called in uncharge().
2665 */
2666 if (do_swap_account && swapout && memcg)
2397 swap_cgroup_record(ent, css_id(&memcg->css)); 2667 swap_cgroup_record(ent, css_id(&memcg->css));
2398 mem_cgroup_get(memcg);
2399 }
2400 if (swapout && memcg)
2401 css_put(&memcg->css);
2402} 2668}
2403#endif 2669#endif
2404 2670
@@ -2476,7 +2742,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
2476 */ 2742 */
2477 if (!mem_cgroup_is_root(to)) 2743 if (!mem_cgroup_is_root(to))
2478 res_counter_uncharge(&to->res, PAGE_SIZE); 2744 res_counter_uncharge(&to->res, PAGE_SIZE);
2479 css_put(&to->css);
2480 } 2745 }
2481 return 0; 2746 return 0;
2482 } 2747 }
@@ -2611,11 +2876,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2611 ClearPageCgroupMigration(pc); 2876 ClearPageCgroupMigration(pc);
2612 unlock_page_cgroup(pc); 2877 unlock_page_cgroup(pc);
2613 2878
2614 if (unused != oldpage)
2615 pc = lookup_page_cgroup(unused);
2616 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 2879 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2617 2880
2618 pc = lookup_page_cgroup(used);
2619 /* 2881 /*
2620 * If a page is a file cache, radix-tree replacement is very atomic 2882 * If a page is a file cache, radix-tree replacement is very atomic
2621 * and we can skip this check. When it was an Anon page, its mapcount 2883 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2791,8 +3053,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2791} 3053}
2792 3054
2793unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3055unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2794 gfp_t gfp_mask, int nid, 3056 gfp_t gfp_mask)
2795 int zid)
2796{ 3057{
2797 unsigned long nr_reclaimed = 0; 3058 unsigned long nr_reclaimed = 0;
2798 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3059 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2804,7 +3065,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2804 if (order > 0) 3065 if (order > 0)
2805 return 0; 3066 return 0;
2806 3067
2807 mctz = soft_limit_tree_node_zone(nid, zid); 3068 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2808 /* 3069 /*
2809 * This loop can run a while, especially if mem_cgroups continuously 3070
2810 * keep exceeding their soft limit and putting the system under 3071 * keep exceeding their soft limit and putting the system under
@@ -2965,6 +3226,7 @@ move_account:
2965 lru_add_drain_all(); 3226 lru_add_drain_all();
2966 drain_all_stock_sync(); 3227 drain_all_stock_sync();
2967 ret = 0; 3228 ret = 0;
3229 mem_cgroup_start_move(mem);
2968 for_each_node_state(node, N_HIGH_MEMORY) { 3230 for_each_node_state(node, N_HIGH_MEMORY) {
2969 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3231 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2970 enum lru_list l; 3232 enum lru_list l;
@@ -2978,6 +3240,7 @@ move_account:
2978 if (ret) 3240 if (ret)
2979 break; 3241 break;
2980 } 3242 }
3243 mem_cgroup_end_move(mem);
2981 memcg_oom_recover(mem); 3244 memcg_oom_recover(mem);
2982 /* it seems parent cgroup doesn't have enough mem */ 3245 /* it seems parent cgroup doesn't have enough mem */
2983 if (ret == -ENOMEM) 3246 if (ret == -ENOMEM)
@@ -3064,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3064 return retval; 3327 return retval;
3065} 3328}
3066 3329
3067struct mem_cgroup_idx_data {
3068 s64 val;
3069 enum mem_cgroup_stat_index idx;
3070};
3071 3330
3072static int 3331static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3073mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3332 enum mem_cgroup_stat_index idx)
3074{ 3333{
3075 struct mem_cgroup_idx_data *d = data; 3334 struct mem_cgroup *iter;
3076 d->val += mem_cgroup_read_stat(mem, d->idx); 3335 s64 val = 0;
3077 return 0;
3078}
3079 3336
3080static void 3337 /* each per-cpu value can be negative, so use s64 */
3081mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3338 for_each_mem_cgroup_tree(iter, mem)
3082 enum mem_cgroup_stat_index idx, s64 *val) 3339 val += mem_cgroup_read_stat(iter, idx);
3083{ 3340
3084 struct mem_cgroup_idx_data d; 3341 if (val < 0) /* race ? */
3085 d.idx = idx; 3342 val = 0;
3086 d.val = 0; 3343 return val;
3087 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3088 *val = d.val;
3089} 3344}
3090 3345
3091static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3346static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3092{ 3347{
3093 u64 idx_val, val; 3348 u64 val;
3094 3349
3095 if (!mem_cgroup_is_root(mem)) { 3350 if (!mem_cgroup_is_root(mem)) {
3096 if (!swap) 3351 if (!swap)
@@ -3099,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3099 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3354 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3100 } 3355 }
3101 3356
3102 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3357 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
3103 val = idx_val; 3358 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
3104 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3105 val += idx_val;
3106 3359
3107 if (swap) { 3360 if (swap)
3108 mem_cgroup_get_recursive_idx_stat(mem, 3361 val += mem_cgroup_get_recursive_idx_stat(mem,
3109 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 3362 MEM_CGROUP_STAT_SWAPOUT);
3110 val += idx_val;
3111 }
3112 3363
3113 return val << PAGE_SHIFT; 3364 return val << PAGE_SHIFT;
3114} 3365}
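As a hedged illustration of the conversion above (a sketch, not part of the patch; the helper name is made up): the callback-based mem_cgroup_walk_tree() pattern is replaced by the for_each_mem_cgroup_tree() iterator, which visits a memcg and every descendant, so summing a counter over a sub-tree becomes a plain loop. mem_cgroup_usage() then converts the page count to bytes with val << PAGE_SHIFT (with 4 KB pages, 300 pages becomes 300 << 12 = 1228800 bytes).

/* Sketch only: sum one per-memcg statistic over the whole sub-tree. */
static s64 memcg_subtree_stat(struct mem_cgroup *mem,
                              enum mem_cgroup_stat_index idx)
{
        struct mem_cgroup *iter;
        s64 val = 0;

        for_each_mem_cgroup_tree(iter, mem)
                val += mem_cgroup_read_stat(iter, idx);

        /* per-cpu counters can transiently read negative */
        return val < 0 ? 0 : val;
}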
@@ -3316,9 +3567,9 @@ struct {
3316}; 3567};
3317 3568
3318 3569
3319static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3570static void
3571mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3320{ 3572{
3321 struct mcs_total_stat *s = data;
3322 s64 val; 3573 s64 val;
3323 3574
3324 /* per cpu stat */ 3575 /* per cpu stat */
@@ -3348,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3348 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3599 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3349 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3600 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3350 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3601 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3351 return 0;
3352} 3602}
3353 3603
3354static void 3604static void
3355mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3605mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3356{ 3606{
3357 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3607 struct mem_cgroup *iter;
3608
3609 for_each_mem_cgroup_tree(iter, mem)
3610 mem_cgroup_get_local_stat(iter, s);
3358} 3611}
3359 3612
3360static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3613static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3514,9 +3767,13 @@ unlock:
3514 3767
3515static void mem_cgroup_threshold(struct mem_cgroup *memcg) 3768static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3516{ 3769{
3517 __mem_cgroup_threshold(memcg, false); 3770 while (memcg) {
3518 if (do_swap_account) 3771 __mem_cgroup_threshold(memcg, false);
3519 __mem_cgroup_threshold(memcg, true); 3772 if (do_swap_account)
3773 __mem_cgroup_threshold(memcg, true);
3774
3775 memcg = parent_mem_cgroup(memcg);
3776 }
3520} 3777}
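The threshold check now propagates up the hierarchy: after handling the memcg that crossed a threshold, the loop climbs to each ancestor via parent_mem_cgroup(), which returns NULL at the root and ends the walk. A minimal sketch of that ancestor walk, assuming only the behaviour visible above (the helper itself is hypothetical):

/* Sketch only: apply fn to a memcg and every ancestor up to the root. */
static void memcg_for_self_and_ancestors(struct mem_cgroup *memcg,
                                         void (*fn)(struct mem_cgroup *))
{
        while (memcg) {
                fn(memcg);
                memcg = parent_mem_cgroup(memcg);
        }
}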
3521 3778
3522static int compare_thresholds(const void *a, const void *b) 3779static int compare_thresholds(const void *a, const void *b)
@@ -3527,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b)
3527 return _a->threshold - _b->threshold; 3784 return _a->threshold - _b->threshold;
3528} 3785}
3529 3786
3530static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 3787static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
3531{ 3788{
3532 struct mem_cgroup_eventfd_list *ev; 3789 struct mem_cgroup_eventfd_list *ev;
3533 3790
@@ -3538,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3538 3795
3539static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3796static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3540{ 3797{
3541 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 3798 struct mem_cgroup *iter;
3799
3800 for_each_mem_cgroup_tree(iter, mem)
3801 mem_cgroup_oom_notify_cb(iter);
3542} 3802}
3543 3803
3544static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3804static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -3759,8 +4019,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3759 return 0; 4019 return 0;
3760} 4020}
3761 4021
3762/*
3763 */
3764static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4022static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3765 struct cftype *cft, u64 val) 4023 struct cftype *cft, u64 val)
3766{ 4024{
@@ -3957,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
3957 vfree(mem); 4215 vfree(mem);
3958 mem = NULL; 4216 mem = NULL;
3959 } 4217 }
4218 spin_lock_init(&mem->pcp_counter_lock);
3960 return mem; 4219 return mem;
3961} 4220}
3962 4221
@@ -4083,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4083 &per_cpu(memcg_stock, cpu); 4342 &per_cpu(memcg_stock, cpu);
4084 INIT_WORK(&stock->work, drain_local_stock); 4343 INIT_WORK(&stock->work, drain_local_stock);
4085 } 4344 }
4086 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4345 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4087 } else { 4346 } else {
4088 parent = mem_cgroup_from_cont(cont->parent); 4347 parent = mem_cgroup_from_cont(cont->parent);
4089 mem->use_hierarchy = parent->use_hierarchy; 4348 mem->use_hierarchy = parent->use_hierarchy;
@@ -4180,9 +4439,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
4180 goto one_by_one; 4439 goto one_by_one;
4181 } 4440 }
4182 mc.precharge += count; 4441 mc.precharge += count;
4183 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4184 WARN_ON_ONCE(count > INT_MAX);
4185 __css_get(&mem->css, (int)count);
4186 return ret; 4442 return ret;
4187 } 4443 }
4188one_by_one: 4444one_by_one:
@@ -4400,11 +4656,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4400 4656
4401static void mem_cgroup_clear_mc(void) 4657static void mem_cgroup_clear_mc(void)
4402{ 4658{
4659 struct mem_cgroup *from = mc.from;
4660 struct mem_cgroup *to = mc.to;
4661
4403 /* we must uncharge all the leftover precharges from mc.to */ 4662 /* we must uncharge all the leftover precharges from mc.to */
4404 if (mc.precharge) { 4663 if (mc.precharge) {
4405 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4664 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4406 mc.precharge = 0; 4665 mc.precharge = 0;
4407 memcg_oom_recover(mc.to);
4408 } 4666 }
4409 /* 4667 /*
4410 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4668 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4413,11 +4671,9 @@ static void mem_cgroup_clear_mc(void)
4413 if (mc.moved_charge) { 4671 if (mc.moved_charge) {
4414 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4672 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4415 mc.moved_charge = 0; 4673 mc.moved_charge = 0;
4416 memcg_oom_recover(mc.from);
4417 } 4674 }
4418 /* we must fixup refcnts and charges */ 4675 /* we must fixup refcnts and charges */
4419 if (mc.moved_swap) { 4676 if (mc.moved_swap) {
4420 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4421 /* uncharge swap account from the old cgroup */ 4677 /* uncharge swap account from the old cgroup */
4422 if (!mem_cgroup_is_root(mc.from)) 4678 if (!mem_cgroup_is_root(mc.from))
4423 res_counter_uncharge(&mc.from->memsw, 4679 res_counter_uncharge(&mc.from->memsw,
@@ -4431,16 +4687,19 @@ static void mem_cgroup_clear_mc(void)
4431 */ 4687 */
4432 res_counter_uncharge(&mc.to->res, 4688 res_counter_uncharge(&mc.to->res,
4433 PAGE_SIZE * mc.moved_swap); 4689 PAGE_SIZE * mc.moved_swap);
4434 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4435 __css_put(&mc.to->css, mc.moved_swap);
4436 } 4690 }
4437 /* we've already done mem_cgroup_get(mc.to) */ 4691 /* we've already done mem_cgroup_get(mc.to) */
4438 4692
4439 mc.moved_swap = 0; 4693 mc.moved_swap = 0;
4440 } 4694 }
4695 spin_lock(&mc.lock);
4441 mc.from = NULL; 4696 mc.from = NULL;
4442 mc.to = NULL; 4697 mc.to = NULL;
4443 mc.moving_task = NULL; 4698 mc.moving_task = NULL;
4699 spin_unlock(&mc.lock);
4700 mem_cgroup_end_move(from);
4701 memcg_oom_recover(from);
4702 memcg_oom_recover(to);
4444 wake_up_all(&mc.waitq); 4703 wake_up_all(&mc.waitq);
4445} 4704}
4446 4705
@@ -4469,12 +4728,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4469 VM_BUG_ON(mc.moved_charge); 4728 VM_BUG_ON(mc.moved_charge);
4470 VM_BUG_ON(mc.moved_swap); 4729 VM_BUG_ON(mc.moved_swap);
4471 VM_BUG_ON(mc.moving_task); 4730 VM_BUG_ON(mc.moving_task);
4731 mem_cgroup_start_move(from);
4732 spin_lock(&mc.lock);
4472 mc.from = from; 4733 mc.from = from;
4473 mc.to = mem; 4734 mc.to = mem;
4474 mc.precharge = 0; 4735 mc.precharge = 0;
4475 mc.moved_charge = 0; 4736 mc.moved_charge = 0;
4476 mc.moved_swap = 0; 4737 mc.moved_swap = 0;
4477 mc.moving_task = current; 4738 mc.moving_task = current;
4739 spin_unlock(&mc.lock);
4478 4740
4479 ret = mem_cgroup_precharge_mc(mm); 4741 ret = mem_cgroup_precharge_mc(mm);
4480 if (ret) 4742 if (ret)
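Taken together, the move-charge hunks above follow a publish/unpublish pattern: the mc.from/mc.to fields are only written under mc.lock, and the source memcg is bracketed by mem_cgroup_start_move()/mem_cgroup_end_move(). A hedged sketch of that pattern, reusing the names from the hunks (the two helpers themselves are hypothetical):

/* Sketch only: publish the move context for fault/uncharge paths. */
static void publish_move_context(struct mem_cgroup *from, struct mem_cgroup *to)
{
        mem_cgroup_start_move(from);
        spin_lock(&mc.lock);
        mc.from = from;
        mc.to = to;
        spin_unlock(&mc.lock);
}

/* Sketch only: tear it down again and unblock waiters. */
static void unpublish_move_context(void)
{
        struct mem_cgroup *from = mc.from;
        struct mem_cgroup *to = mc.to;

        spin_lock(&mc.lock);
        mc.from = NULL;
        mc.to = NULL;
        spin_unlock(&mc.lock);
        mem_cgroup_end_move(from);
        memcg_oom_recover(from);
        memcg_oom_recover(to);
}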
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b46159..124324134ff 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
7 * Free Software Foundation. 7 * Free Software Foundation.
8 * 8 *
9 * High level machine check handler. Handles pages reported by the 9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure. 11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows stopping the
14 * use of not-yet-corrupted-by-suspicious pages without killing anything.
12 * 15 *
13 * Handles page cache pages in various states. The tricky part 16 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM 17 * here is that we can access any page asynchronously with respect to
15 * users, because memory failures could happen anytime and anywhere, 18 * other VM users, because memory failures could happen anytime and
16 * possibly violating some of their assumptions. This is why this code 19 * anywhere. This could violate some of their assumptions. This is why
17 * has to be extremely careful. Generally it tries to use normal locking 20 * this code has to be extremely careful. Generally it tries to use
18 * rules, as in get the standard locks, even if that means the 21 * normal locking rules, as in get the standard locks, even if that means
19 * error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
20 * 23 *
21 * The operation to map back from RMAP chains to processes has to walk 24 * There are several operations here with exponential complexity because
22 * the complete process list and has non linear complexity with the number 25 * of unsuitable VM data structures. For example the operation to map back
23 * mappings. In short it can be quite slow. But since memory corruptions 26 * from RMAP chains to processes has to walk the complete process list and
24 * are rare we hope to get away with this. 27 * has non-linear complexity in the number of mappings. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
25 */ 30 */
26 31
27/* 32/*
@@ -30,7 +35,6 @@
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel 36 * - pass bad pages to kdump next kernel
32 */ 37 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h> 38#include <linux/kernel.h>
35#include <linux/mm.h> 39#include <linux/mm.h>
36#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -45,6 +49,8 @@
45#include <linux/page-isolation.h> 49#include <linux/page-isolation.h>
46#include <linux/suspend.h> 50#include <linux/suspend.h>
47#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/swapops.h>
53#include <linux/hugetlb.h>
48#include "internal.h" 54#include "internal.h"
49 55
50int sysctl_memory_failure_early_kill __read_mostly = 0; 56int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -76,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
76 return 0; 82 return 0;
77 83
78 /* 84 /*
79 * page_mapping() does not accept slab page 85 * page_mapping() does not accept slab pages.
80 */ 86 */
81 if (PageSlab(p)) 87 if (PageSlab(p))
82 return -EINVAL; 88 return -EINVAL;
@@ -181,7 +187,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
181 * signal. 187 * signal.
182 */ 188 */
183static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, 189static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
184 unsigned long pfn) 190 unsigned long pfn, struct page *page)
185{ 191{
186 struct siginfo si; 192 struct siginfo si;
187 int ret; 193 int ret;
@@ -196,7 +202,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
196#ifdef __ARCH_SI_TRAPNO 202#ifdef __ARCH_SI_TRAPNO
197 si.si_trapno = trapno; 203 si.si_trapno = trapno;
198#endif 204#endif
199 si.si_addr_lsb = PAGE_SHIFT; 205 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
200 /* 206 /*
201 * Don't use force here, it's convenient if the signal 207 * Don't use force here, it's convenient if the signal
202 * can be temporarily blocked. 208 * can be temporarily blocked.
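A worked example of the new si_addr_lsb value: for an ordinary 4 KB page, compound_order(compound_head(page)) is 0, so si_addr_lsb = 0 + PAGE_SHIFT = 12 and the signal still reports 4 KB granularity; for an x86-64 2 MB hugepage the compound order is 9, so si_addr_lsb = 9 + 12 = 21 and the recipient learns that a 2 MB region is affected.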
@@ -233,7 +239,7 @@ void shake_page(struct page *p, int access)
233 int nr; 239 int nr;
234 do { 240 do {
235 nr = shrink_slab(1000, GFP_KERNEL, 1000); 241 nr = shrink_slab(1000, GFP_KERNEL, 1000);
236 if (page_count(p) == 0) 242 if (page_count(p) == 1)
237 break; 243 break;
238 } while (nr > 10); 244 } while (nr > 10);
239 } 245 }
@@ -266,7 +272,7 @@ struct to_kill {
266 struct list_head nd; 272 struct list_head nd;
267 struct task_struct *tsk; 273 struct task_struct *tsk;
268 unsigned long addr; 274 unsigned long addr;
269 unsigned addr_valid:1; 275 char addr_valid;
270}; 276};
271 277
272/* 278/*
@@ -307,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
307 * a SIGKILL because the error is not contained anymore. 313 * a SIGKILL because the error is not contained anymore.
308 */ 314 */
309 if (tk->addr == -EFAULT) { 315 if (tk->addr == -EFAULT) {
310 pr_debug("MCE: Unable to find user space address %lx in %s\n", 316 pr_info("MCE: Unable to find user space address %lx in %s\n",
311 page_to_pfn(p), tsk->comm); 317 page_to_pfn(p), tsk->comm);
312 tk->addr_valid = 0; 318 tk->addr_valid = 0;
313 } 319 }
@@ -325,7 +331,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
325 * wrong earlier. 331 * wrong earlier.
326 */ 332 */
327static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, 333static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
328 int fail, unsigned long pfn) 334 int fail, struct page *page, unsigned long pfn)
329{ 335{
330 struct to_kill *tk, *next; 336 struct to_kill *tk, *next;
331 337
@@ -350,7 +356,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
350 * process anyways. 356 * process anyways.
351 */ 357 */
352 else if (kill_proc_ao(tk->tsk, tk->addr, trapno, 358 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
353 pfn) < 0) 359 pfn, page) < 0)
354 printk(KERN_ERR 360 printk(KERN_ERR
355 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", 361 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
356 pfn, tk->tsk->comm, tk->tsk->pid); 362 pfn, tk->tsk->comm, tk->tsk->pid);
@@ -575,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
575 pfn, err); 581 pfn, err);
576 } else if (page_has_private(p) && 582 } else if (page_has_private(p) &&
577 !try_to_release_page(p, GFP_NOIO)) { 583 !try_to_release_page(p, GFP_NOIO)) {
578 pr_debug("MCE %#lx: failed to release buffers\n", pfn); 584 pr_info("MCE %#lx: failed to release buffers\n", pfn);
579 } else { 585 } else {
580 ret = RECOVERED; 586 ret = RECOVERED;
581 } 587 }
@@ -689,17 +695,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
689/* 695/*
690 * Huge pages. Needs work. 696 * Huge pages. Needs work.
691 * Issues: 697 * Issues:
692 * No rmap support so we cannot find the original mapper. In theory could walk 698 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
693 * all MMs and look for the mappings, but that would be non atomic and racy. 699 * To narrow down kill region to one page, we need to break up pmd.
694 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
695 * like just walking the current process and hoping it has it mapped (that
696 * should be usually true for the common "shared database cache" case)
697 * Should handle free huge pages and dequeue them too, but this needs to
698 * handle huge page accounting correctly.
699 */ 700 */
700static int me_huge_page(struct page *p, unsigned long pfn) 701static int me_huge_page(struct page *p, unsigned long pfn)
701{ 702{
702 return FAILED; 703 int res = 0;
704 struct page *hpage = compound_head(p);
705 /*
706 * We can safely recover from an error on a free or reserved (i.e.
707 * not in-use) hugepage by dequeuing it from the freelist.
708 * To check whether a hugepage is in-use or not, we can't use
709 * page->lru because it can be used in other hugepage operations,
710 * such as __unmap_hugepage_range() and gather_surplus_pages().
711 * So instead we use page_mapping() and PageAnon().
712 * We assume that this function is called with page lock held,
713 * so there is no race between isolation and mapping/unmapping.
714 */
715 if (!(page_mapping(hpage) || PageAnon(hpage))) {
716 res = dequeue_hwpoisoned_huge_page(hpage);
717 if (!res)
718 return RECOVERED;
719 }
720 return DELAYED;
703} 721}
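The in-use test above can be read as a small predicate (a sketch, not part of the patch): a hugepage whose head has neither a page_mapping() nor PageAnon() set is free or reserved, so it can simply be dequeued from the hugepage freelist; anything else is deferred as DELAYED.

/* Sketch only: "is this hugepage in use?", with the page lock held. */
static bool hugepage_in_use(struct page *hpage)
{
        return page_mapping(hpage) || PageAnon(hpage);
}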
704 722
705/* 723/*
@@ -822,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
822 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 840 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
823} 841}
824 842
825#define N_UNMAP_TRIES 5
826
827/* 843/*
828 * Do all that is necessary to remove user space mappings. Unmap 844 * Do all that is necessary to remove user space mappings. Unmap
829 * the pages and send SIGBUS to the processes if the data was dirty. 845 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -835,8 +851,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
835 struct address_space *mapping; 851 struct address_space *mapping;
836 LIST_HEAD(tokill); 852 LIST_HEAD(tokill);
837 int ret; 853 int ret;
838 int i;
839 int kill = 1; 854 int kill = 1;
855 struct page *hpage = compound_head(p);
840 856
841 if (PageReserved(p) || PageSlab(p)) 857 if (PageReserved(p) || PageSlab(p))
842 return SWAP_SUCCESS; 858 return SWAP_SUCCESS;
@@ -845,10 +861,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
845 * This check implies we don't kill processes if their pages 861 * This check implies we don't kill processes if their pages
846 * are in the swap cache early. Those are always late kills. 862 * are in the swap cache early. Those are always late kills.
847 */ 863 */
848 if (!page_mapped(p)) 864 if (!page_mapped(hpage))
849 return SWAP_SUCCESS; 865 return SWAP_SUCCESS;
850 866
851 if (PageCompound(p) || PageKsm(p)) 867 if (PageKsm(p))
852 return SWAP_FAIL; 868 return SWAP_FAIL;
853 869
854 if (PageSwapCache(p)) { 870 if (PageSwapCache(p)) {
@@ -863,10 +879,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
863 * XXX: the dirty test could be racy: set_page_dirty() may not always 879 * XXX: the dirty test could be racy: set_page_dirty() may not always
864 * be called inside page lock (it's recommended but not enforced). 880 * be called inside page lock (it's recommended but not enforced).
865 */ 881 */
866 mapping = page_mapping(p); 882 mapping = page_mapping(hpage);
867 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 883 if (!PageDirty(hpage) && mapping &&
868 if (page_mkclean(p)) { 884 mapping_cap_writeback_dirty(mapping)) {
869 SetPageDirty(p); 885 if (page_mkclean(hpage)) {
886 SetPageDirty(hpage);
870 } else { 887 } else {
871 kill = 0; 888 kill = 0;
872 ttu |= TTU_IGNORE_HWPOISON; 889 ttu |= TTU_IGNORE_HWPOISON;
@@ -885,22 +902,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
885 * there's nothing that can be done. 902 * there's nothing that can be done.
886 */ 903 */
887 if (kill) 904 if (kill)
888 collect_procs(p, &tokill); 905 collect_procs(hpage, &tokill);
889
890 /*
891 * try_to_unmap can fail temporarily due to races.
892 * Try a few times (RED-PEN better strategy?)
893 */
894 for (i = 0; i < N_UNMAP_TRIES; i++) {
895 ret = try_to_unmap(p, ttu);
896 if (ret == SWAP_SUCCESS)
897 break;
898 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
899 }
900 906
907 ret = try_to_unmap(hpage, ttu);
901 if (ret != SWAP_SUCCESS) 908 if (ret != SWAP_SUCCESS)
902 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 909 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
903 pfn, page_mapcount(p)); 910 pfn, page_mapcount(hpage));
904 911
905 /* 912 /*
906 * Now that the dirty bit has been propagated to the 913 * Now that the dirty bit has been propagated to the
@@ -911,17 +918,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
911 * use a more forceful uncatchable kill to prevent 918
912 * any accesses to the poisoned memory. 919 * any accesses to the poisoned memory.
913 */ 920 */
914 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 921 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
915 ret != SWAP_SUCCESS, pfn); 922 ret != SWAP_SUCCESS, p, pfn);
916 923
917 return ret; 924 return ret;
918} 925}
919 926
927static void set_page_hwpoison_huge_page(struct page *hpage)
928{
929 int i;
930 int nr_pages = 1 << compound_order(hpage);
931 for (i = 0; i < nr_pages; i++)
932 SetPageHWPoison(hpage + i);
933}
934
935static void clear_page_hwpoison_huge_page(struct page *hpage)
936{
937 int i;
938 int nr_pages = 1 << compound_order(hpage);
939 for (i = 0; i < nr_pages; i++)
940 ClearPageHWPoison(hpage + i);
941}
942
920int __memory_failure(unsigned long pfn, int trapno, int flags) 943int __memory_failure(unsigned long pfn, int trapno, int flags)
921{ 944{
922 struct page_state *ps; 945 struct page_state *ps;
923 struct page *p; 946 struct page *p;
947 struct page *hpage;
924 int res; 948 int res;
949 unsigned int nr_pages;
925 950
926 if (!sysctl_memory_failure_recovery) 951 if (!sysctl_memory_failure_recovery)
927 panic("Memory failure from trap %d on page %lx", trapno, pfn); 952 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -934,18 +959,23 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
934 } 959 }
935 960
936 p = pfn_to_page(pfn); 961 p = pfn_to_page(pfn);
962 hpage = compound_head(p);
937 if (TestSetPageHWPoison(p)) { 963 if (TestSetPageHWPoison(p)) {
938 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 964 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
939 return 0; 965 return 0;
940 } 966 }
941 967
942 atomic_long_add(1, &mce_bad_pages); 968 nr_pages = 1 << compound_order(hpage);
969 atomic_long_add(nr_pages, &mce_bad_pages);
943 970
944 /* 971 /*
945 * We need/can do nothing about count=0 pages. 972 * We need/can do nothing about count=0 pages.
946 * 1) it's a free page, and therefore in safe hand: 973 * 1) it's a free page, and therefore in safe hand:
947 * prep_new_page() will be the gate keeper. 974 * prep_new_page() will be the gate keeper.
948 * 2) it's part of a non-compound high order page. 975 * 2) it's a free hugepage, which is also safe:
976 * an affected hugepage will be dequeued from hugepage freelist,
977 * so there's no concern about reusing it ever after.
978 * 3) it's part of a non-compound high order page.
949 * Implies some kernel user: cannot stop them from 979 * Implies some kernel user: cannot stop them from
950 * R/W the page; let's pray that the page has been 980 * R/W the page; let's pray that the page has been
951 * used and will be freed some time later. 981 * used and will be freed some time later.
@@ -953,10 +983,28 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
953 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 983 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
954 */ 984 */
955 if (!(flags & MF_COUNT_INCREASED) && 985 if (!(flags & MF_COUNT_INCREASED) &&
956 !get_page_unless_zero(compound_head(p))) { 986 !get_page_unless_zero(hpage)) {
957 if (is_free_buddy_page(p)) { 987 if (is_free_buddy_page(p)) {
958 action_result(pfn, "free buddy", DELAYED); 988 action_result(pfn, "free buddy", DELAYED);
959 return 0; 989 return 0;
990 } else if (PageHuge(hpage)) {
991 /*
992 * Check "just unpoisoned", "filter hit", and
993 * "race with other subpage."
994 */
995 lock_page_nosync(hpage);
996 if (!PageHWPoison(hpage)
997 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
998 || (p != hpage && TestSetPageHWPoison(hpage))) {
999 atomic_long_sub(nr_pages, &mce_bad_pages);
1000 return 0;
1001 }
1002 set_page_hwpoison_huge_page(hpage);
1003 res = dequeue_hwpoisoned_huge_page(hpage);
1004 action_result(pfn, "free huge",
1005 res ? IGNORED : DELAYED);
1006 unlock_page(hpage);
1007 return res;
960 } else { 1008 } else {
961 action_result(pfn, "high order kernel", IGNORED); 1009 action_result(pfn, "high order kernel", IGNORED);
962 return -EBUSY; 1010 return -EBUSY;
@@ -971,9 +1019,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
971 * The check (unnecessarily) ignores LRU pages being isolated and 1019 * The check (unnecessarily) ignores LRU pages being isolated and
972 * walked by the page reclaim code, however that's not a big loss. 1020 * walked by the page reclaim code, however that's not a big loss.
973 */ 1021 */
974 if (!PageLRU(p)) 1022 if (!PageLRU(p) && !PageHuge(p))
975 shake_page(p, 0); 1023 shake_page(p, 0);
976 if (!PageLRU(p)) { 1024 if (!PageLRU(p) && !PageHuge(p)) {
977 /* 1025 /*
978 * shake_page could have turned it free. 1026 * shake_page could have turned it free.
979 */ 1027 */
@@ -991,7 +1039,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
991 * It's very difficult to mess with pages currently under IO 1039 * It's very difficult to mess with pages currently under IO
992 * and in many cases impossible, so we just avoid it here. 1040 * and in many cases impossible, so we just avoid it here.
993 */ 1041 */
994 lock_page_nosync(p); 1042 lock_page_nosync(hpage);
995 1043
996 /* 1044 /*
997 * unpoison always clear PG_hwpoison inside page lock 1045 * unpoison always clear PG_hwpoison inside page lock
@@ -1003,12 +1051,32 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1003 } 1051 }
1004 if (hwpoison_filter(p)) { 1052 if (hwpoison_filter(p)) {
1005 if (TestClearPageHWPoison(p)) 1053 if (TestClearPageHWPoison(p))
1006 atomic_long_dec(&mce_bad_pages); 1054 atomic_long_sub(nr_pages, &mce_bad_pages);
1007 unlock_page(p); 1055 unlock_page(hpage);
1008 put_page(p); 1056 put_page(hpage);
1009 return 0; 1057 return 0;
1010 } 1058 }
1011 1059
1060 /*
1061 * For error on the tail page, we should set PG_hwpoison
1062 * on the head page to show that the hugepage is hwpoisoned
1063 */
1064 if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1065 action_result(pfn, "hugepage already hardware poisoned",
1066 IGNORED);
1067 unlock_page(hpage);
1068 put_page(hpage);
1069 return 0;
1070 }
1071 /*
1072 * Set PG_hwpoison on all pages in an error hugepage,
1073 * because containment is done in hugepage unit for now.
1074 * Since we have done TestSetPageHWPoison() for the head page with
1075 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1076 */
1077 if (PageHuge(p))
1078 set_page_hwpoison_huge_page(hpage);
1079
1012 wait_on_page_writeback(p); 1080 wait_on_page_writeback(p);
1013 1081
1014 /* 1082 /*
@@ -1038,7 +1106,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1038 } 1106 }
1039 } 1107 }
1040out: 1108out:
1041 unlock_page(p); 1109 unlock_page(hpage);
1042 return res; 1110 return res;
1043} 1111}
1044EXPORT_SYMBOL_GPL(__memory_failure); 1112EXPORT_SYMBOL_GPL(__memory_failure);
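For orientation, a hedged sketch of a caller (the helper name is made up): a machine-check handler that has decoded the corrupted physical address hands the pfn to the recovery code. Passing flags of 0 lets __memory_failure() take its own page reference, as the MF_COUNT_INCREASED check above shows.

/* Sketch only: report a corrupted pfn from an error handler. */
static void report_corrupted_pfn(unsigned long pfn, int trapno)
{
        if (!pfn_valid(pfn))
                return;
        __memory_failure(pfn, trapno, 0);
}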
@@ -1082,6 +1150,7 @@ int unpoison_memory(unsigned long pfn)
1082 struct page *page; 1150 struct page *page;
1083 struct page *p; 1151 struct page *p;
1084 int freeit = 0; 1152 int freeit = 0;
1153 unsigned int nr_pages;
1085 1154
1086 if (!pfn_valid(pfn)) 1155 if (!pfn_valid(pfn))
1087 return -ENXIO; 1156 return -ENXIO;
@@ -1090,14 +1159,26 @@ int unpoison_memory(unsigned long pfn)
1090 page = compound_head(p); 1159 page = compound_head(p);
1091 1160
1092 if (!PageHWPoison(p)) { 1161 if (!PageHWPoison(p)) {
1093 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1162 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1094 return 0; 1163 return 0;
1095 } 1164 }
1096 1165
1166 nr_pages = 1 << compound_order(page);
1167
1097 if (!get_page_unless_zero(page)) { 1168 if (!get_page_unless_zero(page)) {
1169 /*
1170 * Since a HWPoisoned hugepage should have a non-zero refcount,
1171 * a race between memory failure and unpoison seems to have happened.
1172 * In such a case unpoison fails and memory failure runs
1173 * to the end.
1174 */
1175 if (PageHuge(page)) {
1176 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1177 return 0;
1178 }
1098 if (TestClearPageHWPoison(p)) 1179 if (TestClearPageHWPoison(p))
1099 atomic_long_dec(&mce_bad_pages); 1180 atomic_long_sub(nr_pages, &mce_bad_pages);
1100 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1181 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1101 return 0; 1182 return 0;
1102 } 1183 }
1103 1184
@@ -1108,10 +1189,12 @@ int unpoison_memory(unsigned long pfn)
1108 * the PG_hwpoison page will be caught and isolated on the entrance to 1189 * the PG_hwpoison page will be caught and isolated on the entrance to
1109 * the free buddy page pool. 1190 * the free buddy page pool.
1110 */ 1191 */
1111 if (TestClearPageHWPoison(p)) { 1192 if (TestClearPageHWPoison(page)) {
1112 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1193 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1113 atomic_long_dec(&mce_bad_pages); 1194 atomic_long_sub(nr_pages, &mce_bad_pages);
1114 freeit = 1; 1195 freeit = 1;
1196 if (PageHuge(page))
1197 clear_page_hwpoison_huge_page(page);
1115 } 1198 }
1116 unlock_page(page); 1199 unlock_page(page);
1117 1200
@@ -1126,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
1126static struct page *new_page(struct page *p, unsigned long private, int **x) 1209static struct page *new_page(struct page *p, unsigned long private, int **x)
1127{ 1210{
1128 int nid = page_to_nid(p); 1211 int nid = page_to_nid(p);
1129 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1212 if (PageHuge(p))
1213 return alloc_huge_page_node(page_hstate(compound_head(p)),
1214 nid);
1215 else
1216 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1130} 1217}
1131 1218
1132/* 1219/*
@@ -1154,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1154 * was free. 1241 * was free.
1155 */ 1242 */
1156 set_migratetype_isolate(p); 1243 set_migratetype_isolate(p);
1244 /*
1245 * When the target page is a free hugepage, just remove it
1246 * from free hugepage list.
1247 */
1157 if (!get_page_unless_zero(compound_head(p))) { 1248 if (!get_page_unless_zero(compound_head(p))) {
1158 if (is_free_buddy_page(p)) { 1249 if (PageHuge(p)) {
1159 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1250 pr_info("get_any_page: %#lx free huge page\n", pfn);
1251 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1252 } else if (is_free_buddy_page(p)) {
1253 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1160 /* Set hwpoison bit while page is still isolated */ 1254 /* Set hwpoison bit while page is still isolated */
1161 SetPageHWPoison(p); 1255 SetPageHWPoison(p);
1162 ret = 0; 1256 ret = 0;
1163 } else { 1257 } else {
1164 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1258 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1165 pfn, p->flags); 1259 pfn, p->flags);
1166 ret = -EIO; 1260 ret = -EIO;
1167 } 1261 }
@@ -1174,6 +1268,46 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1174 return ret; 1268 return ret;
1175} 1269}
1176 1270
1271static int soft_offline_huge_page(struct page *page, int flags)
1272{
1273 int ret;
1274 unsigned long pfn = page_to_pfn(page);
1275 struct page *hpage = compound_head(page);
1276 LIST_HEAD(pagelist);
1277
1278 ret = get_any_page(page, pfn, flags);
1279 if (ret < 0)
1280 return ret;
1281 if (ret == 0)
1282 goto done;
1283
1284 if (PageHWPoison(hpage)) {
1285 put_page(hpage);
1286 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1287 return -EBUSY;
1288 }
1289
1290 /* Keep page count to indicate a given hugepage is isolated. */
1291
1292 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1294 if (ret) {
1295 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags);
1298 if (ret > 0)
1299 ret = -EIO;
1300 return ret;
1301 }
1302done:
1303 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */
1308 return ret;
1309}
1310
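A hedged usage sketch (the helper is hypothetical): a predictive-failure agent can push any suspect pfn through soft offline; the PageHuge() dispatch added below routes hugepages to soft_offline_huge_page() and everything else through the existing migrate-and-poison path.

/* Sketch only: soft-offline a suspicious page by pfn. */
static int offline_suspect_pfn(unsigned long pfn)
{
        if (!pfn_valid(pfn))
                return -ENXIO;
        /* flags == 0: soft_offline_page() takes its own reference */
        return soft_offline_page(pfn_to_page(pfn), 0);
}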
1177/** 1311/**
1178 * soft_offline_page - Soft offline a page. 1312 * soft_offline_page - Soft offline a page.
1179 * @page: page to offline 1313 * @page: page to offline
@@ -1201,6 +1335,9 @@ int soft_offline_page(struct page *page, int flags)
1201 int ret; 1335 int ret;
1202 unsigned long pfn = page_to_pfn(page); 1336 unsigned long pfn = page_to_pfn(page);
1203 1337
1338 if (PageHuge(page))
1339 return soft_offline_huge_page(page, flags);
1340
1204 ret = get_any_page(page, pfn, flags); 1341 ret = get_any_page(page, pfn, flags);
1205 if (ret < 0) 1342 if (ret < 0)
1206 return ret; 1343 return ret;
@@ -1227,7 +1364,7 @@ int soft_offline_page(struct page *page, int flags)
1227 goto done; 1364 goto done;
1228 } 1365 }
1229 if (!PageLRU(page)) { 1366 if (!PageLRU(page)) {
1230 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1367 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1231 pfn, page->flags); 1368 pfn, page->flags);
1232 return -EIO; 1369 return -EIO;
1233 } 1370 }
@@ -1241,7 +1378,7 @@ int soft_offline_page(struct page *page, int flags)
1241 if (PageHWPoison(page)) { 1378 if (PageHWPoison(page)) {
1242 unlock_page(page); 1379 unlock_page(page);
1243 put_page(page); 1380 put_page(page);
1244 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1381 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1245 return -EBUSY; 1382 return -EBUSY;
1246 } 1383 }
1247 1384
@@ -1262,7 +1399,7 @@ int soft_offline_page(struct page *page, int flags)
1262 put_page(page); 1399 put_page(page);
1263 if (ret == 1) { 1400 if (ret == 1) {
1264 ret = 0; 1401 ret = 0;
1265 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1402 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1266 goto done; 1403 goto done;
1267 } 1404 }
1268 1405
@@ -1278,13 +1415,13 @@ int soft_offline_page(struct page *page, int flags)
1278 list_add(&page->lru, &pagelist); 1415 list_add(&page->lru, &pagelist);
1279 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1280 if (ret) { 1417 if (ret) {
1281 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1282 pfn, ret, page->flags); 1419 pfn, ret, page->flags);
1283 if (ret > 0) 1420 if (ret > 0)
1284 ret = -EIO; 1421 ret = -EIO;
1285 } 1422 }
1286 } else { 1423 } else {
1287 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1424 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1288 pfn, ret, page_count(page), page->flags); 1425 pfn, ret, page_count(page), page->flags);
1289 } 1426 }
1290 if (ret) 1427 if (ret)
@@ -1296,3 +1433,35 @@ done:
1296 /* keep elevated page count for bad page */ 1433 /* keep elevated page count for bad page */
1297 return ret; 1434 return ret;
1298} 1435}
1436
1437/*
1438 * The caller must hold current->mm->mmap_sem in read mode.
1439 */
1440int is_hwpoison_address(unsigned long addr)
1441{
1442 pgd_t *pgdp;
1443 pud_t pud, *pudp;
1444 pmd_t pmd, *pmdp;
1445 pte_t pte, *ptep;
1446 swp_entry_t entry;
1447
1448 pgdp = pgd_offset(current->mm, addr);
1449 if (!pgd_present(*pgdp))
1450 return 0;
1451 pudp = pud_offset(pgdp, addr);
1452 pud = *pudp;
1453 if (!pud_present(pud) || pud_large(pud))
1454 return 0;
1455 pmdp = pmd_offset(pudp, addr);
1456 pmd = *pmdp;
1457 if (!pmd_present(pmd) || pmd_large(pmd))
1458 return 0;
1459 ptep = pte_offset_map(pmdp, addr);
1460 pte = *ptep;
1461 pte_unmap(ptep);
1462 if (!is_swap_pte(pte))
1463 return 0;
1464 entry = pte_to_swp_entry(pte);
1465 return is_hwpoison_entry(entry);
1466}
1467EXPORT_SYMBOL_GPL(is_hwpoison_address);
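A hedged usage sketch (the wrapper is hypothetical): a consumer of this export, such as a hypervisor about to map user memory, can probe a virtual address for a hwpoison swap entry; since the function requires mmap_sem held for read, the wrapper takes it around the call.

/* Sketch only: is this user virtual address backed by a poisoned page? */
static bool user_addr_is_poisoned(unsigned long addr)
{
        bool poisoned;

        down_read(&current->mm->mmap_sem);
        poisoned = is_hwpoison_address(addr);
        up_read(&current->mm->mmap_sem);

        return poisoned;
}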
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d363..02e48aa0ed1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
307{ 307{
308 pgd_t *pgd; 308 pgd_t *pgd;
309 unsigned long next; 309 unsigned long next;
310 unsigned long start;
311 310
312 /* 311 /*
313 * The next few lines have given us lots of grief... 312 * The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
351 if (addr > end - 1) 350 if (addr > end - 1)
352 return; 351 return;
353 352
354 start = addr;
355 pgd = pgd_offset(tlb->mm, addr); 353 pgd = pgd_offset(tlb->mm, addr);
356 do { 354 do {
357 next = pgd_addr_end(addr, end); 355 next = pgd_addr_end(addr, end);
@@ -738,7 +736,7 @@ again:
738 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
739 if (!dst_pte) 737 if (!dst_pte)
740 return -ENOMEM; 738 return -ENOMEM;
741 src_pte = pte_offset_map_nested(src_pmd, addr); 739 src_pte = pte_offset_map(src_pmd, addr);
742 src_ptl = pte_lockptr(src_mm, src_pmd); 740 src_ptl = pte_lockptr(src_mm, src_pmd);
743 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
744 orig_src_pte = src_pte; 742 orig_src_pte = src_pte;
@@ -769,7 +767,7 @@ again:
769 767
770 arch_leave_lazy_mmu_mode(); 768 arch_leave_lazy_mmu_mode();
771 spin_unlock(src_ptl); 769 spin_unlock(src_ptl);
772 pte_unmap_nested(orig_src_pte); 770 pte_unmap(orig_src_pte);
773 add_mm_rss_vec(dst_mm, rss); 771 add_mm_rss_vec(dst_mm, rss);
774 pte_unmap_unlock(orig_dst_pte, dst_ptl); 772 pte_unmap_unlock(orig_dst_pte, dst_ptl);
775 cond_resched(); 773 cond_resched();
@@ -1452,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1452 if (ret & VM_FAULT_OOM) 1450 if (ret & VM_FAULT_OOM)
1453 return i ? i : -ENOMEM; 1451 return i ? i : -ENOMEM;
1454 if (ret & 1452 if (ret &
1455 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1453 (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
1454 VM_FAULT_SIGBUS))
1456 return i ? i : -EFAULT; 1455 return i ? i : -EFAULT;
1457 BUG(); 1456 BUG();
1458 } 1457 }
@@ -1592,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr)
1592} 1591}
1593#endif /* CONFIG_ELF_CORE */ 1592#endif /* CONFIG_ELF_CORE */
1594 1593
1595pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1594pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1596 spinlock_t **ptl) 1595 spinlock_t **ptl)
1597{ 1596{
1598 pgd_t * pgd = pgd_offset(mm, addr); 1597 pgd_t * pgd = pgd_offset(mm, addr);
@@ -2008,11 +2007,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2008{ 2007{
2009 pgd_t *pgd; 2008 pgd_t *pgd;
2010 unsigned long next; 2009 unsigned long next;
2011 unsigned long start = addr, end = addr + size; 2010 unsigned long end = addr + size;
2012 int err; 2011 int err;
2013 2012
2014 BUG_ON(addr >= end); 2013 BUG_ON(addr >= end);
2015 mmu_notifier_invalidate_range_start(mm, start, end);
2016 pgd = pgd_offset(mm, addr); 2014 pgd = pgd_offset(mm, addr);
2017 do { 2015 do {
2018 next = pgd_addr_end(addr, end); 2016 next = pgd_addr_end(addr, end);
@@ -2020,7 +2018,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2020 if (err) 2018 if (err)
2021 break; 2019 break;
2022 } while (pgd++, addr = next, addr != end); 2020 } while (pgd++, addr = next, addr != end);
2023 mmu_notifier_invalidate_range_end(mm, start, end); 2021
2024 return err; 2022 return err;
2025} 2023}
2026EXPORT_SYMBOL_GPL(apply_to_page_range); 2024EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2082,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2082 * zeroes. 2080 * zeroes.
2083 */ 2081 */
2084 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2082 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2085 memset(kaddr, 0, PAGE_SIZE); 2083 clear_page(kaddr);
2086 kunmap_atomic(kaddr, KM_USER0); 2084 kunmap_atomic(kaddr, KM_USER0);
2087 flush_dcache_page(dst); 2085 flush_dcache_page(dst);
2088 } else 2086 } else
@@ -2110,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2110static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2108static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2111 unsigned long address, pte_t *page_table, pmd_t *pmd, 2109 unsigned long address, pte_t *page_table, pmd_t *pmd,
2112 spinlock_t *ptl, pte_t orig_pte) 2110 spinlock_t *ptl, pte_t orig_pte)
2111 __releases(ptl)
2113{ 2112{
2114 struct page *old_page, *new_page; 2113 struct page *old_page, *new_page;
2115 pte_t entry; 2114 pte_t entry;
@@ -2626,10 +2625,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2626 unsigned int flags, pte_t orig_pte) 2625 unsigned int flags, pte_t orig_pte)
2627{ 2626{
2628 spinlock_t *ptl; 2627 spinlock_t *ptl;
2629 struct page *page; 2628 struct page *page, *swapcache = NULL;
2630 swp_entry_t entry; 2629 swp_entry_t entry;
2631 pte_t pte; 2630 pte_t pte;
2631 int locked;
2632 struct mem_cgroup *ptr = NULL; 2632 struct mem_cgroup *ptr = NULL;
2633 int exclusive = 0;
2633 int ret = 0; 2634 int ret = 0;
2634 2635
2635 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2636 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2678,13 +2679,32 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2678 goto out_release; 2679 goto out_release;
2679 } 2680 }
2680 2681
2681 lock_page(page); 2682 locked = lock_page_or_retry(page, mm, flags);
2682 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2683 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2684 if (!locked) {
2685 ret |= VM_FAULT_RETRY;
2686 goto out_release;
2687 }
2683 2688
2684 page = ksm_might_need_to_copy(page, vma, address); 2689 /*
2685 if (!page) { 2690 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
2686 ret = VM_FAULT_OOM; 2691 * release the swapcache from under us. The page pin, and pte_same
2687 goto out; 2692 * test below, are not enough to exclude that. Even if it is still
2693 * swapcache, we need to check that the page's swap has not changed.
2694 */
2695 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2696 goto out_page;
2697
2698 if (ksm_might_need_to_copy(page, vma, address)) {
2699 swapcache = page;
2700 page = ksm_does_need_to_copy(page, vma, address);
2701
2702 if (unlikely(!page)) {
2703 ret = VM_FAULT_OOM;
2704 page = swapcache;
2705 swapcache = NULL;
2706 goto out_page;
2707 }
2688 } 2708 }
2689 2709
2690 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2710 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2724,10 +2744,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2724 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2744 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2725 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2745 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2726 flags &= ~FAULT_FLAG_WRITE; 2746 flags &= ~FAULT_FLAG_WRITE;
2747 ret |= VM_FAULT_WRITE;
2748 exclusive = 1;
2727 } 2749 }
2728 flush_icache_page(vma, page); 2750 flush_icache_page(vma, page);
2729 set_pte_at(mm, address, page_table, pte); 2751 set_pte_at(mm, address, page_table, pte);
2730 page_add_anon_rmap(page, vma, address); 2752 do_page_add_anon_rmap(page, vma, address, exclusive);
2731 /* It's better to call commit-charge after rmap is established */ 2753 /* It's better to call commit-charge after rmap is established */
2732 mem_cgroup_commit_charge_swapin(page, ptr); 2754 mem_cgroup_commit_charge_swapin(page, ptr);
2733 2755
@@ -2735,6 +2757,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2735 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2757 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2736 try_to_free_swap(page); 2758 try_to_free_swap(page);
2737 unlock_page(page); 2759 unlock_page(page);
2760 if (swapcache) {
2761 /*
2762 * Hold the lock to avoid the swap entry to be reused
2763 * until we take the PT lock for the pte_same() check
2764 * (to avoid false positives from pte_same). For
2765 * further safety release the lock after the swap_free
2766 * so that the swap count won't change under a
2767 * parallel locked swapcache.
2768 */
2769 unlock_page(swapcache);
2770 page_cache_release(swapcache);
2771 }
2738 2772
2739 if (flags & FAULT_FLAG_WRITE) { 2773 if (flags & FAULT_FLAG_WRITE) {
2740 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2774 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2756,10 +2790,48 @@ out_page:
2756 unlock_page(page); 2790 unlock_page(page);
2757out_release: 2791out_release:
2758 page_cache_release(page); 2792 page_cache_release(page);
2793 if (swapcache) {
2794 unlock_page(swapcache);
2795 page_cache_release(swapcache);
2796 }
2759 return ret; 2797 return ret;
2760} 2798}
2761 2799
2762/* 2800/*
2801 * This is like a special single-page "expand_{down|up}wards()",
2802 * except we must first make sure that 'address{-|+}PAGE_SIZE'
2803 * doesn't hit another vma.
2804 */
2805static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2806{
2807 address &= PAGE_MASK;
2808 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2809 struct vm_area_struct *prev = vma->vm_prev;
2810
2811 /*
2812 * Is there a mapping abutting this one below?
2813 *
2814 * That's only ok if it's the same stack mapping
2815 * that has gotten split..
2816 */
2817 if (prev && prev->vm_end == address)
2818 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2819
2820 expand_stack(vma, address - PAGE_SIZE);
2821 }
2822 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2823 struct vm_area_struct *next = vma->vm_next;
2824
2825 /* As VM_GROWSDOWN but s/below/above/ */
2826 if (next && next->vm_start == address + PAGE_SIZE)
2827 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2828
2829 expand_upwards(vma, address + PAGE_SIZE);
2830 }
2831 return 0;
2832}
2833
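A concrete illustration of the guard-page check (addresses are invented): with 4 KB pages, take a VM_GROWSDOWN stack vma starting at 0x7f0000101000 and a fault at 0x7f0000101008. After address &= PAGE_MASK the fault sits exactly at vma->vm_start, so the code inspects vma->vm_prev: if an unrelated mapping ends right at 0x7f0000101000 there is no room for a guard page and the fault fails with -ENOMEM; if the abutting mapping is itself VM_GROWSDOWN (the same stack, split earlier), the fault is allowed as-is; and if there is no abutting mapping at all, expand_stack() first grows the vma one page below the fault so a guard page remains.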
2834/*
2763 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2835 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2764 * but allow concurrent faults), and pte mapped but not yet locked. 2836 * but allow concurrent faults), and pte mapped but not yet locked.
2765 * We return with mmap_sem still held, but pte unmapped and unlocked. 2837 * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2772,19 +2844,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2772 spinlock_t *ptl; 2844 spinlock_t *ptl;
2773 pte_t entry; 2845 pte_t entry;
2774 2846
2847 pte_unmap(page_table);
2848
2849 /* Check if we need to add a guard page to the stack */
2850 if (check_stack_guard_page(vma, address) < 0)
2851 return VM_FAULT_SIGBUS;
2852
2853 /* Use the zero-page for reads */
2775 if (!(flags & FAULT_FLAG_WRITE)) { 2854 if (!(flags & FAULT_FLAG_WRITE)) {
2776 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2855 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2777 vma->vm_page_prot)); 2856 vma->vm_page_prot));
2778 ptl = pte_lockptr(mm, pmd); 2857 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2779 spin_lock(ptl);
2780 if (!pte_none(*page_table)) 2858 if (!pte_none(*page_table))
2781 goto unlock; 2859 goto unlock;
2782 goto setpte; 2860 goto setpte;
2783 } 2861 }
2784 2862
2785 /* Allocate our own private page. */ 2863 /* Allocate our own private page. */
2786 pte_unmap(page_table);
2787
2788 if (unlikely(anon_vma_prepare(vma))) 2864 if (unlikely(anon_vma_prepare(vma)))
2789 goto oom; 2865 goto oom;
2790 page = alloc_zeroed_user_highpage_movable(vma, address); 2866 page = alloc_zeroed_user_highpage_movable(vma, address);
@@ -2857,7 +2933,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2857 vmf.page = NULL; 2933 vmf.page = NULL;
2858 2934
2859 ret = vma->vm_ops->fault(vma, &vmf); 2935 ret = vma->vm_ops->fault(vma, &vmf);
2860 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2936 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
2937 VM_FAULT_RETRY)))
2861 return ret; 2938 return ret;
2862 2939
2863 if (unlikely(PageHWPoison(vmf.page))) { 2940 if (unlikely(PageHWPoison(vmf.page))) {
@@ -3116,7 +3193,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
3116 * with threads. 3193 * with threads.
3117 */ 3194 */
3118 if (flags & FAULT_FLAG_WRITE) 3195 if (flags & FAULT_FLAG_WRITE)
3119 flush_tlb_page(vma, address); 3196 flush_tlb_fix_spurious_fault(vma, address);
3120 } 3197 }
3121unlock: 3198unlock:
3122 pte_unmap_unlock(pte, ptl); 3199 pte_unmap_unlock(pte, ptl);
@@ -3274,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr)
3274 3351
3275#endif /* __HAVE_ARCH_GATE_AREA */ 3352#endif /* __HAVE_ARCH_GATE_AREA */
3276 3353
3277static int follow_pte(struct mm_struct *mm, unsigned long address, 3354static int __follow_pte(struct mm_struct *mm, unsigned long address,
3278 pte_t **ptepp, spinlock_t **ptlp) 3355 pte_t **ptepp, spinlock_t **ptlp)
3279{ 3356{
3280 pgd_t *pgd; 3357 pgd_t *pgd;
@@ -3311,6 +3388,17 @@ out:
3311 return -EINVAL; 3388 return -EINVAL;
3312} 3389}
3313 3390
3391static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3392 pte_t **ptepp, spinlock_t **ptlp)
3393{
3394 int res;
3395
3396 /* (void) is needed to make gcc happy */
3397 (void) __cond_lock(*ptlp,
3398 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3399 return res;
3400}
3401
3314/** 3402/**
3315 * follow_pfn - look up PFN at a user virtual address 3403 * follow_pfn - look up PFN at a user virtual address
3316 * @vma: memory mapping 3404 * @vma: memory mapping
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a4cfcdc0045..9260314a221 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -584,45 +584,32 @@ static inline int pageblock_free(struct page *page)
584/* Return the start of the next active pageblock after a given page */ 584/* Return the start of the next active pageblock after a given page */
585static struct page *next_active_pageblock(struct page *page) 585static struct page *next_active_pageblock(struct page *page)
586{ 586{
587 int pageblocks_stride;
588
589 /* Ensure the starting page is pageblock-aligned */ 587 /* Ensure the starting page is pageblock-aligned */
590 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 588 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
591 589
592 /* Move forward by at least 1 * pageblock_nr_pages */
593 pageblocks_stride = 1;
594
595 /* If the entire pageblock is free, move to the end of free page */ 590 /* If the entire pageblock is free, move to the end of free page */
596 if (pageblock_free(page)) 591 if (pageblock_free(page)) {
597 pageblocks_stride += page_order(page) - pageblock_order; 592 int order;
593 /* Be careful: we don't have locks, so page_order can change. */
594 order = page_order(page);
595 if ((order < MAX_ORDER) && (order >= pageblock_order))
596 return page + (1 << order);
597 }
598 598
599 return page + (pageblocks_stride * pageblock_nr_pages); 599 return page + pageblock_nr_pages;
600} 600}
601 601
602/* Checks if this range of memory is likely to be hot-removable. */ 602/* Checks if this range of memory is likely to be hot-removable. */
603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
604{ 604{
605 int type;
606 struct page *page = pfn_to_page(start_pfn); 605 struct page *page = pfn_to_page(start_pfn);
607 struct page *end_page = page + nr_pages; 606 struct page *end_page = page + nr_pages;
608 607
609 /* Check the starting page of each pageblock within the range */ 608 /* Check the starting page of each pageblock within the range */
610 for (; page < end_page; page = next_active_pageblock(page)) { 609 for (; page < end_page; page = next_active_pageblock(page)) {
611 type = get_pageblock_migratetype(page); 610 if (!is_pageblock_removable_nolock(page))
612
613 /*
614 * A pageblock containing MOVABLE or free pages is considered
615 * removable
616 */
617 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
618 return 0;
619
620 /*
621 * A pageblock starting with a PageReserved page is not
622 * considered removable.
623 */
624 if (PageReserved(page))
625 return 0; 611 return 0;
612 cond_resched();
626 } 613 }
627 614
628 /* All pageblocks in the memory block are likely to be hot-removable */ 615 /* All pageblocks in the memory block are likely to be hot-removable */
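
The rewritten next_active_pageblock() above no longer trusts page_order() blindly: without the zone lock the order can change underneath it, so the large stride is taken only when the value read is still in the sane [pageblock_order, MAX_ORDER) window, and otherwise the scan advances by exactly one pageblock. A hedged userspace sketch of that stride decision (constants and sample values invented):

#include <stdio.h>

#define PAGEBLOCK_ORDER 9
#define MAX_ORDER       11

/* Stride to the next pageblock; trust a free block's order only if sane. */
static long stride_pages(int order)
{
        if (order >= PAGEBLOCK_ORDER && order < MAX_ORDER)
                return 1L << order;             /* skip the whole free buddy block */
        return 1L << PAGEBLOCK_ORDER;           /* otherwise: exactly one pageblock */
}

int main(void)
{
        int samples[] = { -1, 9, 10, 37 };      /* 37 models a racy/garbage read */

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("order %3d -> stride %ld pages\n",
                       samples[i], stride_pages(samples[i]));
        return 0;
}
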
@@ -659,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
659 * Scanning pfn is much easier than scanning lru list. 646 * Scanning pfn is much easier than scanning lru list.
660 * Scan pfn from start to end and Find LRU page. 647 * Scan pfn from start to end and Find LRU page.
661 */ 648 */
662int scan_lru_pages(unsigned long start, unsigned long end) 649static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
663{ 650{
664 unsigned long pfn; 651 unsigned long pfn;
665 struct page *page; 652 struct page *page;
@@ -709,29 +696,30 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
709 page_is_file_cache(page)); 696 page_is_file_cache(page));
710 697
711 } else { 698 } else {
712 /* Because we don't have big zone->lock, we should
712 /* Because we don't have big zone->lock, we should
713 check this again here. */
714 if (page_count(page))
715 not_managed++;
716#ifdef CONFIG_DEBUG_VM 699#ifdef CONFIG_DEBUG_VM
717 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 700 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
718 pfn); 701 pfn);
719 dump_page(page); 702 dump_page(page);
720#endif 703#endif
 704 /* Because we don't have big zone->lock, we should
705 check this again here. */
706 if (page_count(page)) {
707 not_managed++;
708 ret = -EBUSY;
709 break;
710 }
721 } 711 }
722 } 712 }
723 ret = -EBUSY; 713 if (!list_empty(&source)) {
724 if (not_managed) { 714 if (not_managed) {
725 if (!list_empty(&source)) 715 putback_lru_pages(&source);
716 goto out;
717 }
718 /* this function returns # of failed pages */
719 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
720 if (ret)
726 putback_lru_pages(&source); 721 putback_lru_pages(&source);
727 goto out;
728 } 722 }
729 ret = 0;
730 if (list_empty(&source))
731 goto out;
732 /* this function returns # of failed pages */
733 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
734
735out: 723out:
736 return ret; 724 return ret;
737} 725}
@@ -840,7 +828,6 @@ repeat:
840 ret = 0; 828 ret = 0;
841 if (drain) { 829 if (drain) {
842 lru_add_drain_all(); 830 lru_add_drain_all();
843 flush_scheduled_work();
844 cond_resched(); 831 cond_resched();
845 drain_all_pages(); 832 drain_all_pages();
846 } 833 }
@@ -862,7 +849,6 @@ repeat:
862 } 849 }
863 /* drain all zone's lru pagevec, this is asynchronous... */ 850 /* drain all zone's lru pagevec, this is asynchronous... */
864 lru_add_drain_all(); 851 lru_add_drain_all();
865 flush_scheduled_work();
866 yield(); 852 yield();
867 /* drain pcp pages, this is synchronous. */ 853 /* drain pcp pages, this is synchronous. */
868 drain_all_pages(); 854 drain_all_pages();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb5..4a57f135b76 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -924,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 924 nodemask_t nmask;
925 LIST_HEAD(pagelist); 925 LIST_HEAD(pagelist);
926 int err = 0; 926 int err = 0;
927 struct vm_area_struct *vma;
927 928
928 nodes_clear(nmask); 929 nodes_clear(nmask);
929 node_set(source, nmask); 930 node_set(source, nmask);
930 931
931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
932 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 933 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
933 936
934 if (!list_empty(&pagelist)) 937 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 0); 938 err = migrate_pages(&pagelist, new_node_page, dest, 0);
939 if (err)
940 putback_lru_pages(&pagelist);
941 }
936 942
937 return err; 943 return err;
938} 944}
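
This hunk reflects the new migrate_pages() contract introduced later in the series: the migration core no longer puts failed pages back on the LRU itself, so every caller (migrate_to_node() here, plus do_mbind(), do_move_page_to_node_array() and do_migrate_range() elsewhere in the patch) must call putback_lru_pages() when a non-zero failure count comes back. A hedged userspace model of that "consume on success, caller reclaims the leftovers" convention, with all names invented:

#include <stdio.h>
#include <stdlib.h>

struct item { int id; struct item *next; };

/* Move items whose id is even; failures (odd ids) stay on *list. */
static int migrate_list(struct item **list)
{
        struct item *it = *list, *keep = NULL;
        int failed = 0;

        while (it) {
                struct item *next = it->next;
                if (it->id % 2 == 0) {
                        free(it);               /* "migrated": item consumed */
                } else {
                        it->next = keep;        /* failure stays on the list */
                        keep = it;
                        failed++;
                }
                it = next;
        }
        *list = keep;
        return failed;
}

static void putback_all(struct item **list)
{
        while (*list) {
                struct item *it = *list;
                *list = it->next;
                printf("putting back item %d\n", it->id);
                free(it);
        }
}

int main(void)
{
        struct item *list = NULL;

        for (int id = 0; id < 5; id++) {
                struct item *it = malloc(sizeof(*it));
                if (!it)
                        return 1;
                it->id = id;
                it->next = list;
                list = it;
        }

        int failed = migrate_list(&list);
        if (failed)                             /* mirrors: if (err) putback_lru_pages() */
                putback_all(&list);
        return 0;
}
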
@@ -1147,9 +1153,12 @@ static long do_mbind(unsigned long start, unsigned long len,
1147 1153
1148 err = mbind_range(mm, start, end, new); 1154 err = mbind_range(mm, start, end, new);
1149 1155
1150 if (!list_empty(&pagelist)) 1156 if (!list_empty(&pagelist)) {
1151 nr_failed = migrate_pages(&pagelist, new_vma_page, 1157 nr_failed = migrate_pages(&pagelist, new_vma_page,
1152 (unsigned long)vma, 0); 1158 (unsigned long)vma, 0);
1159 if (nr_failed)
1160 putback_lru_pages(&pagelist);
1161 }
1153 1162
1154 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1163 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1155 err = -EIO; 1164 err = -EIO;
@@ -1275,33 +1284,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1275 const unsigned long __user *, new_nodes) 1284 const unsigned long __user *, new_nodes)
1276{ 1285{
1277 const struct cred *cred = current_cred(), *tcred; 1286 const struct cred *cred = current_cred(), *tcred;
1278 struct mm_struct *mm; 1287 struct mm_struct *mm = NULL;
1279 struct task_struct *task; 1288 struct task_struct *task;
1280 nodemask_t old;
1281 nodemask_t new;
1282 nodemask_t task_nodes; 1289 nodemask_t task_nodes;
1283 int err; 1290 int err;
1291 nodemask_t *old;
1292 nodemask_t *new;
1293 NODEMASK_SCRATCH(scratch);
1284 1294
1285 err = get_nodes(&old, old_nodes, maxnode); 1295 if (!scratch)
1296 return -ENOMEM;
1297
1298 old = &scratch->mask1;
1299 new = &scratch->mask2;
1300
1301 err = get_nodes(old, old_nodes, maxnode);
1286 if (err) 1302 if (err)
1287 return err; 1303 goto out;
1288 1304
1289 err = get_nodes(&new, new_nodes, maxnode); 1305 err = get_nodes(new, new_nodes, maxnode);
1290 if (err) 1306 if (err)
1291 return err; 1307 goto out;
1292 1308
1293 /* Find the mm_struct */ 1309 /* Find the mm_struct */
1294 read_lock(&tasklist_lock); 1310 read_lock(&tasklist_lock);
1295 task = pid ? find_task_by_vpid(pid) : current; 1311 task = pid ? find_task_by_vpid(pid) : current;
1296 if (!task) { 1312 if (!task) {
1297 read_unlock(&tasklist_lock); 1313 read_unlock(&tasklist_lock);
1298 return -ESRCH; 1314 err = -ESRCH;
1315 goto out;
1299 } 1316 }
1300 mm = get_task_mm(task); 1317 mm = get_task_mm(task);
1301 read_unlock(&tasklist_lock); 1318 read_unlock(&tasklist_lock);
1302 1319
1320 err = -EINVAL;
1303 if (!mm) 1321 if (!mm)
1304 return -EINVAL; 1322 goto out;
1305 1323
1306 /* 1324 /*
1307 * Check if this process has the right to modify the specified 1325 * Check if this process has the right to modify the specified
@@ -1322,12 +1340,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 1340
1323 task_nodes = cpuset_mems_allowed(task); 1341 task_nodes = cpuset_mems_allowed(task);
1324 /* Is the user allowed to access the target nodes? */ 1342 /* Is the user allowed to access the target nodes? */
1325 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { 1343 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1326 err = -EPERM; 1344 err = -EPERM;
1327 goto out; 1345 goto out;
1328 } 1346 }
1329 1347
1330 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { 1348 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1331 err = -EINVAL; 1349 err = -EINVAL;
1332 goto out; 1350 goto out;
1333 } 1351 }
@@ -1336,10 +1354,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1336 if (err) 1354 if (err)
1337 goto out; 1355 goto out;
1338 1356
1339 err = do_migrate_pages(mm, &old, &new, 1357 err = do_migrate_pages(mm, old, new,
1340 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1358 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1341out: 1359out:
1342 mmput(mm); 1360 if (mm)
1361 mmput(mm);
1362 NODEMASK_SCRATCH_FREE(scratch);
1363
1343 return err; 1364 return err;
1344} 1365}
1345 1366
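
The sys_migrate_pages() rework above replaces two on-stack nodemask_t variables with masks carved out of a single heap-allocated NODEMASK_SCRATCH, and routes every failure through one out: label that frees the scratch and drops the mm reference only if it was taken. A hedged userspace sketch of the same pattern (sizes and field names invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_NODES 1024                  /* big enough to matter on a kernel stack */

struct nodemask { unsigned long bits[MAX_NODES / (8 * sizeof(unsigned long))]; };
struct scratch  { struct nodemask mask1, mask2; };

static int parse_mask(struct nodemask *mask, const char *spec)
{
        memset(mask, 0, sizeof(*mask));
        if (!spec)
                return -1;              /* stands in for a copy-from-user error */
        mask->bits[0] = strtoul(spec, NULL, 0);
        return 0;
}

static int do_migrate(const char *old_spec, const char *new_spec)
{
        struct scratch *scratch = malloc(sizeof(*scratch));
        struct nodemask *old, *new;
        int err;

        if (!scratch)
                return -1;
        old = &scratch->mask1;
        new = &scratch->mask2;

        err = parse_mask(old, old_spec);
        if (err)
                goto out;
        err = parse_mask(new, new_spec);
        if (err)
                goto out;

        printf("migrating 0x%lx -> 0x%lx\n", old->bits[0], new->bits[0]);
        err = 0;
out:
        free(scratch);                  /* one exit path frees the scratch */
        return err;
}

int main(void)
{
        do_migrate("0x3", "0xc");       /* success path */
        do_migrate("0x1", NULL);        /* error path still frees the scratch */
        return 0;
}
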
@@ -1576,7 +1597,7 @@ unsigned slab_node(struct mempolicy *policy)
1576 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1597 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1577 &policy->v.nodes, 1598 &policy->v.nodes,
1578 &zone); 1599 &zone);
1579 return zone->node; 1600 return zone ? zone->node : numa_node_id();
1580 } 1601 }
1581 1602
1582 default: 1603 default:
@@ -1712,6 +1733,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1712} 1733}
1713#endif 1734#endif
1714 1735
1736/*
1737 * mempolicy_nodemask_intersects
1738 *
1739 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1740 * policy. Otherwise, check for intersection between mask and the policy
 1741 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1742 * policy, always return true since it may allocate elsewhere on fallback.
1743 *
1744 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1745 */
1746bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1747 const nodemask_t *mask)
1748{
1749 struct mempolicy *mempolicy;
1750 bool ret = true;
1751
1752 if (!mask)
1753 return ret;
1754 task_lock(tsk);
1755 mempolicy = tsk->mempolicy;
1756 if (!mempolicy)
1757 goto out;
1758
1759 switch (mempolicy->mode) {
1760 case MPOL_PREFERRED:
1761 /*
1762 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1763 * allocate from, they may fallback to other nodes when oom.
1764 * Thus, it's possible for tsk to have allocated memory from
1765 * nodes in mask.
1766 */
1767 break;
1768 case MPOL_BIND:
1769 case MPOL_INTERLEAVE:
1770 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1771 break;
1772 default:
1773 BUG();
1774 }
1775out:
1776 task_unlock(tsk);
1777 return ret;
1778}
1779
1715/* Allocate a page in interleaved policy. 1780/* Allocate a page in interleaved policy.
1716 Own path because it needs to do special accounting. */ 1781 Own path because it needs to do special accounting. */
1717static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1782static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
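
The new mempolicy_nodemask_intersects() answers one question for the OOM killer: could this task plausibly hold memory on any node in the given mask? No policy, preferred and local policies answer yes because they may fall back to any node, while bind and interleave policies are checked for a real nodemask intersection. A hedged standalone model of that decision table using plain bitmasks (everything except the mode names is invented):

#include <stdbool.h>
#include <stdio.h>

enum mode { MODE_DEFAULT, MODE_PREFERRED, MODE_BIND, MODE_INTERLEAVE };

static bool policy_intersects(enum mode mode, unsigned long policy_nodes,
                              unsigned long mask)
{
        if (!mask)
                return true;            /* no constraint to check against */

        switch (mode) {
        case MODE_DEFAULT:
        case MODE_PREFERRED:
                /* may fall back to any node under pressure, so assume overlap */
                return true;
        case MODE_BIND:
        case MODE_INTERLEAVE:
                /* allocations are confined to policy_nodes: need a real overlap */
                return (policy_nodes & mask) != 0;
        }
        return true;
}

int main(void)
{
        printf("%d\n", policy_intersects(MODE_BIND, 0x3, 0x4));       /* 0 */
        printf("%d\n", policy_intersects(MODE_INTERLEAVE, 0x3, 0x2)); /* 1 */
        printf("%d\n", policy_intersects(MODE_PREFERRED, 0x1, 0x8));  /* 1 */
        return 0;
}
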
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049..fe5a3c6a542 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
35#include <linux/gfp.h> 36#include <linux/gfp.h>
36 37
37#include "internal.h" 38#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
95 pte_t *ptep, pte; 96 pte_t *ptep, pte;
96 spinlock_t *ptl; 97 spinlock_t *ptl;
97 98
98 pgd = pgd_offset(mm, addr); 99 if (unlikely(PageHuge(new))) {
99 if (!pgd_present(*pgd)) 100 ptep = huge_pte_offset(mm, addr);
100 goto out; 101 if (!ptep)
102 goto out;
103 ptl = &mm->page_table_lock;
104 } else {
105 pgd = pgd_offset(mm, addr);
106 if (!pgd_present(*pgd))
107 goto out;
101 108
102 pud = pud_offset(pgd, addr); 109 pud = pud_offset(pgd, addr);
103 if (!pud_present(*pud)) 110 if (!pud_present(*pud))
104 goto out; 111 goto out;
105 112
106 pmd = pmd_offset(pud, addr); 113 pmd = pmd_offset(pud, addr);
107 if (!pmd_present(*pmd)) 114 if (!pmd_present(*pmd))
108 goto out; 115 goto out;
109 116
110 ptep = pte_offset_map(pmd, addr); 117 ptep = pte_offset_map(pmd, addr);
111 118
112 if (!is_swap_pte(*ptep)) { 119 if (!is_swap_pte(*ptep)) {
113 pte_unmap(ptep); 120 pte_unmap(ptep);
114 goto out; 121 goto out;
115 } 122 }
123
124 ptl = pte_lockptr(mm, pmd);
125 }
116 126
117 ptl = pte_lockptr(mm, pmd);
118 spin_lock(ptl); 127 spin_lock(ptl);
119 pte = *ptep; 128 pte = *ptep;
120 if (!is_swap_pte(pte)) 129 if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 if (is_write_migration_entry(entry)) 140 if (is_write_migration_entry(entry))
132 pte = pte_mkwrite(pte); 141 pte = pte_mkwrite(pte);
142#ifdef CONFIG_HUGETLB_PAGE
143 if (PageHuge(new))
144 pte = pte_mkhuge(pte);
145#endif
133 flush_cache_page(vma, addr, pte_pfn(pte)); 146 flush_cache_page(vma, addr, pte_pfn(pte));
134 set_pte_at(mm, addr, ptep, pte); 147 set_pte_at(mm, addr, ptep, pte);
135 148
136 if (PageAnon(new)) 149 if (PageHuge(new)) {
150 if (PageAnon(new))
151 hugepage_add_anon_rmap(new, vma, addr);
152 else
153 page_dup_rmap(new);
154 } else if (PageAnon(new))
137 page_add_anon_rmap(new, vma, addr); 155 page_add_anon_rmap(new, vma, addr);
138 else 156 else
139 page_add_file_rmap(new); 157 page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
276} 294}
277 295
278/* 296/*
297 * The expected number of remaining references is the same as that
298 * of migrate_page_move_mapping().
299 */
300int migrate_huge_page_move_mapping(struct address_space *mapping,
301 struct page *newpage, struct page *page)
302{
303 int expected_count;
304 void **pslot;
305
306 if (!mapping) {
307 if (page_count(page) != 1)
308 return -EAGAIN;
309 return 0;
310 }
311
312 spin_lock_irq(&mapping->tree_lock);
313
314 pslot = radix_tree_lookup_slot(&mapping->page_tree,
315 page_index(page));
316
317 expected_count = 2 + page_has_private(page);
318 if (page_count(page) != expected_count ||
319 (struct page *)radix_tree_deref_slot(pslot) != page) {
320 spin_unlock_irq(&mapping->tree_lock);
321 return -EAGAIN;
322 }
323
324 if (!page_freeze_refs(page, expected_count)) {
325 spin_unlock_irq(&mapping->tree_lock);
326 return -EAGAIN;
327 }
328
329 get_page(newpage);
330
331 radix_tree_replace_slot(pslot, newpage);
332
333 page_unfreeze_refs(page, expected_count);
334
335 __put_page(page);
336
337 spin_unlock_irq(&mapping->tree_lock);
338 return 0;
339}
340
341/*
279 * Copy the page to its new location 342 * Copy the page to its new location
280 */ 343 */
281static void migrate_page_copy(struct page *newpage, struct page *page) 344void migrate_page_copy(struct page *newpage, struct page *page)
282{ 345{
283 copy_highpage(newpage, page); 346 if (PageHuge(page))
347 copy_huge_page(newpage, page);
348 else
349 copy_highpage(newpage, page);
284 350
285 if (PageError(page)) 351 if (PageError(page))
286 SetPageError(newpage); 352 SetPageError(newpage);
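
migrate_huge_page_move_mapping() only swaps the radix-tree slot when the page's reference count is exactly the expected value; page_freeze_refs() is effectively an atomic compare-and-exchange of the count against zero, and any extra reference makes the whole migration bail out with -EAGAIN. A hedged C11 model of that check (not the kernel's page_freeze_refs()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool freeze_refs(atomic_int *count, int expected)
{
        int old = expected;
        /* succeeds only if *count == expected, and then drops it to 0 */
        return atomic_compare_exchange_strong(count, &old, 0);
}

int main(void)
{
        atomic_int refs = 3;            /* e.g. 2 + page_has_private() */

        if (!freeze_refs(&refs, 2))     /* extra reference: caller sees -EAGAIN */
                printf("busy, retry later\n");

        if (freeze_refs(&refs, 3)) {    /* expected count: safe to replace the slot */
                printf("frozen, replacing mapping slot\n");
                atomic_store(&refs, 3 - 1);   /* unfreeze minus the reference handed over */
        }
        return 0;
}
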
@@ -431,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page)
431 .nr_to_write = 1, 497 .nr_to_write = 1,
432 .range_start = 0, 498 .range_start = 0,
433 .range_end = LLONG_MAX, 499 .range_end = LLONG_MAX,
434 .nonblocking = 1,
435 .for_reclaim = 1 500 .for_reclaim = 1
436 }; 501 };
437 int rc; 502 int rc;
@@ -639,7 +704,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 * exist when the page is remapped later 704 * exist when the page is remapped later
640 */ 705 */
641 anon_vma = page_anon_vma(page); 706 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount); 707 get_anon_vma(anon_vma);
643 } 708 }
644 } 709 }
645 710
@@ -682,12 +747,8 @@ skip_unmap:
682rcu_unlock: 747rcu_unlock:
683 748
684 /* Drop an anon_vma reference if we took one */ 749 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 750 if (anon_vma)
686 int empty = list_empty(&anon_vma->head); 751 drop_anon_vma(anon_vma);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691 752
692 if (rcu_locked) 753 if (rcu_locked)
693 rcu_read_unlock(); 754 rcu_read_unlock();
@@ -728,6 +789,92 @@ move_newpage:
728} 789}
729 790
730/* 791/*
792 * Counterpart of unmap_and_move_page() for hugepage migration.
793 *
 794 * This function doesn't wait for the completion of hugepage I/O
795 * because there is no race between I/O and migration for hugepage.
796 * Note that currently hugepage I/O occurs only in direct I/O
797 * where no lock is held and PG_writeback is irrelevant,
798 * and writeback status of all subpages are counted in the reference
799 * count of the head page (i.e. if all subpages of a 2MB hugepage are
800 * under direct I/O, the reference of the head page is 512 and a bit more.)
801 * This means that when we try to migrate hugepage whose subpages are
802 * doing direct I/O, some references remain after try_to_unmap() and
803 * hugepage migration fails without data corruption.
804 *
805 * There is also no race when direct I/O is issued on the page under migration,
806 * because then pte is replaced with migration swap entry and direct I/O code
807 * will wait in the page fault for migration to complete.
808 */
809static int unmap_and_move_huge_page(new_page_t get_new_page,
810 unsigned long private, struct page *hpage,
811 int force, int offlining)
812{
813 int rc = 0;
814 int *result = NULL;
815 struct page *new_hpage = get_new_page(hpage, private, &result);
816 int rcu_locked = 0;
817 struct anon_vma *anon_vma = NULL;
818
819 if (!new_hpage)
820 return -ENOMEM;
821
822 rc = -EAGAIN;
823
824 if (!trylock_page(hpage)) {
825 if (!force)
826 goto out;
827 lock_page(hpage);
828 }
829
830 if (PageAnon(hpage)) {
831 rcu_read_lock();
832 rcu_locked = 1;
833
834 if (page_mapped(hpage)) {
835 anon_vma = page_anon_vma(hpage);
836 atomic_inc(&anon_vma->external_refcount);
837 }
838 }
839
840 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
841
842 if (!page_mapped(hpage))
843 rc = move_to_new_page(new_hpage, hpage, 1);
844
845 if (rc)
846 remove_migration_ptes(hpage, hpage);
847
848 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
849 &anon_vma->lock)) {
850 int empty = list_empty(&anon_vma->head);
851 spin_unlock(&anon_vma->lock);
852 if (empty)
853 anon_vma_free(anon_vma);
854 }
855
856 if (rcu_locked)
857 rcu_read_unlock();
858out:
859 unlock_page(hpage);
860
861 if (rc != -EAGAIN) {
862 list_del(&hpage->lru);
863 put_page(hpage);
864 }
865
866 put_page(new_hpage);
867
868 if (result) {
869 if (rc)
870 *result = rc;
871 else
872 *result = page_to_nid(new_hpage);
873 }
874 return rc;
875}
876
877/*
731 * migrate_pages 878 * migrate_pages
732 * 879 *
733 * The function takes one list of pages to migrate and a function 880 * The function takes one list of pages to migrate and a function
@@ -736,8 +883,9 @@ move_newpage:
736 * 883 *
737 * The function returns after 10 attempts or if no pages 884 * The function returns after 10 attempts or if no pages
738 * are movable anymore because to has become empty 885 * are movable anymore because to has become empty
739 * or no retryable pages exist anymore. All pages will be 886 * or no retryable pages exist anymore.
740 * returned to the LRU or freed. 887 * Caller should call putback_lru_pages to return pages to the LRU
888 * or free list.
741 * 889 *
742 * Return: Number of pages not migrated or error code. 890 * Return: Number of pages not migrated or error code.
743 */ 891 */
@@ -784,7 +932,51 @@ out:
784 if (!swapwrite) 932 if (!swapwrite)
785 current->flags &= ~PF_SWAPWRITE; 933 current->flags &= ~PF_SWAPWRITE;
786 934
787 putback_lru_pages(from); 935 if (rc)
936 return rc;
937
938 return nr_failed + retry;
939}
940
941int migrate_huge_pages(struct list_head *from,
942 new_page_t get_new_page, unsigned long private, int offlining)
943{
944 int retry = 1;
945 int nr_failed = 0;
946 int pass = 0;
947 struct page *page;
948 struct page *page2;
949 int rc;
950
951 for (pass = 0; pass < 10 && retry; pass++) {
952 retry = 0;
953
954 list_for_each_entry_safe(page, page2, from, lru) {
955 cond_resched();
956
957 rc = unmap_and_move_huge_page(get_new_page,
958 private, page, pass > 2, offlining);
959
960 switch(rc) {
961 case -ENOMEM:
962 goto out;
963 case -EAGAIN:
964 retry++;
965 break;
966 case 0:
967 break;
968 default:
969 /* Permanent failure */
970 nr_failed++;
971 break;
972 }
973 }
974 }
975 rc = 0;
976out:
977
978 list_for_each_entry_safe(page, page2, from, lru)
979 put_page(page);
788 980
789 if (rc) 981 if (rc)
790 return rc; 982 return rc;
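
Like migrate_pages(), the new migrate_huge_pages() makes up to ten passes over the list, retries -EAGAIN results, starts forcing the page lock after the third pass, aborts everything on -ENOMEM, and counts any other error as a permanent failure; the caller then releases whatever is left. A hedged userspace model of that pass/retry shape (the per-item outcomes are invented):

#include <stdio.h>

enum { OK = 0, AGAIN = -1, NOMEM = -2, FAIL = -3 };

/* Each item fails transiently a fixed number of times before its outcome. */
struct item { int transient_left; int final; int done; };

static int try_one(struct item *it, int force)
{
        if (it->transient_left > 0 && !force) {
                it->transient_left--;
                return AGAIN;
        }
        return it->final;
}

int main(void)
{
        struct item items[] = {
                { 0, OK,   0 },         /* migrates on the first pass */
                { 4, OK,   0 },         /* needs the "force" kick after pass 2 */
                { 0, FAIL, 0 },         /* permanent failure */
        };
        int nr_items = sizeof(items) / sizeof(items[0]);
        int nr_failed = 0, retry = 1, pass;

        for (pass = 0; pass < 10 && retry; pass++) {
                retry = 0;
                for (int i = 0; i < nr_items; i++) {
                        int rc;

                        if (items[i].done)
                                continue;
                        rc = try_one(&items[i], pass > 2);
                        switch (rc) {
                        case NOMEM:
                                goto out;       /* hard failure: stop everything */
                        case AGAIN:
                                retry++;        /* try again on a later pass */
                                break;
                        case OK:
                                items[i].done = 1;
                                break;
                        default:
                                nr_failed++;    /* permanent failure for this item */
                                items[i].done = 1;
                                break;
                        }
                }
        }
out:
        printf("passes used: %d, permanent failures: %d, still retrying: %d\n",
               pass, nr_failed, retry);
        return 0;
}
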
@@ -845,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
845 1037
846 err = -EFAULT; 1038 err = -EFAULT;
847 vma = find_vma(mm, pp->addr); 1039 vma = find_vma(mm, pp->addr);
848 if (!vma || !vma_migratable(vma)) 1040 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
849 goto set_status; 1041 goto set_status;
850 1042
851 page = follow_page(vma, pp->addr, FOLL_GET); 1043 page = follow_page(vma, pp->addr, FOLL_GET);
@@ -894,9 +1086,12 @@ set_status:
894 } 1086 }
895 1087
896 err = 0; 1088 err = 0;
897 if (!list_empty(&pagelist)) 1089 if (!list_empty(&pagelist)) {
898 err = migrate_pages(&pagelist, new_page_node, 1090 err = migrate_pages(&pagelist, new_page_node,
899 (unsigned long)pm, 0); 1091 (unsigned long)pm, 0);
1092 if (err)
1093 putback_lru_pages(&pagelist);
1094 }
900 1095
901 up_read(&mm->mmap_sem); 1096 up_read(&mm->mmap_sem);
902 return err; 1097 return err;
@@ -1009,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1009 int err = -EFAULT; 1204 int err = -EFAULT;
1010 1205
1011 vma = find_vma(mm, addr); 1206 vma = find_vma(mm, addr);
1012 if (!vma) 1207 if (!vma || addr < vma->vm_start)
1013 goto set_status; 1208 goto set_status;
1014 1209
1015 page = follow_page(vma, addr, 0); 1210 page = follow_page(vma, addr, 0);
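
Both move_pages() paths above gain the same guard because find_vma() returns the first VMA whose end lies above the address; if the address falls into a hole, that VMA starts beyond it, so the extra addr < vma->vm_start test is what actually rejects unmapped addresses. A hedged standalone sketch of why the check is needed (sample ranges invented):

#include <stdio.h>

struct range { unsigned long start, end; };     /* [start, end) */

static const struct range ranges[] = {
        { 0x1000, 0x3000 },
        { 0x8000, 0x9000 },
};

/* find_vma()-style lookup: first range whose end is above addr. */
static const struct range *find_range(unsigned long addr)
{
        for (size_t i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
                if (addr < ranges[i].end)
                        return &ranges[i];
        return NULL;
}

int main(void)
{
        unsigned long addr = 0x5000;            /* falls in the hole */
        const struct range *r = find_range(addr);

        if (!r || addr < r->start)              /* the extra check in the patch */
                printf("0x%lx is not mapped\n", addr);
        else
                printf("0x%lx is in [0x%lx, 0x%lx)\n", addr, r->start, r->end);
        return 0;
}
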
diff --git a/mm/mlock.c b/mm/mlock.c
index 3f82720e051..b70919ce4f7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,6 +135,13 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
139{
140 return (vma->vm_flags & VM_GROWSDOWN) &&
141 (vma->vm_start == addr) &&
142 !vma_stack_continue(vma->vm_prev, addr);
143}
144
138/** 145/**
139 * __mlock_vma_pages_range() - mlock a range of pages in the vma. 146 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
140 * @vma: target vma 147 * @vma: target vma
@@ -167,6 +174,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
167 if (vma->vm_flags & VM_WRITE) 174 if (vma->vm_flags & VM_WRITE)
168 gup_flags |= FOLL_WRITE; 175 gup_flags |= FOLL_WRITE;
169 176
177 /* We don't try to access the guard page of a stack vma */
178 if (stack_guard_page(vma, start)) {
179 addr += PAGE_SIZE;
180 nr_pages--;
181 }
182
170 while (nr_pages > 0) { 183 while (nr_pages > 0) {
171 int i; 184 int i;
172 185
diff --git a/mm/mmap.c b/mm/mmap.c
index 456ec6f2788..b179abb1474 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -388,17 +389,23 @@ static inline void
388__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 389__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
389 struct vm_area_struct *prev, struct rb_node *rb_parent) 390 struct vm_area_struct *prev, struct rb_node *rb_parent)
390{ 391{
392 struct vm_area_struct *next;
393
394 vma->vm_prev = prev;
391 if (prev) { 395 if (prev) {
392 vma->vm_next = prev->vm_next; 396 next = prev->vm_next;
393 prev->vm_next = vma; 397 prev->vm_next = vma;
394 } else { 398 } else {
395 mm->mmap = vma; 399 mm->mmap = vma;
396 if (rb_parent) 400 if (rb_parent)
397 vma->vm_next = rb_entry(rb_parent, 401 next = rb_entry(rb_parent,
398 struct vm_area_struct, vm_rb); 402 struct vm_area_struct, vm_rb);
399 else 403 else
400 vma->vm_next = NULL; 404 next = NULL;
401 } 405 }
406 vma->vm_next = next;
407 if (next)
408 next->vm_prev = vma;
402} 409}
403 410
404void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 411void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -452,12 +459,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
452 spin_lock(&mapping->i_mmap_lock); 459 spin_lock(&mapping->i_mmap_lock);
453 vma->vm_truncate_count = mapping->truncate_count; 460 vma->vm_truncate_count = mapping->truncate_count;
454 } 461 }
455 anon_vma_lock(vma);
456 462
457 __vma_link(mm, vma, prev, rb_link, rb_parent); 463 __vma_link(mm, vma, prev, rb_link, rb_parent);
458 __vma_link_file(vma); 464 __vma_link_file(vma);
459 465
460 anon_vma_unlock(vma);
461 if (mapping) 466 if (mapping)
462 spin_unlock(&mapping->i_mmap_lock); 467 spin_unlock(&mapping->i_mmap_lock);
463 468
@@ -485,7 +490,11 @@ static inline void
485__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 490__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
486 struct vm_area_struct *prev) 491 struct vm_area_struct *prev)
487{ 492{
488 prev->vm_next = vma->vm_next; 493 struct vm_area_struct *next = vma->vm_next;
494
495 prev->vm_next = next;
496 if (next)
497 next->vm_prev = prev;
489 rb_erase(&vma->vm_rb, &mm->mm_rb); 498 rb_erase(&vma->vm_rb, &mm->mm_rb);
490 if (mm->mmap_cache == vma) 499 if (mm->mmap_cache == vma)
491 mm->mmap_cache = prev; 500 mm->mmap_cache = prev;
@@ -506,6 +515,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
506 struct vm_area_struct *importer = NULL; 515 struct vm_area_struct *importer = NULL;
507 struct address_space *mapping = NULL; 516 struct address_space *mapping = NULL;
508 struct prio_tree_root *root = NULL; 517 struct prio_tree_root *root = NULL;
518 struct anon_vma *anon_vma = NULL;
509 struct file *file = vma->vm_file; 519 struct file *file = vma->vm_file;
510 long adjust_next = 0; 520 long adjust_next = 0;
511 int remove_next = 0; 521 int remove_next = 0;
@@ -578,6 +588,17 @@ again: remove_next = 1 + (end > next->vm_end);
578 } 588 }
579 } 589 }
580 590
591 /*
592 * When changing only vma->vm_end, we don't really need anon_vma
593 * lock. This is a fairly rare case by itself, but the anon_vma
594 * lock may be shared between many sibling processes. Skipping
595 * the lock for brk adjustments makes a difference sometimes.
596 */
597 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
598 anon_vma = vma->anon_vma;
599 anon_vma_lock(anon_vma);
600 }
601
581 if (root) { 602 if (root) {
582 flush_dcache_mmap_lock(mapping); 603 flush_dcache_mmap_lock(mapping);
583 vma_prio_tree_remove(vma, root); 604 vma_prio_tree_remove(vma, root);
@@ -617,6 +638,8 @@ again: remove_next = 1 + (end > next->vm_end);
617 __insert_vm_struct(mm, insert); 638 __insert_vm_struct(mm, insert);
618 } 639 }
619 640
641 if (anon_vma)
642 anon_vma_unlock(anon_vma);
620 if (mapping) 643 if (mapping)
621 spin_unlock(&mapping->i_mmap_lock); 644 spin_unlock(&mapping->i_mmap_lock);
622 645
@@ -1086,6 +1109,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1086 unsigned long retval = -EBADF; 1109 unsigned long retval = -EBADF;
1087 1110
1088 if (!(flags & MAP_ANONYMOUS)) { 1111 if (!(flags & MAP_ANONYMOUS)) {
1112 audit_mmap_fd(fd, flags);
1089 if (unlikely(flags & MAP_HUGETLB)) 1113 if (unlikely(flags & MAP_HUGETLB))
1090 return -EINVAL; 1114 return -EINVAL;
1091 file = fget(fd); 1115 file = fget(fd);
@@ -1694,9 +1718,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1694 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 1718 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
1695 * vma is the last one with address > vma->vm_end. Have to extend vma. 1719 * vma is the last one with address > vma->vm_end. Have to extend vma.
1696 */ 1720 */
1697#ifndef CONFIG_IA64
1698static
1699#endif
1700int expand_upwards(struct vm_area_struct *vma, unsigned long address) 1721int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1701{ 1722{
1702 int error; 1723 int error;
@@ -1710,7 +1731,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1710 */ 1731 */
1711 if (unlikely(anon_vma_prepare(vma))) 1732 if (unlikely(anon_vma_prepare(vma)))
1712 return -ENOMEM; 1733 return -ENOMEM;
1713 anon_vma_lock(vma); 1734 vma_lock_anon_vma(vma);
1714 1735
1715 /* 1736 /*
1716 * vma->vm_start/vm_end cannot change under us because the caller 1737 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1742,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1721 if (address < PAGE_ALIGN(address+4)) 1742 if (address < PAGE_ALIGN(address+4))
1722 address = PAGE_ALIGN(address+4); 1743 address = PAGE_ALIGN(address+4);
1723 else { 1744 else {
1724 anon_vma_unlock(vma); 1745 vma_unlock_anon_vma(vma);
1725 return -ENOMEM; 1746 return -ENOMEM;
1726 } 1747 }
1727 error = 0; 1748 error = 0;
@@ -1734,10 +1755,12 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1734 grow = (address - vma->vm_end) >> PAGE_SHIFT; 1755 grow = (address - vma->vm_end) >> PAGE_SHIFT;
1735 1756
1736 error = acct_stack_growth(vma, size, grow); 1757 error = acct_stack_growth(vma, size, grow);
1737 if (!error) 1758 if (!error) {
1738 vma->vm_end = address; 1759 vma->vm_end = address;
1760 perf_event_mmap(vma);
1761 }
1739 } 1762 }
1740 anon_vma_unlock(vma); 1763 vma_unlock_anon_vma(vma);
1741 return error; 1764 return error;
1742} 1765}
1743#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1766#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1762,7 +1785,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1762 if (error) 1785 if (error)
1763 return error; 1786 return error;
1764 1787
1765 anon_vma_lock(vma); 1788 vma_lock_anon_vma(vma);
1766 1789
1767 /* 1790 /*
1768 * vma->vm_start/vm_end cannot change under us because the caller 1791 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1781,9 +1804,10 @@ static int expand_downwards(struct vm_area_struct *vma,
1781 if (!error) { 1804 if (!error) {
1782 vma->vm_start = address; 1805 vma->vm_start = address;
1783 vma->vm_pgoff -= grow; 1806 vma->vm_pgoff -= grow;
1807 perf_event_mmap(vma);
1784 } 1808 }
1785 } 1809 }
1786 anon_vma_unlock(vma); 1810 vma_unlock_anon_vma(vma);
1787 return error; 1811 return error;
1788} 1812}
1789 1813
@@ -1900,6 +1924,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1900 unsigned long addr; 1924 unsigned long addr;
1901 1925
1902 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 1926 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1927 vma->vm_prev = NULL;
1903 do { 1928 do {
1904 rb_erase(&vma->vm_rb, &mm->mm_rb); 1929 rb_erase(&vma->vm_rb, &mm->mm_rb);
1905 mm->map_count--; 1930 mm->map_count--;
@@ -1907,6 +1932,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1907 vma = vma->vm_next; 1932 vma = vma->vm_next;
1908 } while (vma && vma->vm_start < end); 1933 } while (vma && vma->vm_start < end);
1909 *insertion_point = vma; 1934 *insertion_point = vma;
1935 if (vma)
1936 vma->vm_prev = prev;
1910 tail_vma->vm_next = NULL; 1937 tail_vma->vm_next = NULL;
1911 if (mm->unmap_area == arch_unmap_area) 1938 if (mm->unmap_area == arch_unmap_area)
1912 addr = prev ? prev->vm_end : mm->mmap_base; 1939 addr = prev ? prev->vm_end : mm->mmap_base;
@@ -1984,6 +2011,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1984 removed_exe_file_vma(mm); 2011 removed_exe_file_vma(mm);
1985 fput(new->vm_file); 2012 fput(new->vm_file);
1986 } 2013 }
2014 unlink_anon_vmas(new);
1987 out_free_mpol: 2015 out_free_mpol:
1988 mpol_put(pol); 2016 mpol_put(pol);
1989 out_free_vma: 2017 out_free_vma:
@@ -2208,6 +2236,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2208 vma->vm_page_prot = vm_get_page_prot(flags); 2236 vma->vm_page_prot = vm_get_page_prot(flags);
2209 vma_link(mm, vma, prev, rb_link, rb_parent); 2237 vma_link(mm, vma, prev, rb_link, rb_parent);
2210out: 2238out:
2239 perf_event_mmap(vma);
2211 mm->total_vm += len >> PAGE_SHIFT; 2240 mm->total_vm += len >> PAGE_SHIFT;
2212 if (flags & VM_LOCKED) { 2241 if (flags & VM_LOCKED) {
2213 if (!mlock_vma_pages_range(vma, addr, addr + len)) 2242 if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -2466,23 +2495,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2466 2495
2467static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2496static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2468{ 2497{
2469 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2498 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2470 /* 2499 /*
2471 * The LSB of head.next can't change from under us 2500 * The LSB of head.next can't change from under us
2472 * because we hold the mm_all_locks_mutex. 2501 * because we hold the mm_all_locks_mutex.
2473 */ 2502 */
2474 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); 2503 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
2475 /* 2504 /*
2476 * We can safely modify head.next after taking the 2505 * We can safely modify head.next after taking the
2477 * anon_vma->lock. If some other vma in this mm shares 2506 * anon_vma->root->lock. If some other vma in this mm shares
2478 * the same anon_vma we won't take it again. 2507 * the same anon_vma we won't take it again.
2479 * 2508 *
2480 * No need of atomic instructions here, head.next 2509 * No need of atomic instructions here, head.next
2481 * can't change from under us thanks to the 2510 * can't change from under us thanks to the
2482 * anon_vma->lock. 2511 * anon_vma->root->lock.
2483 */ 2512 */
2484 if (__test_and_set_bit(0, (unsigned long *) 2513 if (__test_and_set_bit(0, (unsigned long *)
2485 &anon_vma->head.next)) 2514 &anon_vma->root->head.next))
2486 BUG(); 2515 BUG();
2487 } 2516 }
2488} 2517}
@@ -2573,7 +2602,7 @@ out_unlock:
2573 2602
2574static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2603static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2575{ 2604{
2576 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2605 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2577 /* 2606 /*
2578 * The LSB of head.next can't change to 0 from under 2607 * The LSB of head.next can't change to 0 from under
2579 * us because we hold the mm_all_locks_mutex. 2608 * us because we hold the mm_all_locks_mutex.
@@ -2584,12 +2613,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2584 * 2613 *
2585 * No need of atomic instructions here, head.next 2614 * No need of atomic instructions here, head.next
2586 * can't change from under us until we release the 2615 * can't change from under us until we release the
2587 * anon_vma->lock. 2616 * anon_vma->root->lock.
2588 */ 2617 */
2589 if (!__test_and_clear_bit(0, (unsigned long *) 2618 if (!__test_and_clear_bit(0, (unsigned long *)
2590 &anon_vma->head.next)) 2619 &anon_vma->root->head.next))
2591 BUG(); 2620 BUG();
2592 spin_unlock(&anon_vma->lock); 2621 anon_vma_unlock(anon_vma);
2593 } 2622 }
2594} 2623}
2595 2624
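
Several of the mm/mmap.c hunks above (and the mm/nommu.c hunk below) exist only to keep the new vm_prev back pointer consistent whenever a VMA is linked, unlinked, or a run of VMAs is detached during unmap. A hedged userspace model of the same insert/unlink bookkeeping on a doubly linked list (node type invented):

#include <stdio.h>

struct node { int id; struct node *next, *prev; };

/* Insert vma after prev (prev == NULL means insert at the head). */
static void link_after(struct node **head, struct node *prev, struct node *vma)
{
        struct node *next = prev ? prev->next : *head;

        vma->prev = prev;
        vma->next = next;
        if (prev)
                prev->next = vma;
        else
                *head = vma;
        if (next)
                next->prev = vma;
}

static void unlink_node(struct node **head, struct node *vma)
{
        struct node *next = vma->next;

        if (vma->prev)
                vma->prev->next = next;
        else
                *head = next;
        if (next)
                next->prev = vma->prev;
}

int main(void)
{
        struct node a = { 1 }, b = { 2 }, c = { 3 };
        struct node *head = NULL;

        link_after(&head, NULL, &a);
        link_after(&head, &a, &c);
        link_after(&head, &a, &b);      /* a -> b -> c */
        unlink_node(&head, &b);         /* a -> c     */

        for (struct node *n = head; n; n = n->next)
                printf("%d (prev %d)\n", n->id, n->prev ? n->prev->id : 0);
        return 0;
}
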
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d176021..e35bfb82c85 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
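
zone_nr_free_pages() exists because NR_FREE_PAGES is a per-CPU-batched counter: the shared value can lag the truth by up to the batch size times the number of CPUs, and the exact zone_page_state_snapshot() fold-up is paid for only when the cheap value has dropped below the precomputed drift mark and kswapd is not already awake. A hedged userspace model of the drift and the snapshot (all sizes invented):

#include <stdio.h>

#define NR_CPUS 4
#define BATCH   32

static long shared_count;               /* cheap, possibly stale value      */
static int  cpu_delta[NR_CPUS];         /* per-CPU contributions not folded */

static void counter_add(int cpu, int delta)
{
        cpu_delta[cpu] += delta;
        if (cpu_delta[cpu] >= BATCH || cpu_delta[cpu] <= -BATCH) {
                shared_count += cpu_delta[cpu];   /* fold only on overflow */
                cpu_delta[cpu] = 0;
        }
}

static long counter_snapshot(void)
{
        long exact = shared_count;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                exact += cpu_delta[cpu];
        return exact;
}

int main(void)
{
        long drift_mark = NR_CPUS * BATCH;      /* worst-case error */

        counter_add(0, 20);
        counter_add(1, -15);
        counter_add(2, 7);

        printf("approximate: %ld, exact: %ld\n", shared_count, counter_snapshot());
        /* Only pay for the exact walk when the cheap value looks dangerously low. */
        if (shared_count < drift_mark)
                printf("near the watermark: trust only the snapshot (%ld)\n",
                       counter_snapshot());
        return 0;
}
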
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef..563fbdd6293 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
101 * pte locks because exclusive mmap_sem prevents deadlock. 101 * pte locks because exclusive mmap_sem prevents deadlock.
102 */ 102 */
103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); 103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
104 new_pte = pte_offset_map_nested(new_pmd, new_addr); 104 new_pte = pte_offset_map(new_pmd, new_addr);
105 new_ptl = pte_lockptr(mm, new_pmd); 105 new_ptl = pte_lockptr(mm, new_pmd);
106 if (new_ptl != old_ptl) 106 if (new_ptl != old_ptl)
107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 arch_leave_lazy_mmu_mode(); 119 arch_leave_lazy_mmu_mode();
120 if (new_ptl != old_ptl) 120 if (new_ptl != old_ptl)
121 spin_unlock(new_ptl); 121 spin_unlock(new_ptl);
122 pte_unmap_nested(new_pte - 1); 122 pte_unmap(new_pte - 1);
123 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 124 if (mapping)
125 spin_unlock(&mapping->i_mmap_lock); 125 spin_unlock(&mapping->i_mmap_lock);
diff --git a/mm/nommu.c b/mm/nommu.c
index b76f3ee0abe..3613517c759 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
29#include <linux/personality.h> 29#include <linux/personality.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/audit.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -36,11 +37,6 @@
36#include <asm/mmu_context.h> 37#include <asm/mmu_context.h>
37#include "internal.h" 38#include "internal.h"
38 39
39static inline __attribute__((format(printf, 1, 2)))
40void no_printk(const char *fmt, ...)
41{
42}
43
44#if 0 40#if 0
45#define kenter(FMT, ...) \ 41#define kenter(FMT, ...) \
46 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) 42 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -298,11 +294,58 @@ void *vmalloc(unsigned long size)
298} 294}
299EXPORT_SYMBOL(vmalloc); 295EXPORT_SYMBOL(vmalloc);
300 296
297/*
 298 * vzalloc - allocate virtually contiguous memory with zero fill
299 *
300 * @size: allocation size
301 *
302 * Allocate enough pages to cover @size from the page level
 303 * allocator and map them into contiguous kernel virtual space.
304 * The memory allocated is set to zero.
305 *
306 * For tight control over page level allocator and protection flags
307 * use __vmalloc() instead.
308 */
309void *vzalloc(unsigned long size)
310{
311 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
312 PAGE_KERNEL);
313}
314EXPORT_SYMBOL(vzalloc);
315
316/**
317 * vmalloc_node - allocate memory on a specific node
318 * @size: allocation size
319 * @node: numa node
320 *
321 * Allocate enough pages to cover @size from the page level
322 * allocator and map them into contiguous kernel virtual space.
323 *
324 * For tight control over page level allocator and protection flags
325 * use __vmalloc() instead.
326 */
301void *vmalloc_node(unsigned long size, int node) 327void *vmalloc_node(unsigned long size, int node)
302{ 328{
303 return vmalloc(size); 329 return vmalloc(size);
304} 330}
305EXPORT_SYMBOL(vmalloc_node); 331
332/**
333 * vzalloc_node - allocate memory on a specific node with zero fill
334 * @size: allocation size
335 * @node: numa node
336 *
337 * Allocate enough pages to cover @size from the page level
338 * allocator and map them into contiguous kernel virtual space.
339 * The memory allocated is set to zero.
340 *
341 * For tight control over page level allocator and protection flags
342 * use __vmalloc() instead.
343 */
344void *vzalloc_node(unsigned long size, int node)
345{
346 return vzalloc(size);
347}
348EXPORT_SYMBOL(vzalloc_node);
306 349
307#ifndef PAGE_KERNEL_EXEC 350#ifndef PAGE_KERNEL_EXEC
308# define PAGE_KERNEL_EXEC PAGE_KERNEL 351# define PAGE_KERNEL_EXEC PAGE_KERNEL
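
vzalloc() and vzalloc_node() are simply the zero-filling counterparts of vmalloc() and vmalloc_node(); on nommu they collapse to vmalloc() as shown above. The intended payoff is that callers can drop open-coded vmalloc-plus-memset pairs, roughly like this kernel-context fragment (illustrative only, not part of this patch, error handling trimmed):

/* before */
table = vmalloc(size);
if (!table)
        return -ENOMEM;
memset(table, 0, size);

/* after */
table = vzalloc(size);
if (!table)
        return -ENOMEM;
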
@@ -609,7 +652,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
609 */ 652 */
610static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 653static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
611{ 654{
612 struct vm_area_struct *pvma, **pp; 655 struct vm_area_struct *pvma, **pp, *next;
613 struct address_space *mapping; 656 struct address_space *mapping;
614 struct rb_node **p, *parent; 657 struct rb_node **p, *parent;
615 658
@@ -669,8 +712,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
669 break; 712 break;
670 } 713 }
671 714
672 vma->vm_next = *pp; 715 next = *pp;
673 *pp = vma; 716 *pp = vma;
717 vma->vm_next = next;
718 if (next)
719 next->vm_prev = vma;
674} 720}
675 721
676/* 722/*
@@ -1413,6 +1459,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1413 struct file *file = NULL; 1459 struct file *file = NULL;
1414 unsigned long retval = -EBADF; 1460 unsigned long retval = -EBADF;
1415 1461
1462 audit_mmap_fd(fd, flags);
1416 if (!(flags & MAP_ANONYMOUS)) { 1463 if (!(flags & MAP_ANONYMOUS)) {
1417 file = fget(fd); 1464 file = fget(fd);
1418 if (!file) 1465 if (!file)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 709aedfaa01..7dcca55ede7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
4 * Copyright (C) 1998,2000 Rik van Riel 4 * Copyright (C) 1998,2000 Rik van Riel
5 * Thanks go out to Claus Fischer for some serious inspiration and 5 * Thanks go out to Claus Fischer for some serious inspiration and
6 * for goading me into coding this file... 6 * for goading me into coding this file...
7 * Copyright (C) 2010 Google, Inc.
8 * Rewritten by David Rientjes
7 * 9 *
8 * The routines in this file are used to kill a process when 10 * The routines in this file are used to kill a process when
9 * we're seriously out of memory. This gets called from __alloc_pages() 11 * we're seriously out of memory. This gets called from __alloc_pages()
@@ -27,171 +29,194 @@
27#include <linux/module.h> 29#include <linux/module.h>
28#include <linux/notifier.h> 30#include <linux/notifier.h>
29#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h>
30#include <linux/security.h> 33#include <linux/security.h>
31 34
32int sysctl_panic_on_oom; 35int sysctl_panic_on_oom;
33int sysctl_oom_kill_allocating_task; 36int sysctl_oom_kill_allocating_task;
34int sysctl_oom_dump_tasks; 37int sysctl_oom_dump_tasks = 1;
35static DEFINE_SPINLOCK(zone_scan_lock); 38static DEFINE_SPINLOCK(zone_scan_lock);
36/* #define DEBUG */ 39
40#ifdef CONFIG_NUMA
41/**
 42 * has_intersects_mems_allowed() - check task eligibility for kill
43 * @tsk: task struct of which task to consider
44 * @mask: nodemask passed to page allocator for mempolicy ooms
45 *
46 * Task eligibility is determined by whether or not a candidate task, @tsk,
47 * shares the same mempolicy nodes as current if it is bound by such a policy
48 * and whether or not it has the same set of allowed cpuset nodes.
49 */
50static bool has_intersects_mems_allowed(struct task_struct *tsk,
51 const nodemask_t *mask)
52{
53 struct task_struct *start = tsk;
54
55 do {
56 if (mask) {
57 /*
58 * If this is a mempolicy constrained oom, tsk's
59 * cpuset is irrelevant. Only return true if its
60 * mempolicy intersects current, otherwise it may be
61 * needlessly killed.
62 */
63 if (mempolicy_nodemask_intersects(tsk, mask))
64 return true;
65 } else {
66 /*
67 * This is not a mempolicy constrained oom, so only
68 * check the mems of tsk's cpuset.
69 */
70 if (cpuset_mems_allowed_intersects(current, tsk))
71 return true;
72 }
73 } while_each_thread(start, tsk);
74
75 return false;
76}
77#else
78static bool has_intersects_mems_allowed(struct task_struct *tsk,
79 const nodemask_t *mask)
80{
81 return true;
82}
83#endif /* CONFIG_NUMA */
37 84
38/* 85/*
39 * Is all threads of the target process nodes overlap ours? 86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
40 */ 90 */
41static int has_intersects_mems_allowed(struct task_struct *tsk) 91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
42{ 93{
43 struct task_struct *t; 94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with
107 * task_lock() held.
108 */
109struct task_struct *find_lock_task_mm(struct task_struct *p)
110{
111 struct task_struct *t = p;
44 112
45 t = tsk;
46 do { 113 do {
47 if (cpuset_mems_allowed_intersects(current, t)) 114 task_lock(t);
48 return 1; 115 if (likely(t->mm))
49 t = next_thread(t); 116 return t;
50 } while (t != tsk); 117 task_unlock(t);
118 } while_each_thread(p, t);
51 119
52 return 0; 120 return NULL;
121}
122
123/* return true if the task is not adequate as candidate victim task. */
124static bool oom_unkillable_task(struct task_struct *p,
125 const struct mem_cgroup *mem, const nodemask_t *nodemask)
126{
127 if (is_global_init(p))
128 return true;
129 if (p->flags & PF_KTHREAD)
130 return true;
131
132 /* When mem_cgroup_out_of_memory() and p is not member of the group */
133 if (mem && !task_in_mem_cgroup(p, mem))
134 return true;
135
136 /* p may not have freeable memory in nodemask */
137 if (!has_intersects_mems_allowed(p, nodemask))
138 return true;
139
140 return false;
53} 141}
54 142
55/** 143/**
56 * badness - calculate a numeric value for how bad this task has been 144 * oom_badness - heuristic function to determine which candidate task to kill
57 * @p: task struct of which task we should calculate 145 * @p: task struct of which task we should calculate
58 * @uptime: current uptime in seconds 146 * @totalpages: total present RAM allowed for page allocation
59 *
60 * The formula used is relatively simple and documented inline in the
61 * function. The main rationale is that we want to select a good task
62 * to kill when we run out of memory.
63 * 147 *
64 * Good in this context means that: 148 * The heuristic for determining which task to kill is made to be as simple and
65 * 1) we lose the minimum amount of work done 149 * predictable as possible. The goal is to return the highest value for the
66 * 2) we recover a large amount of memory 150 * task consuming the most memory to avoid subsequent oom failures.
67 * 3) we don't kill anything innocent of eating tons of memory
68 * 4) we want to kill the minimum amount of processes (one)
69 * 5) we try to kill the process the user expects us to kill, this
70 * algorithm has been meticulously tuned to meet the principle
71 * of least surprise ... (be careful when you change it)
72 */ 151 */
73 152unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
74unsigned long badness(struct task_struct *p, unsigned long uptime) 153 const nodemask_t *nodemask, unsigned long totalpages)
75{ 154{
76 unsigned long points, cpu_time, run_time; 155 int points;
77 struct mm_struct *mm;
78 struct task_struct *child;
79 int oom_adj = p->signal->oom_adj;
80 struct task_cputime task_time;
81 unsigned long utime;
82 unsigned long stime;
83 156
84 if (oom_adj == OOM_DISABLE) 157 if (oom_unkillable_task(p, mem, nodemask))
85 return 0; 158 return 0;
86 159
87 task_lock(p); 160 p = find_lock_task_mm(p);
88 mm = p->mm; 161 if (!p)
89 if (!mm) {
90 task_unlock(p);
91 return 0; 162 return 0;
92 }
93
94 /*
95 * The memory size of the process is the basis for the badness.
96 */
97 points = mm->total_vm;
98 163
99 /* 164 /*
100 * After this unlock we can no longer dereference local variable `mm' 165 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
166 * so the entire heuristic doesn't need to be executed for something
167 * that cannot be killed.
101 */ 168 */
102 task_unlock(p); 169 if (atomic_read(&p->mm->oom_disable_count)) {
103 170 task_unlock(p);
104 /* 171 return 0;
105 * swapoff can easily use up all memory, so kill those first.
106 */
107 if (p->flags & PF_OOM_ORIGIN)
108 return ULONG_MAX;
109
110 /*
111 * Processes which fork a lot of child processes are likely
112 * a good choice. We add half the vmsize of the children if they
113 * have an own mm. This prevents forking servers to flood the
114 * machine with an endless amount of children. In case a single
115 * child is eating the vast majority of memory, adding only half
116 * to the parents will make the child our kill candidate of choice.
117 */
118 list_for_each_entry(child, &p->children, sibling) {
119 task_lock(child);
120 if (child->mm != mm && child->mm)
121 points += child->mm->total_vm/2 + 1;
122 task_unlock(child);
123 } 172 }
124 173
125 /* 174 /*
126 * CPU time is in tens of seconds and run time is in thousands 175 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
127 * of seconds. There is no particular reason for this other than 176 * priority for oom killing.
128 * that it turned out to work very well in practice.
129 */ 177 */
130 thread_group_cputime(p, &task_time); 178 if (p->flags & PF_OOM_ORIGIN) {
131 utime = cputime_to_jiffies(task_time.utime); 179 task_unlock(p);
132 stime = cputime_to_jiffies(task_time.stime); 180 return 1000;
133 cpu_time = (utime + stime) >> (SHIFT_HZ + 3); 181 }
134
135
136 if (uptime >= p->start_time.tv_sec)
137 run_time = (uptime - p->start_time.tv_sec) >> 10;
138 else
139 run_time = 0;
140
141 if (cpu_time)
142 points /= int_sqrt(cpu_time);
143 if (run_time)
144 points /= int_sqrt(int_sqrt(run_time));
145 182
146 /* 183 /*
147 * Niced processes are most likely less important, so double 184 * The memory controller may have a limit of 0 bytes, so avoid a divide
148 * their badness points. 185 * by zero, if necessary.
149 */ 186 */
150 if (task_nice(p) > 0) 187 if (!totalpages)
151 points *= 2; 188 totalpages = 1;
152 189
153 /* 190 /*
154 * Superuser processes are usually more important, so we make it 191 * The baseline for the badness score is the proportion of RAM that each
155 * less likely that we kill those. 192 * task's rss and swap space use.
156 */ 193 */
157 if (has_capability_noaudit(p, CAP_SYS_ADMIN) || 194 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
158 has_capability_noaudit(p, CAP_SYS_RESOURCE)) 195 totalpages;
159 points /= 4; 196 task_unlock(p);
160 197
161 /* 198 /*
162 * We don't want to kill a process with direct hardware access. 199 * Root processes get 3% bonus, just like the __vm_enough_memory()
163 * Not only could that mess up the hardware, but usually users 200 * implementation used by LSMs.
164 * tend to only have this flag set on applications they think
165 * of as important.
166 */ 201 */
167 if (has_capability_noaudit(p, CAP_SYS_RAWIO)) 202 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
168 points /= 4; 203 points -= 30;
169 204
170 /* 205 /*
171 * If p's nodes don't overlap ours, it may still help to kill p 206 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
172 * because p may have allocated or otherwise mapped memory on 207 * either completely disable oom killing or always prefer a certain
173 * this node before. However it will be less likely. 208 * task.
174 */ 209 */
175 if (!has_intersects_mems_allowed(p)) 210 points += p->signal->oom_score_adj;
176 points /= 8;
177 211
178 /* 212 /*
179 * Adjust the score by oom_adj. 213 * Never return 0 for an eligible task that may be killed since it's
214 * possible that no single user task uses more than 0.1% of memory and
215 * no single admin tasks uses more than 3.0%.
180 */ 216 */
181 if (oom_adj) { 217 if (points <= 0)
182 if (oom_adj > 0) { 218 return 1;
183 if (!points) 219 return (points < 1000) ? points : 1000;
184 points = 1;
185 points <<= oom_adj;
186 } else
187 points >>= -(oom_adj);
188 }
189
190#ifdef DEBUG
191 printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
192 p->pid, p->comm, points);
193#endif
194 return points;
195} 220}
196 221
197/* 222/*
@@ -199,12 +224,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
199 */ 224 */
200#ifdef CONFIG_NUMA 225#ifdef CONFIG_NUMA
201static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 226static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
202 gfp_t gfp_mask, nodemask_t *nodemask) 227 gfp_t gfp_mask, nodemask_t *nodemask,
228 unsigned long *totalpages)
203{ 229{
204 struct zone *zone; 230 struct zone *zone;
205 struct zoneref *z; 231 struct zoneref *z;
206 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 232 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
233 bool cpuset_limited = false;
234 int nid;
207 235
236 /* Default to all available memory */
237 *totalpages = totalram_pages + total_swap_pages;
238
239 if (!zonelist)
240 return CONSTRAINT_NONE;
208 /* 241 /*
209 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 242 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
210 * to kill current. We have to kill a random task in this case. 243 * to kill current. We have to kill a random task in this case.
@@ -214,26 +247,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
214 return CONSTRAINT_NONE; 247 return CONSTRAINT_NONE;
215 248
216 /* 249 /*
217 * The nodemask here is a nodemask passed to alloc_pages(). Now, 250 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
218 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy 251 * the page allocator means a mempolicy is in effect. Cpuset policy
219 * feature. mempolicy is an only user of nodemask here. 252 * is enforced in get_page_from_freelist().
220 * check mempolicy's nodemask contains all N_HIGH_MEMORY
221 */ 253 */
222 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) 254 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
255 *totalpages = total_swap_pages;
256 for_each_node_mask(nid, *nodemask)
257 *totalpages += node_spanned_pages(nid);
223 return CONSTRAINT_MEMORY_POLICY; 258 return CONSTRAINT_MEMORY_POLICY;
259 }
224 260
225 /* Check this allocation failure is caused by cpuset's wall function */ 261 /* Check this allocation failure is caused by cpuset's wall function */
226 for_each_zone_zonelist_nodemask(zone, z, zonelist, 262 for_each_zone_zonelist_nodemask(zone, z, zonelist,
227 high_zoneidx, nodemask) 263 high_zoneidx, nodemask)
228 if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) 264 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
229 return CONSTRAINT_CPUSET; 265 cpuset_limited = true;
230 266
267 if (cpuset_limited) {
268 *totalpages = total_swap_pages;
269 for_each_node_mask(nid, cpuset_current_mems_allowed)
270 *totalpages += node_spanned_pages(nid);
271 return CONSTRAINT_CPUSET;
272 }
231 return CONSTRAINT_NONE; 273 return CONSTRAINT_NONE;
232} 274}
233#else 275#else
234static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 276static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
235 gfp_t gfp_mask, nodemask_t *nodemask) 277 gfp_t gfp_mask, nodemask_t *nodemask,
278 unsigned long *totalpages)
236{ 279{
280 *totalpages = totalram_pages + total_swap_pages;
237 return CONSTRAINT_NONE; 281 return CONSTRAINT_NONE;
238} 282}
239#endif 283#endif
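
Besides classifying the constraint, constrained_alloc() now also reports the amount of memory the badness scores should be normalized against: all of RAM plus swap by default, or swap plus the spanned pages of the allowed nodes when a mempolicy or cpuset restricts the allocation. A rough sketch of that accounting, using a made-up node table in place of node_spanned_pages() and real nodemasks:

/*
 * Sketch of the totalpages accounting; the node sizes and the allowed[]
 * mask are toy values, not kernel data structures.
 */
#include <stdio.h>

#define NR_NODES 4

static const unsigned long node_pages[NR_NODES] = {
	1 << 18, 1 << 18, 1 << 17, 1 << 17
};

static unsigned long total_swap_pages = 1 << 16;

/* allowed[i] != 0 means node i may be used; NULL means no restriction */
static unsigned long constrained_total(const int *allowed,
				       unsigned long totalram)
{
	unsigned long total;
	int nid;

	if (!allowed)				/* CONSTRAINT_NONE */
		return totalram + total_swap_pages;

	total = total_swap_pages;		/* policy/cpuset constrained */
	for (nid = 0; nid < NR_NODES; nid++)
		if (allowed[nid])
			total += node_pages[nid];
	return total;
}

int main(void)
{
	int mask[NR_NODES] = { 1, 0, 0, 1 };
	unsigned long totalram = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		totalram += node_pages[nid];

	printf("unconstrained: %lu pages\n", constrained_total(NULL, totalram));
	printf("constrained:   %lu pages\n", constrained_total(mask, totalram));
	return 0;
}
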
@@ -244,28 +288,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
244 * 288 *
245 * (not docbooked, we don't want this one cluttering up the manual) 289 * (not docbooked, we don't want this one cluttering up the manual)
246 */ 290 */
247static struct task_struct *select_bad_process(unsigned long *ppoints, 291static struct task_struct *select_bad_process(unsigned int *ppoints,
248 struct mem_cgroup *mem) 292 unsigned long totalpages, struct mem_cgroup *mem,
293 const nodemask_t *nodemask)
249{ 294{
250 struct task_struct *p; 295 struct task_struct *p;
251 struct task_struct *chosen = NULL; 296 struct task_struct *chosen = NULL;
252 struct timespec uptime;
253 *ppoints = 0; 297 *ppoints = 0;
254 298
255 do_posix_clock_monotonic_gettime(&uptime);
256 for_each_process(p) { 299 for_each_process(p) {
257 unsigned long points; 300 unsigned int points;
258 301
259 /* 302 if (oom_unkillable_task(p, mem, nodemask))
260 * skip kernel threads and tasks which have already released
261 * their mm.
262 */
263 if (!p->mm)
264 continue;
265 /* skip the init task */
266 if (is_global_init(p))
267 continue;
268 if (mem && !task_in_mem_cgroup(p, mem))
269 continue; 303 continue;
270 304
271 /* 305 /*
@@ -290,19 +324,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
290 * the process of exiting and releasing its resources. 324 * the process of exiting and releasing its resources.
291 * Otherwise we could get an easy OOM deadlock. 325 * Otherwise we could get an easy OOM deadlock.
292 */ 326 */
293 if (p->flags & PF_EXITING) { 327 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
294 if (p != current) 328 if (p != current)
295 return ERR_PTR(-1UL); 329 return ERR_PTR(-1UL);
296 330
297 chosen = p; 331 chosen = p;
298 *ppoints = ULONG_MAX; 332 *ppoints = 1000;
299 } 333 }
300 334
301 if (p->signal->oom_adj == OOM_DISABLE) 335 points = oom_badness(p, mem, nodemask, totalpages);
302 continue; 336 if (points > *ppoints) {
303
304 points = badness(p, uptime.tv_sec);
305 if (points > *ppoints || !chosen) {
306 chosen = p; 337 chosen = p;
307 *ppoints = points; 338 *ppoints = points;
308 } 339 }
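
The scan above now filters with oom_unkillable_task(), short-circuits on a task that is already exiting, and keeps the highest oom_badness() score. The sketch below mirrors that shape over a toy task array with precomputed scores; it folds the PF_EXITING case into a simple flag and omits the ERR_PTR(-1UL) path used when the exiting task is not current.

/*
 * Sketch of the victim-selection scan over a toy task list; the flags stand
 * in for oom_unkillable_task() and PF_EXITING handling.
 */
#include <stdio.h>

struct toy_task {
	const char *comm;
	unsigned int points;	/* precomputed badness, 0..1000 */
	int unkillable;
	int exiting;
};

static const struct toy_task *select_victim(const struct toy_task *tasks,
					    int n, unsigned int *ppoints)
{
	const struct toy_task *chosen = NULL;
	int i;

	*ppoints = 0;
	for (i = 0; i < n; i++) {
		const struct toy_task *p = &tasks[i];

		if (p->unkillable)
			continue;
		if (p->exiting) {	/* already dying: just let it finish */
			*ppoints = 1000;
			return p;
		}
		if (p->points > *ppoints) {
			chosen = p;
			*ppoints = p->points;
		}
	}
	return chosen;
}

int main(void)
{
	struct toy_task tasks[] = {
		{ "init",    10, 1, 0 },
		{ "backup", 400, 0, 0 },
		{ "dbms",   750, 0, 0 },
	};
	unsigned int points;
	const struct toy_task *victim = select_victim(tasks, 3, &points);

	if (victim)
		printf("victim: %s (%u)\n", victim->comm, points);
	return 0;
}
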
@@ -313,176 +344,208 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
313 344
314/** 345/**
315 * dump_tasks - dump current memory state of all system tasks 346 * dump_tasks - dump current memory state of all system tasks
316 * @mem: target memory controller 347 * @mem: current's memory controller, if constrained
348 * @nodemask: nodemask passed to page allocator for mempolicy ooms
317 * 349 *
318 * Dumps the current memory state of all system tasks, excluding kernel threads. 350 * Dumps the current memory state of all eligible tasks. Tasks not in the same
351 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
352 * are not shown.
319 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 353 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
320 * score, and name. 354 * value, oom_score_adj value, and name.
321 *
322 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
323 * shown.
324 * 355 *
325 * Call with tasklist_lock read-locked. 356 * Call with tasklist_lock read-locked.
326 */ 357 */
327static void dump_tasks(const struct mem_cgroup *mem) 358static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
328{ 359{
329 struct task_struct *g, *p; 360 struct task_struct *p;
330 361 struct task_struct *task;
331 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
332 "name\n");
333 do_each_thread(g, p) {
334 struct mm_struct *mm;
335 362
336 if (mem && !task_in_mem_cgroup(p, mem)) 363 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
337 continue; 364 for_each_process(p) {
338 if (!thread_group_leader(p)) 365 if (oom_unkillable_task(p, mem, nodemask))
339 continue; 366 continue;
340 367
341 task_lock(p); 368 task = find_lock_task_mm(p);
342 mm = p->mm; 369 if (!task) {
343 if (!mm) {
344 /* 370 /*
345 * total_vm and rss sizes do not exist for tasks with no 371 * This is a kthread or all of p's threads have already
346 * mm so there's no need to report them; they can't be 372 * detached their mm's. There's no need to report
347 * oom killed anyway. 373 * them; they can't be oom killed anyway.
348 */ 374 */
349 task_unlock(p);
350 continue; 375 continue;
351 } 376 }
352 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 377
353 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 378 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
354 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, 379 task->pid, task_uid(task), task->tgid,
355 p->comm); 380 task->mm->total_vm, get_mm_rss(task->mm),
356 task_unlock(p); 381 task_cpu(task), task->signal->oom_adj,
357 } while_each_thread(g, p); 382 task->signal->oom_score_adj, task->comm);
383 task_unlock(task);
384 }
358} 385}
359 386
360static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 387static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
361 struct mem_cgroup *mem) 388 struct mem_cgroup *mem, const nodemask_t *nodemask)
362{ 389{
363 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
364 "oom_adj=%d\n",
365 current->comm, gfp_mask, order, current->signal->oom_adj);
366 task_lock(current); 390 task_lock(current);
391 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
392 "oom_adj=%d, oom_score_adj=%d\n",
393 current->comm, gfp_mask, order, current->signal->oom_adj,
394 current->signal->oom_score_adj);
367 cpuset_print_task_mems_allowed(current); 395 cpuset_print_task_mems_allowed(current);
368 task_unlock(current); 396 task_unlock(current);
369 dump_stack(); 397 dump_stack();
370 mem_cgroup_print_oom_info(mem, p); 398 mem_cgroup_print_oom_info(mem, p);
371 show_mem(); 399 show_mem();
372 if (sysctl_oom_dump_tasks) 400 if (sysctl_oom_dump_tasks)
373 dump_tasks(mem); 401 dump_tasks(mem, nodemask);
374} 402}
375 403
376#define K(x) ((x) << (PAGE_SHIFT-10)) 404#define K(x) ((x) << (PAGE_SHIFT-10))
377 405static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
378/*
379 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
380 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
381 * set.
382 */
383static void __oom_kill_task(struct task_struct *p, int verbose)
384{ 406{
385 if (is_global_init(p)) { 407 struct task_struct *q;
386 WARN_ON(1); 408 struct mm_struct *mm;
387 printk(KERN_WARNING "tried to kill init!\n");
388 return;
389 }
390 409
391 task_lock(p); 410 p = find_lock_task_mm(p);
392 if (!p->mm) { 411 if (!p)
393 WARN_ON(1); 412 return 1;
394 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", 413
395 task_pid_nr(p), p->comm); 414 /* mm cannot be safely dereferenced after task_unlock(p) */
396 task_unlock(p); 415 mm = p->mm;
397 return;
398 }
399 416
400 if (verbose) 417 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
401 printk(KERN_ERR "Killed process %d (%s) " 418 task_pid_nr(p), p->comm, K(p->mm->total_vm),
402 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 419 K(get_mm_counter(p->mm, MM_ANONPAGES)),
403 task_pid_nr(p), p->comm, 420 K(get_mm_counter(p->mm, MM_FILEPAGES)));
404 K(p->mm->total_vm),
405 K(get_mm_counter(p->mm, MM_ANONPAGES)),
406 K(get_mm_counter(p->mm, MM_FILEPAGES)));
407 task_unlock(p); 421 task_unlock(p);
408 422
409 /* 423 /*
410 * We give our sacrificial lamb high priority and access to 424 * Kill all processes sharing p->mm in other thread groups, if any.
411 * all the memory it needs. That way it should be able to 425 * They don't get access to memory reserves or a higher scheduler
412 * exit() and clear out its resources quickly... 426 * priority, though, to avoid depletion of all memory or task
427 * starvation. This prevents mm->mmap_sem livelock when an oom killed
428 * task cannot exit because it requires the semaphore and its contended
429 * by another thread trying to allocate memory itself. That thread will
430 * now get access to memory reserves since it has a pending fatal
431 * signal.
413 */ 432 */
414 p->rt.time_slice = HZ; 433 for_each_process(q)
415 set_tsk_thread_flag(p, TIF_MEMDIE); 434 if (q->mm == mm && !same_thread_group(q, p)) {
435 task_lock(q); /* Protect ->comm from prctl() */
436 pr_err("Kill process %d (%s) sharing same memory\n",
437 task_pid_nr(q), q->comm);
438 task_unlock(q);
439 force_sig(SIGKILL, q);
440 }
416 441
442 set_tsk_thread_flag(p, TIF_MEMDIE);
417 force_sig(SIGKILL, p); 443 force_sig(SIGKILL, p);
418}
419 444
420static int oom_kill_task(struct task_struct *p) 445 /*
421{ 446 * We give our sacrificial lamb high priority and access to
422 /* WARNING: mm may not be dereferenced since we did not obtain its 447 * all the memory it needs. That way it should be able to
423 * value from get_task_mm(p). This is OK since all we need to do is 448 * exit() and clear out its resources quickly...
424 * compare mm to q->mm below.
425 *
426 * Furthermore, even if mm contains a non-NULL value, p->mm may
427 * change to NULL at any time since we do not hold task_lock(p).
428 * However, this is of no concern to us.
429 */ 449 */
430 if (!p->mm || p->signal->oom_adj == OOM_DISABLE) 450 boost_dying_task_prio(p, mem);
431 return 1;
432
433 __oom_kill_task(p, 1);
434 451
435 return 0; 452 return 0;
436} 453}
454#undef K
437 455
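
oom_kill_task() now also signals every process in another thread group that shares the victim's mm, so that none of them can hold mm->mmap_sem and stall the victim's exit. A toy illustration of that walk, with printf standing in for force_sig(SIGKILL, ...) and plain structs in place of task_struct and mm_struct:

/*
 * Sketch of the "kill every process sharing the victim's mm" walk over a
 * toy process table; pointer comparison on ->mm mirrors the q->mm == mm test.
 */
#include <stdio.h>

struct toy_mm { int id; };

struct toy_proc {
	int pid;
	int tgid;			/* thread-group id */
	const struct toy_mm *mm;	/* NULL for kernel threads */
};

static void kill_sharers(const struct toy_proc *victim,
			 const struct toy_proc *tab, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		const struct toy_proc *q = &tab[i];

		/* same address space, different thread group */
		if (q->mm == victim->mm && q->tgid != victim->tgid)
			printf("Kill process %d sharing same memory\n", q->pid);
	}
	printf("Kill victim %d\n", victim->pid);
}

int main(void)
{
	static const struct toy_mm mm_a = { 1 }, mm_b = { 2 };
	static const struct toy_proc tab[] = {
		{ 100, 100, &mm_a },	/* victim */
		{ 101, 100, &mm_a },	/* same thread group: skipped */
		{ 200, 200, &mm_a },	/* clone(CLONE_VM) sharer: killed */
		{ 300, 300, &mm_b },
	};

	kill_sharers(&tab[0], tab, 4);
	return 0;
}
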
438static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 456static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
439 unsigned long points, struct mem_cgroup *mem, 457 unsigned int points, unsigned long totalpages,
458 struct mem_cgroup *mem, nodemask_t *nodemask,
440 const char *message) 459 const char *message)
441{ 460{
442 struct task_struct *c; 461 struct task_struct *victim = p;
462 struct task_struct *child;
463 struct task_struct *t = p;
464 unsigned int victim_points = 0;
443 465
444 if (printk_ratelimit()) 466 if (printk_ratelimit())
445 dump_header(p, gfp_mask, order, mem); 467 dump_header(p, gfp_mask, order, mem, nodemask);
446 468
447 /* 469 /*
448 * If the task is already exiting, don't alarm the sysadmin or kill 470 * If the task is already exiting, don't alarm the sysadmin or kill
449 * its children or threads, just set TIF_MEMDIE so it can die quickly 471 * its children or threads, just set TIF_MEMDIE so it can die quickly
450 */ 472 */
451 if (p->flags & PF_EXITING) { 473 if (p->flags & PF_EXITING) {
452 __oom_kill_task(p, 0); 474 set_tsk_thread_flag(p, TIF_MEMDIE);
475 boost_dying_task_prio(p, mem);
453 return 0; 476 return 0;
454 } 477 }
455 478
456 printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", 479 task_lock(p);
457 message, task_pid_nr(p), p->comm, points); 480 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
481 message, task_pid_nr(p), p->comm, points);
482 task_unlock(p);
483
484 /*
485 * If any of p's children has a different mm and is eligible for kill,
486 * the one with the highest badness() score is sacrificed for its
487 * parent. This attempts to lose the minimal amount of work done while
488 * still freeing memory.
489 */
490 do {
491 list_for_each_entry(child, &t->children, sibling) {
492 unsigned int child_points;
458 493
459 /* Try to kill a child first */ 494 /*
460 list_for_each_entry(c, &p->children, sibling) { 495 * oom_badness() returns 0 if the thread is unkillable
461 if (c->mm == p->mm) 496 */
462 continue; 497 child_points = oom_badness(child, mem, nodemask,
463 if (mem && !task_in_mem_cgroup(c, mem)) 498 totalpages);
464 continue; 499 if (child_points > victim_points) {
465 if (!oom_kill_task(c)) 500 victim = child;
466 return 0; 501 victim_points = child_points;
502 }
503 }
504 } while_each_thread(p, t);
505
506 return oom_kill_task(victim, mem);
507}
508
509/*
510 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
511 */
512static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
513 int order, const nodemask_t *nodemask)
514{
515 if (likely(!sysctl_panic_on_oom))
516 return;
517 if (sysctl_panic_on_oom != 2) {
518 /*
519 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
520 * does not panic for cpuset, mempolicy, or memcg allocation
521 * failures.
522 */
523 if (constraint != CONSTRAINT_NONE)
524 return;
467 } 525 }
468 return oom_kill_task(p); 526 read_lock(&tasklist_lock);
527 dump_header(NULL, gfp_mask, order, NULL, nodemask);
528 read_unlock(&tasklist_lock);
529 panic("Out of memory: %s panic_on_oom is enabled\n",
530 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
469} 531}
470 532
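
check_panic_on_oom() centralizes the sysctl handling: panic_on_oom == 1 only panics for unconstrained, system-wide OOMs, while 2 panics unconditionally. A compact sketch of that decision table, with a plain int for the sysctl and a local enum mirroring the constraint names:

/*
 * Sketch of the panic_on_oom decision table; the constraint enum is local
 * to this example, not the kernel's.
 */
#include <stdio.h>

enum constraint { NONE, CPUSET, MEMORY_POLICY, MEMCG };

static int should_panic(int sysctl_panic_on_oom, enum constraint c)
{
	if (!sysctl_panic_on_oom)
		return 0;		/* default: kill a task instead */
	if (sysctl_panic_on_oom != 2 && c != NONE)
		return 0;		/* mode 1 tolerates constrained ooms */
	return 1;			/* mode 2 always panics */
}

int main(void)
{
	printf("%d %d %d\n",
	       should_panic(1, NONE),		/* 1: system-wide oom */
	       should_panic(1, CPUSET),		/* 0: constrained, tolerated */
	       should_panic(2, MEMCG));		/* 1: compulsory */
	return 0;
}
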
471#ifdef CONFIG_CGROUP_MEM_RES_CTLR 533#ifdef CONFIG_CGROUP_MEM_RES_CTLR
472void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 534void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473{ 535{
474 unsigned long points = 0; 536 unsigned long limit;
537 unsigned int points = 0;
475 struct task_struct *p; 538 struct task_struct *p;
476 539
477 if (sysctl_panic_on_oom == 2) 540 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
478 panic("out of memory(memcg). panic_on_oom is selected.\n"); 541 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
479 read_lock(&tasklist_lock); 542 read_lock(&tasklist_lock);
480retry: 543retry:
481 p = select_bad_process(&points, mem); 544 p = select_bad_process(&points, limit, mem, NULL);
482 if (!p || PTR_ERR(p) == -1UL) 545 if (!p || PTR_ERR(p) == -1UL)
483 goto out; 546 goto out;
484 547
485 if (oom_kill_process(p, gfp_mask, 0, points, mem, 548 if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
486 "Memory cgroup out of memory")) 549 "Memory cgroup out of memory"))
487 goto retry; 550 goto retry;
488out: 551out:
@@ -509,7 +572,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
509 * if a parallel OOM killing is already taking place that includes a zone in 572 * if a parallel OOM killing is already taking place that includes a zone in
510 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 573 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
511 */ 574 */
512int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) 575int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
513{ 576{
514 struct zoneref *z; 577 struct zoneref *z;
515 struct zone *zone; 578 struct zone *zone;
@@ -526,7 +589,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
526 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 589 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
527 /* 590 /*
528 * Lock each zone in the zonelist under zone_scan_lock so a 591 * Lock each zone in the zonelist under zone_scan_lock so a
529 * parallel invocation of try_set_zone_oom() doesn't succeed 592 * parallel invocation of try_set_zonelist_oom() doesn't succeed
530 * when it shouldn't. 593 * when it shouldn't.
531 */ 594 */
532 zone_set_flag(zone, ZONE_OOM_LOCKED); 595 zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -555,65 +618,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
555} 618}
556 619
557/* 620/*
558 * Must be called with tasklist_lock held for read. 621 * Try to acquire the oom killer lock for all system zones. Returns zero if a
622 * parallel oom killing is taking place, otherwise locks all zones and returns
623 * non-zero.
559 */ 624 */
560static void __out_of_memory(gfp_t gfp_mask, int order) 625static int try_set_system_oom(void)
561{ 626{
562 struct task_struct *p; 627 struct zone *zone;
563 unsigned long points; 628 int ret = 1;
564
565 if (sysctl_oom_kill_allocating_task)
566 if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
567 "Out of memory (oom_kill_allocating_task)"))
568 return;
569retry:
570 /*
571 * Rambo mode: Shoot down a process and hope it solves whatever
572 * issues we may have.
573 */
574 p = select_bad_process(&points, NULL);
575
576 if (PTR_ERR(p) == -1UL)
577 return;
578
579 /* Found nothing?!?! Either we hang forever, or we panic. */
580 if (!p) {
581 read_unlock(&tasklist_lock);
582 dump_header(NULL, gfp_mask, order, NULL);
583 panic("Out of memory and no killable processes...\n");
584 }
585 629
586 if (oom_kill_process(p, gfp_mask, order, points, NULL, 630 spin_lock(&zone_scan_lock);
587 "Out of memory")) 631 for_each_populated_zone(zone)
588 goto retry; 632 if (zone_is_oom_locked(zone)) {
633 ret = 0;
634 goto out;
635 }
636 for_each_populated_zone(zone)
637 zone_set_flag(zone, ZONE_OOM_LOCKED);
638out:
639 spin_unlock(&zone_scan_lock);
640 return ret;
589} 641}
590 642
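
try_set_system_oom() and clear_system_oom() give pagefault_out_of_memory() the same all-or-nothing locking the zonelist variant uses: take zone_scan_lock, bail out if any zone is already marked, otherwise mark them all. A userspace sketch of that protocol, with a pthread mutex standing in for zone_scan_lock and an int per toy zone for ZONE_OOM_LOCKED:

/*
 * Sketch of the all-or-nothing ZONE_OOM_LOCKED protocol; the zone count and
 * flag representation are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_ZONES 3

static pthread_mutex_t scan_lock = PTHREAD_MUTEX_INITIALIZER;
static int oom_locked[NR_ZONES];

static int try_set_system_oom(void)
{
	int i, ret = 1;

	pthread_mutex_lock(&scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		if (oom_locked[i]) {
			ret = 0;	/* someone else is already handling it */
			goto out;
		}
	for (i = 0; i < NR_ZONES; i++)
		oom_locked[i] = 1;
out:
	pthread_mutex_unlock(&scan_lock);
	return ret;
}

static void clear_system_oom(void)
{
	int i;

	pthread_mutex_lock(&scan_lock);
	for (i = 0; i < NR_ZONES; i++)
		oom_locked[i] = 0;
	pthread_mutex_unlock(&scan_lock);
}

int main(void)
{
	printf("first:  %d\n", try_set_system_oom());	/* 1 */
	printf("second: %d\n", try_set_system_oom());	/* 0: already locked */
	clear_system_oom();
	printf("third:  %d\n", try_set_system_oom());	/* 1 again */
	return 0;
}
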
591/* 643/*
592 * pagefault handler calls into here because it is out of memory but 644 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
593 * doesn't know exactly how or why. 645 * attempts or page faults may now recall the oom killer, if necessary.
594 */ 646 */
595void pagefault_out_of_memory(void) 647static void clear_system_oom(void)
596{ 648{
597 unsigned long freed = 0; 649 struct zone *zone;
598
599 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
600 if (freed > 0)
601 /* Got some memory back in the last second. */
602 return;
603
604 if (sysctl_panic_on_oom)
605 panic("out of memory from page fault. panic_on_oom is selected.\n");
606
607 read_lock(&tasklist_lock);
608 __out_of_memory(0, 0); /* unknown gfp_mask and order */
609 read_unlock(&tasklist_lock);
610 650
611 /* 651 spin_lock(&zone_scan_lock);
612 * Give "p" a good chance of killing itself before we 652 for_each_populated_zone(zone)
613 * retry to allocate memory. 653 zone_clear_flag(zone, ZONE_OOM_LOCKED);
614 */ 654 spin_unlock(&zone_scan_lock);
615 if (!test_thread_flag(TIF_MEMDIE))
616 schedule_timeout_uninterruptible(1);
617} 655}
618 656
619/** 657/**
@@ -621,6 +659,7 @@ void pagefault_out_of_memory(void)
621 * @zonelist: zonelist pointer 659 * @zonelist: zonelist pointer
622 * @gfp_mask: memory allocation flags 660 * @gfp_mask: memory allocation flags
623 * @order: amount of memory being requested as a power of 2 661 * @order: amount of memory being requested as a power of 2
662 * @nodemask: nodemask passed to page allocator
624 * 663 *
625 * If we run out of memory, we have the choice between either 664 * If we run out of memory, we have the choice between either
626 * killing a random task (bad), letting the system crash (worse) 665 * killing a random task (bad), letting the system crash (worse)
@@ -630,49 +669,93 @@ void pagefault_out_of_memory(void)
630void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 669void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
631 int order, nodemask_t *nodemask) 670 int order, nodemask_t *nodemask)
632{ 671{
672 const nodemask_t *mpol_mask;
673 struct task_struct *p;
674 unsigned long totalpages;
633 unsigned long freed = 0; 675 unsigned long freed = 0;
634 enum oom_constraint constraint; 676 unsigned int points;
677 enum oom_constraint constraint = CONSTRAINT_NONE;
678 int killed = 0;
635 679
636 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 680 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
637 if (freed > 0) 681 if (freed > 0)
638 /* Got some memory back in the last second. */ 682 /* Got some memory back in the last second. */
639 return; 683 return;
640 684
641 if (sysctl_panic_on_oom == 2) { 685 /*
642 dump_header(NULL, gfp_mask, order, NULL); 686 * If current has a pending SIGKILL, then automatically select it. The
643 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 687 * goal is to allow it to allocate so that it may quickly exit and free
688 * its memory.
689 */
690 if (fatal_signal_pending(current)) {
691 set_thread_flag(TIF_MEMDIE);
692 boost_dying_task_prio(current, NULL);
693 return;
644 } 694 }
645 695
646 /* 696 /*
647 * Check if there were limitations on the allocation (only relevant for 697 * Check if there were limitations on the allocation (only relevant for
648 * NUMA) that may require different handling. 698 * NUMA) that may require different handling.
649 */ 699 */
650 constraint = constrained_alloc(zonelist, gfp_mask, nodemask); 700 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
701 &totalpages);
702 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
703 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
704
651 read_lock(&tasklist_lock); 705 read_lock(&tasklist_lock);
706 if (sysctl_oom_kill_allocating_task &&
707 !oom_unkillable_task(current, NULL, nodemask) &&
708 current->mm && !atomic_read(&current->mm->oom_disable_count)) {
709 /*
710 * oom_kill_process() needs tasklist_lock held. If it returns
711 * non-zero, current could not be killed so we must fallback to
712 * the tasklist scan.
713 */
714 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
715 NULL, nodemask,
716 "Out of memory (oom_kill_allocating_task)"))
717 goto out;
718 }
652 719
653 switch (constraint) { 720retry:
654 case CONSTRAINT_MEMORY_POLICY: 721 p = select_bad_process(&points, totalpages, NULL, mpol_mask);
655 oom_kill_process(current, gfp_mask, order, 0, NULL, 722 if (PTR_ERR(p) == -1UL)
656 "No available memory (MPOL_BIND)"); 723 goto out;
657 break;
658 724
659 case CONSTRAINT_NONE: 725 /* Found nothing?!?! Either we hang forever, or we panic. */
660 if (sysctl_panic_on_oom) { 726 if (!p) {
661 dump_header(NULL, gfp_mask, order, NULL); 727 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
662 panic("out of memory. panic_on_oom is selected\n"); 728 read_unlock(&tasklist_lock);
663 } 729 panic("Out of memory and no killable processes...\n");
664 /* Fall-through */
665 case CONSTRAINT_CPUSET:
666 __out_of_memory(gfp_mask, order);
667 break;
668 } 730 }
669 731
732 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
733 nodemask, "Out of memory"))
734 goto retry;
735 killed = 1;
736out:
670 read_unlock(&tasklist_lock); 737 read_unlock(&tasklist_lock);
671 738
672 /* 739 /*
673 * Give "p" a good chance of killing itself before we 740 * Give "p" a good chance of killing itself before we
674 * retry to allocate memory unless "p" is current 741 * retry to allocate memory unless "p" is current
675 */ 742 */
743 if (killed && !test_thread_flag(TIF_MEMDIE))
744 schedule_timeout_uninterruptible(1);
745}
746
747/*
748 * The pagefault handler calls here because it is out of memory, so kill a
749 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
750 * oom killing is already in progress so do nothing. If a task is found with
751 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
752 */
753void pagefault_out_of_memory(void)
754{
755 if (try_set_system_oom()) {
756 out_of_memory(NULL, 0, 0, NULL);
757 clear_system_oom();
758 }
676 if (!test_thread_flag(TIF_MEMDIE)) 759 if (!test_thread_flag(TIF_MEMDIE))
677 schedule_timeout_uninterruptible(1); 760 schedule_timeout_uninterruptible(1);
678} 761}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef6154..b840afa8976 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> 35#include <linux/buffer_head.h>
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <trace/events/writeback.h>
37 38
38/* 39/*
39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
252 } 253 }
253} 254}
254 255
255/*
256 * Clip the earned share of dirty pages to that which is actually available.
257 * This avoids exceeding the total dirty_limit when the floating averages
258 * fluctuate too quickly.
259 */
260static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
261 unsigned long dirty, unsigned long *pbdi_dirty)
262{
263 unsigned long avail_dirty;
264
265 avail_dirty = global_page_state(NR_FILE_DIRTY) +
266 global_page_state(NR_WRITEBACK) +
267 global_page_state(NR_UNSTABLE_NFS) +
268 global_page_state(NR_WRITEBACK_TEMP);
269
270 if (avail_dirty < dirty)
271 avail_dirty = dirty - avail_dirty;
272 else
273 avail_dirty = 0;
274
275 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
276 bdi_stat(bdi, BDI_WRITEBACK);
277
278 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
279}
280
281static inline void task_dirties_fraction(struct task_struct *tsk, 256static inline void task_dirties_fraction(struct task_struct *tsk,
282 long *numerator, long *denominator) 257 long *numerator, long *denominator)
283{ 258{
@@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
286} 261}
287 262
288/* 263/*
289 * scale the dirty limit 264 * task_dirty_limit - scale down dirty throttling threshold for one task
290 * 265 *
291 * task specific dirty limit: 266 * task specific dirty limit:
292 * 267 *
293 * dirty -= (dirty/8) * p_{t} 268 * dirty -= (dirty/8) * p_{t}
269 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled.
294 */ 276 */
295static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) 277static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty)
296{ 279{
297 long numerator, denominator; 280 long numerator, denominator;
298 unsigned long dirty = *pdirty; 281 unsigned long dirty = bdi_dirty;
299 u64 inv = dirty >> 3; 282 u64 inv = dirty >> 3;
300 283
301 task_dirties_fraction(tsk, &numerator, &denominator); 284 task_dirties_fraction(tsk, &numerator, &denominator);
@@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303 do_div(inv, denominator); 286 do_div(inv, denominator);
304 287
305 dirty -= inv; 288 dirty -= inv;
306 if (dirty < *pdirty/2)
307 dirty = *pdirty/2;
308 289
309 *pdirty = dirty; 290 return max(dirty, bdi_dirty/2);
310} 291}
311 292
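
task_dirty_limit() lowers a task's effective threshold by (bdi_dirty/8) scaled by the task's share of recently dirtied pages, and never below half of the bdi limit, so heavy dirtiers hit throttling before light ones. A small sketch of just that arithmetic, taking the share as an explicit numerator/denominator instead of the kernel's per-task proportion tracking:

/*
 * Sketch of the per-task scaling: a task that produced fraction p of the
 * recently dirtied pages has its threshold lowered by (bdi_dirty/8) * p,
 * never below half of the bdi limit.
 */
#include <stdio.h>

static unsigned long task_dirty_limit(unsigned long bdi_dirty,
				      long numerator, long denominator)
{
	unsigned long dirty = bdi_dirty;
	unsigned long inv = bdi_dirty >> 3;	/* dirty / 8 */

	inv *= numerator;
	inv /= denominator;
	dirty -= inv;

	return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

int main(void)
{
	/* light dirtier (5%) vs. heavy dirtier (90%) against an 8000-page limit */
	printf("light: %lu\n", task_dirty_limit(8000, 5, 100));		/* 7950 */
	printf("heavy: %lu\n", task_dirty_limit(8000, 90, 100));	/* 7100 */
	return 0;
}
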
312/* 293/*
@@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
416 return x + 1; /* Ensure that we never return 0 */ 397 return x + 1; /* Ensure that we never return 0 */
417} 398}
418 399
419void 400/*
420get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, 401 * global_dirty_limits - background-writeback and dirty-throttling thresholds
421 unsigned long *pbdi_dirty, struct backing_dev_info *bdi) 402 *
403 * Calculate the dirty thresholds based on sysctl parameters
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * runtime tasks.
408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
422{ 410{
423 unsigned long background; 411 unsigned long background;
424 unsigned long dirty; 412 unsigned long dirty;
@@ -427,14 +415,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
427 415
428 if (vm_dirty_bytes) 416 if (vm_dirty_bytes)
429 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
430 else { 418 else
431 int dirty_ratio; 419 dirty = (vm_dirty_ratio * available_memory) / 100;
432
433 dirty_ratio = vm_dirty_ratio;
434 if (dirty_ratio < 5)
435 dirty_ratio = 5;
436 dirty = (dirty_ratio * available_memory) / 100;
437 }
438 420
439 if (dirty_background_bytes) 421 if (dirty_background_bytes)
440 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 422 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
@@ -450,27 +432,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
450 } 432 }
451 *pbackground = background; 433 *pbackground = background;
452 *pdirty = dirty; 434 *pdirty = dirty;
435}
453 436
454 if (bdi) { 437/*
455 u64 bdi_dirty; 438 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
456 long numerator, denominator; 439 *
440 * Allocate high/low dirty limits to fast/slow devices, in order to prevent
441 * - starving fast devices
442 * - piling up dirty pages (that will take long time to sync) on slow devices
443 *
444 * The bdi's share of dirty limit will be adapting to its throughput and
445 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
446 */
447unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
448{
449 u64 bdi_dirty;
450 long numerator, denominator;
457 451
458 /* 452 /*
459 * Calculate this BDI's share of the dirty ratio. 453 * Calculate this BDI's share of the dirty ratio.
460 */ 454 */
461 bdi_writeout_fraction(bdi, &numerator, &denominator); 455 bdi_writeout_fraction(bdi, &numerator, &denominator);
462 456
463 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; 457 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
464 bdi_dirty *= numerator; 458 bdi_dirty *= numerator;
465 do_div(bdi_dirty, denominator); 459 do_div(bdi_dirty, denominator);
466 bdi_dirty += (dirty * bdi->min_ratio) / 100; 460
467 if (bdi_dirty > (dirty * bdi->max_ratio) / 100) 461 bdi_dirty += (dirty * bdi->min_ratio) / 100;
468 bdi_dirty = dirty * bdi->max_ratio / 100; 462 if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
469 463 bdi_dirty = dirty * bdi->max_ratio / 100;
470 *pbdi_dirty = bdi_dirty; 464
471 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); 465 return bdi_dirty;
472 task_dirty_limit(current, pbdi_dirty);
473 }
474} 466}
475 467
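
bdi_dirty_limit() hands each backing device a slice of the global threshold proportional to its recent writeout throughput, then applies the per-bdi min_ratio floor and max_ratio cap. The sketch below reproduces that arithmetic with the writeout fraction passed in directly; the global bdi_min_ratio term is assumed to be zero to keep it short.

/*
 * Sketch of the bdi share computation; the fraction and ratios are plain
 * parameters here rather than per-bdi state.
 */
#include <stdio.h>

static unsigned long bdi_dirty_limit(unsigned long dirty,
				     long numerator, long denominator,
				     unsigned int min_ratio,
				     unsigned int max_ratio)
{
	unsigned long long bdi_dirty;

	/* share proportional to this device's recent writeout throughput */
	bdi_dirty = (unsigned long long)dirty * numerator / denominator;

	bdi_dirty += dirty * min_ratio / 100;		/* guaranteed floor */
	if (bdi_dirty > (unsigned long long)dirty * max_ratio / 100)
		bdi_dirty = (unsigned long long)dirty * max_ratio / 100;

	return (unsigned long)bdi_dirty;
}

int main(void)
{
	/* fast disk doing 70% of the writeout vs. a slow stick doing 5% */
	printf("fast: %lu\n", bdi_dirty_limit(100000, 70, 100, 0, 100));
	printf("slow: %lu\n", bdi_dirty_limit(100000, 5, 100, 0, 20));
	return 0;
}
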
476/* 468/*
@@ -490,7 +482,7 @@ static void balance_dirty_pages(struct address_space *mapping,
490 unsigned long bdi_thresh; 482 unsigned long bdi_thresh;
491 unsigned long pages_written = 0; 483 unsigned long pages_written = 0;
492 unsigned long pause = 1; 484 unsigned long pause = 1;
493 485 bool dirty_exceeded = false;
494 struct backing_dev_info *bdi = mapping->backing_dev_info; 486 struct backing_dev_info *bdi = mapping->backing_dev_info;
495 487
496 for (;;) { 488 for (;;) {
@@ -501,46 +493,23 @@ static void balance_dirty_pages(struct address_space *mapping,
501 .range_cyclic = 1, 493 .range_cyclic = 1,
502 }; 494 };
503 495
504 get_dirty_limits(&background_thresh, &dirty_thresh,
505 &bdi_thresh, bdi);
506
507 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 496 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
508 global_page_state(NR_UNSTABLE_NFS); 497 global_page_state(NR_UNSTABLE_NFS);
509 nr_writeback = global_page_state(NR_WRITEBACK); 498 nr_writeback = global_page_state(NR_WRITEBACK);
510 499
511 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 500 global_dirty_limits(&background_thresh, &dirty_thresh);
512 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
513
514 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
515 break;
516 501
517 /* 502 /*
518 * Throttle it only when the background writeback cannot 503 * Throttle it only when the background writeback cannot
519 * catch-up. This avoids (excessively) small writeouts 504 * catch-up. This avoids (excessively) small writeouts
520 * when the bdi limits are ramping up. 505 * when the bdi limits are ramping up.
521 */ 506 */
522 if (nr_reclaimable + nr_writeback < 507 if (nr_reclaimable + nr_writeback <=
523 (background_thresh + dirty_thresh) / 2) 508 (background_thresh + dirty_thresh) / 2)
524 break; 509 break;
525 510
526 if (!bdi->dirty_exceeded) 511 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
527 bdi->dirty_exceeded = 1; 512 bdi_thresh = task_dirty_limit(current, bdi_thresh);
528
529 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
530 * Unstable writes are a feature of certain networked
531 * filesystems (i.e. NFS) in which data may have been
532 * written to the server's write cache, but has not yet
533 * been flushed to permanent storage.
534 * Only move pages to writeback if this bdi is over its
535 * threshold otherwise wait until the disk writes catch
536 * up.
537 */
538 if (bdi_nr_reclaimable > bdi_thresh) {
539 writeback_inodes_wb(&bdi->wb, &wbc);
540 pages_written += write_chunk - wbc.nr_to_write;
541 get_dirty_limits(&background_thresh, &dirty_thresh,
542 &bdi_thresh, bdi);
543 }
544 513
545 /* 514 /*
546 * In order to avoid the stacked BDI deadlock we need 515 * In order to avoid the stacked BDI deadlock we need
@@ -555,16 +524,45 @@ static void balance_dirty_pages(struct address_space *mapping,
555 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 524 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
556 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 525 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
557 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 526 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
558 } else if (bdi_nr_reclaimable) { 527 } else {
559 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 528 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
560 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 529 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
561 } 530 }
562 531
563 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) 532 /*
533 * The bdi threshold is a "soft" limit derived from the
534 * global "hard" limit. The former helps prevent a heavy-IO
535 * bdi or process from holding back light ones; the latter is
536 * the last-resort safeguard.
537 */
538 dirty_exceeded =
539 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
540 || (nr_reclaimable + nr_writeback > dirty_thresh);
541
542 if (!dirty_exceeded)
564 break; 543 break;
565 if (pages_written >= write_chunk)
566 break; /* We've done our duty */
567 544
545 if (!bdi->dirty_exceeded)
546 bdi->dirty_exceeded = 1;
547
548 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
549 * Unstable writes are a feature of certain networked
550 * filesystems (i.e. NFS) in which data may have been
551 * written to the server's write cache, but has not yet
552 * been flushed to permanent storage.
553 * Only move pages to writeback if this bdi is over its
554 * threshold otherwise wait until the disk writes catch
555 * up.
556 */
557 trace_wbc_balance_dirty_start(&wbc, bdi);
558 if (bdi_nr_reclaimable > bdi_thresh) {
559 writeback_inodes_wb(&bdi->wb, &wbc);
560 pages_written += write_chunk - wbc.nr_to_write;
561 trace_wbc_balance_dirty_written(&wbc, bdi);
562 if (pages_written >= write_chunk)
563 break; /* We've done our duty */
564 }
565 trace_wbc_balance_dirty_wait(&wbc, bdi);
568 __set_current_state(TASK_INTERRUPTIBLE); 566 __set_current_state(TASK_INTERRUPTIBLE);
569 io_schedule_timeout(pause); 567 io_schedule_timeout(pause);
570 568
@@ -577,8 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping,
577 pause = HZ / 10; 575 pause = HZ / 10;
578 } 576 }
579 577
580 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 578 if (!dirty_exceeded && bdi->dirty_exceeded)
581 bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 579 bdi->dirty_exceeded = 0;
583 580
584 if (writeback_in_progress(bdi)) 581 if (writeback_in_progress(bdi))
@@ -593,9 +590,7 @@ static void balance_dirty_pages(struct address_space *mapping,
593 * background_thresh, to keep the amount of dirty memory low. 590 * background_thresh, to keep the amount of dirty memory low.
594 */ 591 */
595 if ((laptop_mode && pages_written) || 592 if ((laptop_mode && pages_written) ||
596 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 593 (!laptop_mode && (nr_reclaimable > background_thresh)))
597 + global_page_state(NR_UNSTABLE_NFS))
598 > background_thresh)))
599 bdi_start_background_writeback(bdi); 594 bdi_start_background_writeback(bdi);
600} 595}
601 596
@@ -659,7 +654,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
659 unsigned long dirty_thresh; 654 unsigned long dirty_thresh;
660 655
661 for ( ; ; ) { 656 for ( ; ; ) {
662 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 657 global_dirty_limits(&background_thresh, &dirty_thresh);
663 658
664 /* 659 /*
665 * Boost the allowable dirty threshold a bit for page 660 * Boost the allowable dirty threshold a bit for page
@@ -805,6 +800,42 @@ void __init page_writeback_init(void)
805} 800}
806 801
807/** 802/**
803 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
804 * @mapping: address space structure to write
805 * @start: starting page index
806 * @end: ending page index (inclusive)
807 *
808 * This function scans the page range from @start to @end (inclusive) and tags
809 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
810 * that write_cache_pages (or whoever calls this function) will then use
811 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
812 * used to avoid livelocking of writeback by a process steadily creating new
813 * dirty pages in the file (thus it is important for this function to be quick
814 * so that it can tag pages faster than a dirtying process can create them).
815 */
816/*
817 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
818 */
819void tag_pages_for_writeback(struct address_space *mapping,
820 pgoff_t start, pgoff_t end)
821{
822#define WRITEBACK_TAG_BATCH 4096
823 unsigned long tagged;
824
825 do {
826 spin_lock_irq(&mapping->tree_lock);
827 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
828 &start, end, WRITEBACK_TAG_BATCH,
829 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
830 spin_unlock_irq(&mapping->tree_lock);
831 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
832 cond_resched();
833 /* We check 'start' to handle wrapping when end == ~0UL */
834 } while (tagged >= WRITEBACK_TAG_BATCH && start);
835}
836EXPORT_SYMBOL(tag_pages_for_writeback);
837
838/**
808 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 839 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
809 * @mapping: address space structure to write 840 * @mapping: address space structure to write
810 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 841 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +849,13 @@ void __init page_writeback_init(void)
818 * the call was made get new I/O started against them. If wbc->sync_mode is 849 * the call was made get new I/O started against them. If wbc->sync_mode is
819 * WB_SYNC_ALL then we were called for data integrity and we must wait for 850 * WB_SYNC_ALL then we were called for data integrity and we must wait for
820 * existing IO to complete. 851 * existing IO to complete.
852 *
853 * To avoid livelocks (when other process dirties new pages), we first tag
854 * pages which should be written back with TOWRITE tag and only then start
855 * writing them. For data-integrity sync we have to be careful so that we do
856 * not miss some pages (e.g., because some other process has cleared TOWRITE
857 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
858 * by the process clearing the DIRTY tag (and submitting the page for IO).
821 */ 859 */
822int write_cache_pages(struct address_space *mapping, 860int write_cache_pages(struct address_space *mapping,
823 struct writeback_control *wbc, writepage_t writepage, 861 struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +871,7 @@ int write_cache_pages(struct address_space *mapping,
833 pgoff_t done_index; 871 pgoff_t done_index;
834 int cycled; 872 int cycled;
835 int range_whole = 0; 873 int range_whole = 0;
874 int tag;
836 875
837 pagevec_init(&pvec, 0); 876 pagevec_init(&pvec, 0);
838 if (wbc->range_cyclic) { 877 if (wbc->range_cyclic) {
@@ -849,29 +888,19 @@ int write_cache_pages(struct address_space *mapping,
849 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 888 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
850 range_whole = 1; 889 range_whole = 1;
851 cycled = 1; /* ignore range_cyclic tests */ 890 cycled = 1; /* ignore range_cyclic tests */
852
853 /*
854 * If this is a data integrity sync, cap the writeback to the
855 * current end of file. Any extension to the file that occurs
856 * after this is a new write and we don't need to write those
857 * pages out to fulfil our data integrity requirements. If we
858 * try to write them out, we can get stuck in this scan until
859 * the concurrent writer stops adding dirty pages and extending
860 * EOF.
861 */
862 if (wbc->sync_mode == WB_SYNC_ALL &&
863 wbc->range_end == LLONG_MAX) {
864 end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
865 }
866 } 891 }
867 892 if (wbc->sync_mode == WB_SYNC_ALL)
893 tag = PAGECACHE_TAG_TOWRITE;
894 else
895 tag = PAGECACHE_TAG_DIRTY;
868retry: 896retry:
897 if (wbc->sync_mode == WB_SYNC_ALL)
898 tag_pages_for_writeback(mapping, index, end);
869 done_index = index; 899 done_index = index;
870 while (!done && (index <= end)) { 900 while (!done && (index <= end)) {
871 int i; 901 int i;
872 902
873 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 903 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
874 PAGECACHE_TAG_DIRTY,
875 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 904 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
876 if (nr_pages == 0) 905 if (nr_pages == 0)
877 break; 906 break;
@@ -929,6 +958,7 @@ continue_unlock:
929 if (!clear_page_dirty_for_io(page)) 958 if (!clear_page_dirty_for_io(page))
930 goto continue_unlock; 959 goto continue_unlock;
931 960
961 trace_wbc_writepage(wbc, mapping->backing_dev_info);
932 ret = (*writepage)(page, wbc, data); 962 ret = (*writepage)(page, wbc, data);
933 if (unlikely(ret)) { 963 if (unlikely(ret)) {
934 if (ret == AOP_WRITEPAGE_ACTIVATE) { 964 if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -949,22 +979,16 @@ continue_unlock:
949 } 979 }
950 } 980 }
951 981
952 if (wbc->nr_to_write > 0) { 982 /*
953 if (--wbc->nr_to_write == 0 && 983 * We stop writing back only if we are not doing
954 wbc->sync_mode == WB_SYNC_NONE) { 984 * integrity sync. In case of integrity sync we have to
955 /* 985 * keep going until we have written all the pages
956 * We stop writing back only if we are 986 * we tagged for writeback prior to entering this loop.
957 * not doing integrity sync. In case of 987 */
958 * integrity sync we have to keep going 988 if (--wbc->nr_to_write <= 0 &&
959 * because someone may be concurrently 989 wbc->sync_mode == WB_SYNC_NONE) {
960 * dirtying pages, and we might have 990 done = 1;
961 * synced a lot of newly appeared dirty 991 break;
962 * pages, but have not synced all of the
963 * old dirty pages.
964 */
965 done = 1;
966 break;
967 }
968 } 992 }
969 } 993 }
970 pagevec_release(&pvec); 994 pagevec_release(&pvec);
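
Two small decisions drive the rewritten write_cache_pages() loop: integrity sync (WB_SYNC_ALL) scans the TOWRITE tag it set up front, so concurrently dirtied pages cannot livelock it, and only non-integrity writeback is allowed to stop once nr_to_write is exhausted. A sketch of just those two predicates, using local enums in place of the pagecache tag and sync-mode constants:

/*
 * Sketch of the tag selection and early-exit rule; the enum values are
 * stand-ins, not the kernel's PAGECACHE_TAG_* or WB_SYNC_* definitions.
 */
#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };
enum tag { TAG_DIRTY, TAG_TOWRITE };

static enum tag writeback_tag(enum sync_mode mode)
{
	/* integrity sync only touches pages tagged before the walk started */
	return mode == WB_SYNC_ALL ? TAG_TOWRITE : TAG_DIRTY;
}

static int may_stop(enum sync_mode mode, long nr_to_write)
{
	/* background writeback stops at its quota; integrity sync never does */
	return mode == WB_SYNC_NONE && nr_to_write <= 0;
}

int main(void)
{
	printf("%d %d\n", writeback_tag(WB_SYNC_ALL), may_stop(WB_SYNC_ALL, -5));
	printf("%d %d\n", writeback_tag(WB_SYNC_NONE), may_stop(WB_SYNC_NONE, 0));
	return 0;
}
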
@@ -1091,11 +1115,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1091{ 1115{
1092 if (mapping_cap_account_dirty(mapping)) { 1116 if (mapping_cap_account_dirty(mapping)) {
1093 __inc_zone_page_state(page, NR_FILE_DIRTY); 1117 __inc_zone_page_state(page, NR_FILE_DIRTY);
1118 __inc_zone_page_state(page, NR_DIRTIED);
1094 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1119 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1095 task_dirty_inc(current); 1120 task_dirty_inc(current);
1096 task_io_account_write(PAGE_CACHE_SIZE); 1121 task_io_account_write(PAGE_CACHE_SIZE);
1097 } 1122 }
1098} 1123}
1124EXPORT_SYMBOL(account_page_dirtied);
1125
1126/*
1127 * Helper function for set_page_writeback family.
1128 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1129 * wrt interrupts.
1130 */
1131void account_page_writeback(struct page *page)
1132{
1133 inc_zone_page_state(page, NR_WRITEBACK);
1134 inc_zone_page_state(page, NR_WRITTEN);
1135}
1136EXPORT_SYMBOL(account_page_writeback);
1099 1137
1100/* 1138/*
1101 * For address_spaces which do not use buffers. Just tag the page as dirty in 1139 * For address_spaces which do not use buffers. Just tag the page as dirty in
@@ -1327,12 +1365,15 @@ int test_set_page_writeback(struct page *page)
1327 radix_tree_tag_clear(&mapping->page_tree, 1365 radix_tree_tag_clear(&mapping->page_tree,
1328 page_index(page), 1366 page_index(page),
1329 PAGECACHE_TAG_DIRTY); 1367 PAGECACHE_TAG_DIRTY);
1368 radix_tree_tag_clear(&mapping->page_tree,
1369 page_index(page),
1370 PAGECACHE_TAG_TOWRITE);
1330 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1371 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1331 } else { 1372 } else {
1332 ret = TestSetPageWriteback(page); 1373 ret = TestSetPageWriteback(page);
1333 } 1374 }
1334 if (!ret) 1375 if (!ret)
1335 inc_zone_page_state(page, NR_WRITEBACK); 1376 account_page_writeback(page);
1336 return ret; 1377 return ret;
1337 1378
1338} 1379}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339eb04c..07a654486f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/compiler.h> 25#include <linux/compiler.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
@@ -530,7 +531,7 @@ static inline void __free_one_page(struct page *page,
530 * so it's less likely to be used soon and more likely to be merged 531 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page 532 * as a higher order page
532 */ 533 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = __find_combined_index(page_idx, order);
536 higher_page = page + combined_idx - page_idx; 537 higher_page = page + combined_idx - page_idx;
@@ -588,13 +589,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
588{ 589{
589 int migratetype = 0; 590 int migratetype = 0;
590 int batch_free = 0; 591 int batch_free = 0;
592 int to_free = count;
591 593
592 spin_lock(&zone->lock); 594 spin_lock(&zone->lock);
593 zone->all_unreclaimable = 0; 595 zone->all_unreclaimable = 0;
594 zone->pages_scanned = 0; 596 zone->pages_scanned = 0;
595 597
596 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 598 while (to_free) {
597 while (count) {
598 struct page *page; 599 struct page *page;
599 struct list_head *list; 600 struct list_head *list;
600 601
@@ -619,8 +620,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
619 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 620 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
620 __free_one_page(page, zone, 0, page_private(page)); 621 __free_one_page(page, zone, 0, page_private(page));
621 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 622 trace_mm_page_pcpu_drain(page, 0, page_private(page));
622 } while (--count && --batch_free && !list_empty(list)); 623 } while (--to_free && --batch_free && !list_empty(list));
623 } 624 }
625 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
624 spin_unlock(&zone->lock); 626 spin_unlock(&zone->lock);
625} 627}
626 628
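
free_pcppages_bulk() now consumes a separate to_free counter in the loop and only bumps NR_FREE_PAGES afterwards, apparently so the pages are not reported as free before they are actually back on the buddy lists; keeping count untouched preserves the amount for that deferred update. A trivial sketch of that shape, with a plain long standing in for the zone counter:

/*
 * Sketch of the deferred counter update: the loop burns to_free while count
 * stays intact for the accounting done after the pages are freed.
 */
#include <stdio.h>

static long nr_free_pages;

static void free_bulk(int count)
{
	int to_free = count;

	while (to_free) {
		/* ... hand one page back to the buddy allocator ... */
		to_free--;
	}
	nr_free_pages += count;	/* count is still the number freed */
}

int main(void)
{
	free_bulk(31);
	printf("NR_FREE_PAGES += %ld\n", nr_free_pages);	/* 31 */
	return 0;
}
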
@@ -631,8 +633,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
631 zone->all_unreclaimable = 0; 633 zone->all_unreclaimable = 0;
632 zone->pages_scanned = 0; 634 zone->pages_scanned = 0;
633 635
634 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
635 __free_one_page(page, zone, order, migratetype); 636 __free_one_page(page, zone, order, migratetype);
637 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
636 spin_unlock(&zone->lock); 638 spin_unlock(&zone->lock);
637} 639}
638 640
@@ -1461,7 +1463,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1461{ 1463{
1462 /* free_pages my go negative - that's OK */ 1464 /* free_pages my go negative - that's OK */
1463 long min = mark; 1465 long min = mark;
1464 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1466 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1465 int o; 1467 int o;
1466 1468
1467 if (alloc_flags & ALLOC_HIGH) 1469 if (alloc_flags & ALLOC_HIGH)
@@ -1738,7 +1740,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1738 struct page *page; 1740 struct page *page;
1739 1741
1740 /* Acquire the OOM killer lock for the zones in zonelist */ 1742 /* Acquire the OOM killer lock for the zones in zonelist */
1741 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1743 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
1742 schedule_timeout_uninterruptible(1); 1744 schedule_timeout_uninterruptible(1);
1743 return NULL; 1745 return NULL;
1744 } 1746 }
@@ -1759,6 +1761,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1759 /* The OOM killer will not help higher order allocs */ 1761 /* The OOM killer will not help higher order allocs */
1760 if (order > PAGE_ALLOC_COSTLY_ORDER) 1762 if (order > PAGE_ALLOC_COSTLY_ORDER)
1761 goto out; 1763 goto out;
1764 /* The OOM killer does not needlessly kill tasks for lowmem */
1765 if (high_zoneidx < ZONE_NORMAL)
1766 goto out;
1762 /* 1767 /*
1763 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 1768 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1764 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 1769 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -1843,6 +1848,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1843 struct page *page = NULL; 1848 struct page *page = NULL;
1844 struct reclaim_state reclaim_state; 1849 struct reclaim_state reclaim_state;
1845 struct task_struct *p = current; 1850 struct task_struct *p = current;
1851 bool drained = false;
1846 1852
1847 cond_resched(); 1853 cond_resched();
1848 1854
@@ -1861,14 +1867,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1861 1867
1862 cond_resched(); 1868 cond_resched();
1863 1869
1864 if (order != 0) 1870 if (unlikely(!(*did_some_progress)))
1865 drain_all_pages(); 1871 return NULL;
1866 1872
1867 if (likely(*did_some_progress)) 1873retry:
1868 page = get_page_from_freelist(gfp_mask, nodemask, order, 1874 page = get_page_from_freelist(gfp_mask, nodemask, order,
1869 zonelist, high_zoneidx, 1875 zonelist, high_zoneidx,
1870 alloc_flags, preferred_zone, 1876 alloc_flags, preferred_zone,
1871 migratetype); 1877 migratetype);
1878
1879 /*
1880 * If an allocation failed after direct reclaim, it could be because
1881 * pages are pinned on the per-cpu lists. Drain them and try again
1882 */
1883 if (!page && !drained) {
1884 drain_all_pages();
1885 drained = true;
1886 goto retry;
1887 }
1888
1872 return page; 1889 return page;
1873} 1890}
1874 1891
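
The direct-reclaim path above retries the freelist exactly once after drain_all_pages(), on the theory that reclaimed pages may be sitting on per-CPU lists rather than the buddy freelists. A sketch of that retry-once-after-drain shape, with stub functions standing in for reclaim, the freelist and the drain:

/*
 * Sketch of the drain-then-retry pattern; the stubs model a case where the
 * only free pages are cached on per-cpu lists until they are drained.
 */
#include <stdbool.h>
#include <stdio.h>

static int pcp_cached = 1;	/* pretend pages are stuck on pcp lists */

static long do_reclaim(void)        { return 32; }	/* pages reclaimed */
static void drain_all_pages(void)   { pcp_cached = 0; }
static bool get_from_freelist(void) { return pcp_cached == 0; }

static bool direct_reclaim(void)
{
	bool drained = false;

	if (!do_reclaim())
		return false;		/* no progress: nothing to retry for */
retry:
	if (get_from_freelist())
		return true;

	/* pages may be pinned on per-cpu lists: flush them and try again */
	if (!drained) {
		drain_all_pages();
		drained = true;
		goto retry;
	}
	return false;
}

int main(void)
{
	printf("allocated: %s\n", direct_reclaim() ? "yes" : "no");
	return 0;
}
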
@@ -1890,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1890 preferred_zone, migratetype); 1907 preferred_zone, migratetype);
1891 1908
1892 if (!page && gfp_mask & __GFP_NOFAIL) 1909 if (!page && gfp_mask & __GFP_NOFAIL)
1893 congestion_wait(BLK_RW_ASYNC, HZ/50); 1910 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1894 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1911 } while (!page && (gfp_mask & __GFP_NOFAIL));
1895 1912
1896 return page; 1913 return page;
@@ -1915,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1915 const gfp_t wait = gfp_mask & __GFP_WAIT; 1932 const gfp_t wait = gfp_mask & __GFP_WAIT;
1916 1933
1917 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 1934 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1918 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 1935 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1919 1936
1920 /* 1937 /*
1921 * The caller may dip into page reserves a bit more if the caller 1938 * The caller may dip into page reserves a bit more if the caller
@@ -1923,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1923 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1940 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1924 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1941 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1925 */ 1942 */
1926 alloc_flags |= (gfp_mask & __GFP_HIGH); 1943 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1927 1944
1928 if (!wait) { 1945 if (!wait) {
1929 alloc_flags |= ALLOC_HARDER; 1946 alloc_flags |= ALLOC_HARDER;
@@ -2052,15 +2069,23 @@ rebalance:
2052 if (page) 2069 if (page)
2053 goto got_pg; 2070 goto got_pg;
2054 2071
2055 /* 2072 if (!(gfp_mask & __GFP_NOFAIL)) {
2056 * The OOM killer does not trigger for high-order 2073 /*
2057 * ~__GFP_NOFAIL allocations so if no progress is being 2074 * The oom killer is not called for high-order
2058 * made, there are no other options and retrying is 2075 * allocations that may fail, so if no progress
2059 * unlikely to help. 2076 * is being made, there are no other options and
2060 */ 2077 * retrying is unlikely to help.
2061 if (order > PAGE_ALLOC_COSTLY_ORDER && 2078 */
2062 !(gfp_mask & __GFP_NOFAIL)) 2079 if (order > PAGE_ALLOC_COSTLY_ORDER)
2063 goto nopage; 2080 goto nopage;
2081 /*
2082 * The oom killer is not called for lowmem
2083 * allocations to prevent needlessly killing
2084 * innocent tasks.
2085 */
2086 if (high_zoneidx < ZONE_NORMAL)
2087 goto nopage;
2088 }
2064 2089
2065 goto restart; 2090 goto restart;
2066 } 2091 }
@@ -2070,7 +2095,7 @@ rebalance:
2070 pages_reclaimed += did_some_progress; 2095 pages_reclaimed += did_some_progress;
2071 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2096 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2072 /* Wait for some write requests to complete then retry */ 2097 /* Wait for some write requests to complete then retry */
2073 congestion_wait(BLK_RW_ASYNC, HZ/50); 2098 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2074 goto rebalance; 2099 goto rebalance;
2075 } 2100 }
2076 2101
@@ -2412,7 +2437,7 @@ void show_free_areas(void)
2412 " all_unreclaimable? %s" 2437 " all_unreclaimable? %s"
2413 "\n", 2438 "\n",
2414 zone->name, 2439 zone->name,
2415 K(zone_page_state(zone, NR_FREE_PAGES)), 2440 K(zone_nr_free_pages(zone)),
2416 K(min_wmark_pages(zone)), 2441 K(min_wmark_pages(zone)),
2417 K(low_wmark_pages(zone)), 2442 K(low_wmark_pages(zone)),
2418 K(high_wmark_pages(zone)), 2443 K(high_wmark_pages(zone)),
@@ -3612,6 +3637,41 @@ void __init free_bootmem_with_active_regions(int nid,
3612 } 3637 }
3613} 3638}
3614 3639
3640#ifdef CONFIG_HAVE_MEMBLOCK
3641u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3642 u64 goal, u64 limit)
3643{
3644 int i;
3645
3646 /* Need to go over early_node_map to find out good range for node */
3647 for_each_active_range_index_in_nid(i, nid) {
3648 u64 addr;
3649 u64 ei_start, ei_last;
3650 u64 final_start, final_end;
3651
3652 ei_last = early_node_map[i].end_pfn;
3653 ei_last <<= PAGE_SHIFT;
3654 ei_start = early_node_map[i].start_pfn;
3655 ei_start <<= PAGE_SHIFT;
3656
3657 final_start = max(ei_start, goal);
3658 final_end = min(ei_last, limit);
3659
3660 if (final_start >= final_end)
3661 continue;
3662
3663 addr = memblock_find_in_range(final_start, final_end, size, align);
3664
3665 if (addr == MEMBLOCK_ERROR)
3666 continue;
3667
3668 return addr;
3669 }
3670
3671 return MEMBLOCK_ERROR;
3672}
3673#endif
3674
3615int __init add_from_early_node_map(struct range *range, int az, 3675int __init add_from_early_node_map(struct range *range, int az,
3616 int nr_range, int nid) 3676 int nr_range, int nid)
3617{ 3677{
@@ -3631,46 +3691,26 @@ int __init add_from_early_node_map(struct range *range, int az,
3631void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 3691void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3632 u64 goal, u64 limit) 3692 u64 goal, u64 limit)
3633{ 3693{
3634 int i;
3635 void *ptr; 3694 void *ptr;
3695 u64 addr;
3636 3696
3637 if (limit > get_max_mapped()) 3697 if (limit > memblock.current_limit)
3638 limit = get_max_mapped(); 3698 limit = memblock.current_limit;
3639 3699
3640 /* need to go over early_node_map to find out good range for node */ 3700 addr = find_memory_core_early(nid, size, align, goal, limit);
3641 for_each_active_range_index_in_nid(i, nid) {
3642 u64 addr;
3643 u64 ei_start, ei_last;
3644 3701
3645 ei_last = early_node_map[i].end_pfn; 3702 if (addr == MEMBLOCK_ERROR)
3646 ei_last <<= PAGE_SHIFT; 3703 return NULL;
3647 ei_start = early_node_map[i].start_pfn;
3648 ei_start <<= PAGE_SHIFT;
3649 addr = find_early_area(ei_start, ei_last,
3650 goal, limit, size, align);
3651
3652 if (addr == -1ULL)
3653 continue;
3654
3655#if 0
3656 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3657 nid,
3658 ei_start, ei_last, goal, limit, size,
3659 align, addr);
3660#endif
3661
3662 ptr = phys_to_virt(addr);
3663 memset(ptr, 0, size);
3664 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3665 /*
3666 * The min_count is set to 0 so that bootmem allocated blocks
3667 * are never reported as leaks.
3668 */
3669 kmemleak_alloc(ptr, size, 0, 0);
3670 return ptr;
3671 }
3672 3704
3673 return NULL; 3705 ptr = phys_to_virt(addr);
3706 memset(ptr, 0, size);
3707 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
3708 /*
3709 * The min_count is set to 0 so that bootmem allocated blocks
3710 * are never reported as leaks.
3711 */
3712 kmemleak_alloc(ptr, size, 0, 0);
3713 return ptr;
3674} 3714}
3675#endif 3715#endif
3676 3716
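The two hunks above move the early per-node allocator onto memblock: find_memory_core_early() clamps each early_node_map[] range to [goal, limit) and asks memblock for a block inside the intersection, and __alloc_memory_core_early() then zeroes and reserves whatever was found. Below is a stand-alone user-space sketch of that clamp-and-search step; the node ranges, sizes and the top-down placement policy are invented for illustration and only mirror the shape of the kernel logic.

/*
 * Illustrative sketch (not kernel code): intersect each early range with
 * [goal, limit) and return the first aligned block that fits.
 */
#include <stdint.h>
#include <stdio.h>

#define ALLOC_ERROR (~0ULL)                    /* stands in for MEMBLOCK_ERROR */

struct early_range { uint64_t start, end; };   /* [start, end) in bytes */

static uint64_t find_in_ranges(const struct early_range *r, int nr,
			       uint64_t size, uint64_t align,
			       uint64_t goal, uint64_t limit)
{
	for (int i = 0; i < nr; i++) {
		uint64_t start = r[i].start > goal  ? r[i].start : goal;
		uint64_t end   = r[i].end   < limit ? r[i].end   : limit;

		if (start >= end)
			continue;               /* no overlap with [goal, limit) */

		/* place the block at the highest aligned address that fits */
		uint64_t addr = (end - size) & ~(align - 1);
		if (addr >= start)
			return addr;
	}
	return ALLOC_ERROR;
}

int main(void)
{
	struct early_range node0[] = {
		{ 0x00100000, 0x00800000 },
		{ 0x01000000, 0x04000000 },
	};
	uint64_t addr = find_in_ranges(node0, 2, 0x10000, 0x1000,
				       0x00400000, 0x02000000);
	if (addr != ALLOC_ERROR)
		printf("allocated at %#llx\n", (unsigned long long)addr);
	return 0;
}
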
@@ -4089,8 +4129,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4089 zone_seqlock_init(zone); 4129 zone_seqlock_init(zone);
4090 zone->zone_pgdat = pgdat; 4130 zone->zone_pgdat = pgdat;
4091 4131
4092 zone->prev_priority = DEF_PRIORITY;
4093
4094 zone_pcp_init(zone); 4132 zone_pcp_init(zone);
4095 for_each_lru(l) { 4133 for_each_lru(l) {
4096 INIT_LIST_HEAD(&zone->lru[l].list); 4134 INIT_LIST_HEAD(&zone->lru[l].list);
@@ -5160,9 +5198,9 @@ void *__init alloc_large_system_hash(const char *tablename,
5160 if (!table) 5198 if (!table)
5161 panic("Failed to allocate %s hash table\n", tablename); 5199 panic("Failed to allocate %s hash table\n", tablename);
5162 5200
5163 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", 5201 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5164 tablename, 5202 tablename,
5165 (1U << log2qty), 5203 (1UL << log2qty),
5166 ilog2(size) - PAGE_SHIFT, 5204 ilog2(size) - PAGE_SHIFT,
5167 size); 5205 size);
5168 5206
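The printk fix above widens the entry count from (1U << log2qty) printed with %d to (1UL << log2qty) printed with %ld, so hash tables of 2^31 entries or more are reported correctly. A minimal stand-alone illustration of the truncation being avoided; the order value 31 is just an example.

#include <stdio.h>

int main(void)
{
	unsigned int log2qty = 31;
	unsigned long entries = 1UL << log2qty;

	printf("correct  (%%lu): %lu\n", entries);      /* 2147483648 */
	printf("truncated (%%d): %d\n", (int)entries);  /* typically -2147483648 */
	return 0;
}
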
@@ -5259,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5259 * page allocator never allocates memory from an ISOLATE block. 5297
5260 */ 5298 */
5261 5299
5300static int
5301__count_immobile_pages(struct zone *zone, struct page *page, int count)
5302{
5303 unsigned long pfn, iter, found;
5304 /*
5305 * For avoiding noise data, lru_add_drain_all() should be called
5306 * If ZONE_MOVABLE, the zone never contains immobile pages
5307 */
5308 if (zone_idx(zone) == ZONE_MOVABLE)
5309 return true;
5310
5311 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5312 return true;
5313
5314 pfn = page_to_pfn(page);
5315 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5316 unsigned long check = pfn + iter;
5317
5318 if (!pfn_valid_within(check)) {
5319 iter++;
5320 continue;
5321 }
5322 page = pfn_to_page(check);
5323 if (!page_count(page)) {
5324 if (PageBuddy(page))
5325 iter += (1 << page_order(page)) - 1;
5326 continue;
5327 }
5328 if (!PageLRU(page))
5329 found++;
5330 /*
5331 * If there are RECLAIMABLE pages, we need to check them.
5332 * But memory offline itself doesn't call shrink_slab() yet,
5333 * so this still needs to be fixed.
5334 */
5335 /*
5336 * If the page is not RAM, page_count() should be 0. We don't
5337 * need any further check. This is a _used_, non-movable page.
5338 *
5339 * The problematic thing here is PG_reserved pages. PG_reserved
5340 * is set to both of a memory hole page and a _used_ kernel
5341 * page at boot.
5342 */
5343 if (found > count)
5344 return false;
5345 }
5346 return true;
5347}
5348
5349bool is_pageblock_removable_nolock(struct page *page)
5350{
5351 struct zone *zone = page_zone(page);
5352 return __count_immobile_pages(zone, page, 0);
5353}
5354
5262int set_migratetype_isolate(struct page *page) 5355int set_migratetype_isolate(struct page *page)
5263{ 5356{
5264 struct zone *zone; 5357 struct zone *zone;
5265 struct page *curr_page; 5358 unsigned long flags, pfn;
5266 unsigned long flags, pfn, iter;
5267 unsigned long immobile = 0;
5268 struct memory_isolate_notify arg; 5359 struct memory_isolate_notify arg;
5269 int notifier_ret; 5360 int notifier_ret;
5270 int ret = -EBUSY; 5361 int ret = -EBUSY;
@@ -5274,11 +5365,6 @@ int set_migratetype_isolate(struct page *page)
5274 zone_idx = zone_idx(zone); 5365 zone_idx = zone_idx(zone);
5275 5366
5276 spin_lock_irqsave(&zone->lock, flags); 5367 spin_lock_irqsave(&zone->lock, flags);
5277 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5278 zone_idx == ZONE_MOVABLE) {
5279 ret = 0;
5280 goto out;
5281 }
5282 5368
5283 pfn = page_to_pfn(page); 5369 pfn = page_to_pfn(page);
5284 arg.start_pfn = pfn; 5370 arg.start_pfn = pfn;
@@ -5298,23 +5384,20 @@ int set_migratetype_isolate(struct page *page)
5298 */ 5384 */
5299 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5385 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5300 notifier_ret = notifier_to_errno(notifier_ret); 5386 notifier_ret = notifier_to_errno(notifier_ret);
5301 if (notifier_ret || !arg.pages_found) 5387 if (notifier_ret)
5302 goto out; 5388 goto out;
5303 5389 /*
5304 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5390 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5305 if (!pfn_valid_within(pfn)) 5391 * We just check MOVABLE pages.
5306 continue; 5392 */
5307 5393 if (__count_immobile_pages(zone, page, arg.pages_found))
5308 curr_page = pfn_to_page(iter);
5309 if (!page_count(curr_page) || PageLRU(curr_page))
5310 continue;
5311
5312 immobile++;
5313 }
5314
5315 if (arg.pages_found == immobile)
5316 ret = 0; 5394 ret = 0;
5317 5395
5396 /*
5397 * immobile means "not-on-lru" pages. If the immobile count is larger
5398 * than the removable-by-driver pages reported by the notifier, we fail.
5399 */
5400
5318out: 5401out:
5319 if (!ret) { 5402 if (!ret) {
5320 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5403 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
diff --git a/mm/page_io.c b/mm/page_io.c
index 31a3b962230..2dee975bf46 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
106 goto out; 106 goto out;
107 } 107 }
108 if (wbc->sync_mode == WB_SYNC_ALL) 108 if (wbc->sync_mode == WB_SYNC_ALL)
109 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 109 rw |= REQ_SYNC | REQ_UNPLUG;
110 count_vm_event(PSWPOUT); 110 count_vm_event(PSWPOUT);
111 set_page_writeback(page); 111 set_page_writeback(page);
112 unlock_page(page); 112 unlock_page(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd96745..4ae42bb4089 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 86 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 87 * zone->lock must be held before call this.
88 * 88 *
89 * Returns 0 if all pages in the range is isolated. 89 * Returns 1 if all pages in the range is isolated.
90 */ 90 */
91static int 91static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
119 struct zone *zone; 119 struct zone *zone;
120 int ret; 120 int ret;
121 121
122 pfn = start_pfn;
123 /* 122 /*
124 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page 123 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
125 * is not aligned to pageblock_nr_pages. 124 * is not aligned to pageblock_nr_pages.
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index df680855540..89633fefc6a 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -27,7 +27,7 @@
27 * chunk size is not aligned. percpu-km code will whine about it. 27 * chunk size is not aligned. percpu-km code will whine about it.
28 */ 28 */
29 29
30#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 30#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
31#error "contiguous percpu allocation is incompatible with paged first chunk" 31#error "contiguous percpu allocation is incompatible with paged first chunk"
32#endif 32#endif
33 33
@@ -35,7 +35,11 @@
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
37{ 37{
38 /* noop */ 38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
39 return 0; 43 return 0;
40} 44}
41 45
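With the km backend a chunk is one contiguous kernel allocation in which every CPU owns a unit-sized slice, so the new pcpu_populate_chunk() above only has to zero the [off, off+size) window in each CPU's slice, which keeps the zero-fill guarantee documented further down in this series. A stand-alone sketch of that addressing scheme; NR_CPUS and UNIT_SIZE are invented for the example.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define NR_CPUS   4
#define UNIT_SIZE 4096

static void *chunk_addr(void *base, int cpu, int off)
{
	return (char *)base + (size_t)cpu * UNIT_SIZE + off;
}

static void populate(void *base, int off, int size)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		memset(chunk_addr(base, cpu, off), 0, size);
}

int main(void)
{
	void *base = malloc((size_t)NR_CPUS * UNIT_SIZE);
	if (!base)
		return 1;

	memset(base, 0xff, (size_t)NR_CPUS * UNIT_SIZE);  /* pretend it's dirty */
	populate(base, 128, 64);                          /* clear one area     */

	printf("cpu2 byte at off 128: %d\n",
	       *(unsigned char *)chunk_addr(base, 2, 128));
	free(base);
	return 0;
}
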
diff --git a/mm/percpu.c b/mm/percpu.c
index 6470e771023..efe816856a9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -31,7 +31,7 @@
31 * as small as 4 bytes. The allocator organizes chunks into lists 31 * as small as 4 bytes. The allocator organizes chunks into lists
32 * according to free size and tries to allocate from the fullest one. 32 * according to free size and tries to allocate from the fullest one.
33 * Each chunk keeps the maximum contiguous area size hint which is 33 * Each chunk keeps the maximum contiguous area size hint which is
34 * guaranteed to be eqaul to or larger than the maximum contiguous 34 * guaranteed to be equal to or larger than the maximum contiguous
35 * area in the chunk. This helps the allocator not to iterate the 35 * area in the chunk. This helps the allocator not to iterate the
36 * chunk maps unnecessarily. 36 * chunk maps unnecessarily.
37 * 37 *
@@ -76,6 +76,7 @@
76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
78 78
79#ifdef CONFIG_SMP
79/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
80#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
81#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
@@ -89,6 +90,11 @@
89 (unsigned long)pcpu_base_addr - \ 90 (unsigned long)pcpu_base_addr - \
90 (unsigned long)__per_cpu_start) 91 (unsigned long)__per_cpu_start)
91#endif 92#endif
93#else /* CONFIG_SMP */
94/* on UP, it's always identity mapped */
95#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
96#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
97#endif /* CONFIG_SMP */
92 98
93struct pcpu_chunk { 99struct pcpu_chunk {
94 struct list_head list; /* linked to pcpu_slot lists */ 100 struct list_head list; /* linked to pcpu_slot lists */
@@ -282,6 +288,9 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
282 */ 288 */
283static void *pcpu_mem_alloc(size_t size) 289static void *pcpu_mem_alloc(size_t size)
284{ 290{
291 if (WARN_ON_ONCE(!slab_is_available()))
292 return NULL;
293
285 if (size <= PAGE_SIZE) 294 if (size <= PAGE_SIZE)
286 return kzalloc(size, GFP_KERNEL); 295 return kzalloc(size, GFP_KERNEL);
287 else { 296 else {
@@ -390,14 +399,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
390 goto out_unlock; 399 goto out_unlock;
391 400
392 old_size = chunk->map_alloc * sizeof(chunk->map[0]); 401 old_size = chunk->map_alloc * sizeof(chunk->map[0]);
393 memcpy(new, chunk->map, old_size); 402 old = chunk->map;
394 403
395 /* 404 memcpy(new, old, old_size);
396 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
397 * one of the first chunks and still using static map.
398 */
399 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
400 old = chunk->map;
401 405
402 chunk->map_alloc = new_alloc; 406 chunk->map_alloc = new_alloc;
403 chunk->map = new; 407 chunk->map = new;
@@ -604,7 +608,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
604{ 608{
605 struct pcpu_chunk *chunk; 609 struct pcpu_chunk *chunk;
606 610
607 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); 611 chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
608 if (!chunk) 612 if (!chunk)
609 return NULL; 613 return NULL;
610 614
@@ -822,8 +826,8 @@ fail_unlock_mutex:
822 * @size: size of area to allocate in bytes 826 * @size: size of area to allocate in bytes
823 * @align: alignment of area (max PAGE_SIZE) 827 * @align: alignment of area (max PAGE_SIZE)
824 * 828 *
825 * Allocate percpu area of @size bytes aligned at @align. Might 829 * Allocate zero-filled percpu area of @size bytes aligned at @align.
826 * sleep. Might trigger writeouts. 830 * Might sleep. Might trigger writeouts.
827 * 831 *
828 * CONTEXT: 832 * CONTEXT:
829 * Does GFP_KERNEL allocation. 833 * Does GFP_KERNEL allocation.
@@ -842,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
842 * @size: size of area to allocate in bytes 846 * @size: size of area to allocate in bytes
843 * @align: alignment of area (max PAGE_SIZE) 847 * @align: alignment of area (max PAGE_SIZE)
844 * 848 *
845 * Allocate percpu area of @size bytes aligned at @align from reserved 849 * Allocate zero-filled percpu area of @size bytes aligned at @align
846 * percpu area if arch has set it up; otherwise, allocation is served 850 * from reserved percpu area if arch has set it up; otherwise,
847 * from the same dynamic area. Might sleep. Might trigger writeouts. 851 * allocation is served from the same dynamic area. Might sleep.
852 * Might trigger writeouts.
848 * 853 *
849 * CONTEXT: 854 * CONTEXT:
850 * Does GFP_KERNEL allocation. 855 * Does GFP_KERNEL allocation.
@@ -951,6 +956,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
951 */ 956 */
952bool is_kernel_percpu_address(unsigned long addr) 957bool is_kernel_percpu_address(unsigned long addr)
953{ 958{
959#ifdef CONFIG_SMP
954 const size_t static_size = __per_cpu_end - __per_cpu_start; 960 const size_t static_size = __per_cpu_end - __per_cpu_start;
955 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 961 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
956 unsigned int cpu; 962 unsigned int cpu;
@@ -961,6 +967,8 @@ bool is_kernel_percpu_address(unsigned long addr)
961 if ((void *)addr >= start && (void *)addr < start + static_size) 967 if ((void *)addr >= start && (void *)addr < start + static_size)
962 return true; 968 return true;
963 } 969 }
970#endif
971 /* on UP, can't distinguish from other static vars, always false */
964 return false; 972 return false;
965} 973}
966 974
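is_kernel_percpu_address() keeps its range test only for SMP above, because on UP static percpu variables are ordinary statics and there is no separate per-CPU window to test. A user-space sketch of the SMP-side check; the offsets and sizes are invented.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

static char         arena[4 * 4096];           /* stands in for the percpu area     */
static const size_t static_size = 256;         /* __per_cpu_end - __per_cpu_start   */
static const size_t unit_off[NR_CPUS] = { 0, 4096, 8192, 12288 };

static bool is_percpu_address(uintptr_t addr)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		uintptr_t start = (uintptr_t)arena + unit_off[cpu];

		if (addr >= start && addr < start + static_size)
			return true;
	}
	return false;   /* on UP there is nothing to compare against */
}

int main(void)
{
	printf("%d\n", is_percpu_address((uintptr_t)&arena[4096 + 100])); /* 1 */
	printf("%d\n", is_percpu_address((uintptr_t)&arena[4096 + 300])); /* 0 */
	return 0;
}
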
@@ -1013,20 +1021,6 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1013 return page_to_phys(pcpu_addr_to_page(addr)); 1021 return page_to_phys(pcpu_addr_to_page(addr));
1014} 1022}
1015 1023
1016static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1017 size_t reserved_size,
1018 ssize_t *dyn_sizep)
1019{
1020 size_t size_sum;
1021
1022 size_sum = PFN_ALIGN(static_size + reserved_size +
1023 (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1024 if (*dyn_sizep != 0)
1025 *dyn_sizep = size_sum - static_size - reserved_size;
1026
1027 return size_sum;
1028}
1029
1030/** 1024/**
1031 * pcpu_alloc_alloc_info - allocate percpu allocation info 1025 * pcpu_alloc_alloc_info - allocate percpu allocation info
1032 * @nr_groups: the number of groups 1026 * @nr_groups: the number of groups
@@ -1083,157 +1077,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1083} 1077}
1084 1078
1085/** 1079/**
1086 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1087 * @reserved_size: the size of reserved percpu area in bytes
1088 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1089 * @atom_size: allocation atom size
1090 * @cpu_distance_fn: callback to determine distance between cpus, optional
1091 *
1092 * This function determines grouping of units, their mappings to cpus
1093 * and other parameters considering needed percpu size, allocation
1094 * atom size and distances between CPUs.
1095 *
1096 * Groups are always mutliples of atom size and CPUs which are of
1097 * LOCAL_DISTANCE both ways are grouped together and share space for
1098 * units in the same group. The returned configuration is guaranteed
1099 * to have CPUs on different nodes on different groups and >=75% usage
1100 * of allocated virtual address space.
1101 *
1102 * RETURNS:
1103 * On success, pointer to the new allocation_info is returned. On
1104 * failure, ERR_PTR value is returned.
1105 */
1106struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1107 size_t reserved_size, ssize_t dyn_size,
1108 size_t atom_size,
1109 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1110{
1111 static int group_map[NR_CPUS] __initdata;
1112 static int group_cnt[NR_CPUS] __initdata;
1113 const size_t static_size = __per_cpu_end - __per_cpu_start;
1114 int nr_groups = 1, nr_units = 0;
1115 size_t size_sum, min_unit_size, alloc_size;
1116 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1117 int last_allocs, group, unit;
1118 unsigned int cpu, tcpu;
1119 struct pcpu_alloc_info *ai;
1120 unsigned int *cpu_map;
1121
1122 /* this function may be called multiple times */
1123 memset(group_map, 0, sizeof(group_map));
1124 memset(group_cnt, 0, sizeof(group_cnt));
1125
1126 /*
1127 * Determine min_unit_size, alloc_size and max_upa such that
1128 * alloc_size is multiple of atom_size and is the smallest
1129 * which can accomodate 4k aligned segments which are equal to
1130 * or larger than min_unit_size.
1131 */
1132 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1133 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1134
1135 alloc_size = roundup(min_unit_size, atom_size);
1136 upa = alloc_size / min_unit_size;
1137 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1138 upa--;
1139 max_upa = upa;
1140
1141 /* group cpus according to their proximity */
1142 for_each_possible_cpu(cpu) {
1143 group = 0;
1144 next_group:
1145 for_each_possible_cpu(tcpu) {
1146 if (cpu == tcpu)
1147 break;
1148 if (group_map[tcpu] == group && cpu_distance_fn &&
1149 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1150 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1151 group++;
1152 nr_groups = max(nr_groups, group + 1);
1153 goto next_group;
1154 }
1155 }
1156 group_map[cpu] = group;
1157 group_cnt[group]++;
1158 }
1159
1160 /*
1161 * Expand unit size until address space usage goes over 75%
1162 * and then as much as possible without using more address
1163 * space.
1164 */
1165 last_allocs = INT_MAX;
1166 for (upa = max_upa; upa; upa--) {
1167 int allocs = 0, wasted = 0;
1168
1169 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1170 continue;
1171
1172 for (group = 0; group < nr_groups; group++) {
1173 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1174 allocs += this_allocs;
1175 wasted += this_allocs * upa - group_cnt[group];
1176 }
1177
1178 /*
1179 * Don't accept if wastage is over 25%. The
1180 * greater-than comparison ensures upa==1 always
1181 * passes the following check.
1182 */
1183 if (wasted > num_possible_cpus() / 3)
1184 continue;
1185
1186 /* and then don't consume more memory */
1187 if (allocs > last_allocs)
1188 break;
1189 last_allocs = allocs;
1190 best_upa = upa;
1191 }
1192 upa = best_upa;
1193
1194 /* allocate and fill alloc_info */
1195 for (group = 0; group < nr_groups; group++)
1196 nr_units += roundup(group_cnt[group], upa);
1197
1198 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1199 if (!ai)
1200 return ERR_PTR(-ENOMEM);
1201 cpu_map = ai->groups[0].cpu_map;
1202
1203 for (group = 0; group < nr_groups; group++) {
1204 ai->groups[group].cpu_map = cpu_map;
1205 cpu_map += roundup(group_cnt[group], upa);
1206 }
1207
1208 ai->static_size = static_size;
1209 ai->reserved_size = reserved_size;
1210 ai->dyn_size = dyn_size;
1211 ai->unit_size = alloc_size / upa;
1212 ai->atom_size = atom_size;
1213 ai->alloc_size = alloc_size;
1214
1215 for (group = 0, unit = 0; group_cnt[group]; group++) {
1216 struct pcpu_group_info *gi = &ai->groups[group];
1217
1218 /*
1219 * Initialize base_offset as if all groups are located
1220 * back-to-back. The caller should update this to
1221 * reflect actual allocation.
1222 */
1223 gi->base_offset = unit * ai->unit_size;
1224
1225 for_each_possible_cpu(cpu)
1226 if (group_map[cpu] == group)
1227 gi->cpu_map[gi->nr_units++] = cpu;
1228 gi->nr_units = roundup(gi->nr_units, upa);
1229 unit += gi->nr_units;
1230 }
1231 BUG_ON(unit != nr_units);
1232
1233 return ai;
1234}
1235
1236/**
1237 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info 1080 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1238 * @lvl: loglevel 1081 * @lvl: loglevel
1239 * @ai: allocation info to dump 1082 * @ai: allocation info to dump
@@ -1350,7 +1193,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1350 void *base_addr) 1193 void *base_addr)
1351{ 1194{
1352 static char cpus_buf[4096] __initdata; 1195 static char cpus_buf[4096] __initdata;
1353 static int smap[2], dmap[2]; 1196 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1197 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1354 size_t dyn_size = ai->dyn_size; 1198 size_t dyn_size = ai->dyn_size;
1355 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; 1199 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1356 struct pcpu_chunk *schunk, *dchunk = NULL; 1200 struct pcpu_chunk *schunk, *dchunk = NULL;
@@ -1373,14 +1217,15 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1373} while (0) 1217} while (0)
1374 1218
1375 /* sanity checks */ 1219 /* sanity checks */
1376 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1377 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1378 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1220 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1221#ifdef CONFIG_SMP
1379 PCPU_SETUP_BUG_ON(!ai->static_size); 1222 PCPU_SETUP_BUG_ON(!ai->static_size);
1223#endif
1380 PCPU_SETUP_BUG_ON(!base_addr); 1224 PCPU_SETUP_BUG_ON(!base_addr);
1381 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1225 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1382 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1226 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1383 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1227 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1228 PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
1384 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); 1229 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1385 1230
1386 /* process group information and build config tables accordingly */ 1231 /* process group information and build config tables accordingly */
@@ -1413,9 +1258,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1413 1258
1414 if (pcpu_first_unit_cpu == NR_CPUS) 1259 if (pcpu_first_unit_cpu == NR_CPUS)
1415 pcpu_first_unit_cpu = cpu; 1260 pcpu_first_unit_cpu = cpu;
1261 pcpu_last_unit_cpu = cpu;
1416 } 1262 }
1417 } 1263 }
1418 pcpu_last_unit_cpu = cpu;
1419 pcpu_nr_units = unit; 1264 pcpu_nr_units = unit;
1420 1265
1421 for_each_possible_cpu(cpu) 1266 for_each_possible_cpu(cpu)
@@ -1500,6 +1345,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1500 return 0; 1345 return 0;
1501} 1346}
1502 1347
1348#ifdef CONFIG_SMP
1349
1503const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1350const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1504 [PCPU_FC_AUTO] = "auto", 1351 [PCPU_FC_AUTO] = "auto",
1505 [PCPU_FC_EMBED] = "embed", 1352 [PCPU_FC_EMBED] = "embed",
@@ -1527,12 +1374,184 @@ static int __init percpu_alloc_setup(char *str)
1527} 1374}
1528early_param("percpu_alloc", percpu_alloc_setup); 1375early_param("percpu_alloc", percpu_alloc_setup);
1529 1376
1377/*
1378 * pcpu_embed_first_chunk() is used by the generic percpu setup.
1379 * Build it if needed by the arch config or the generic setup is going
1380 * to be used.
1381 */
1530#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ 1382#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1531 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) 1383 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1384#define BUILD_EMBED_FIRST_CHUNK
1385#endif
1386
1387/* build pcpu_page_first_chunk() iff needed by the arch config */
1388#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1389#define BUILD_PAGE_FIRST_CHUNK
1390#endif
1391
1392/* pcpu_build_alloc_info() is used by both embed and page first chunk */
1393#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
1394/**
1395 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1396 * @reserved_size: the size of reserved percpu area in bytes
1397 * @dyn_size: minimum free size for dynamic allocation in bytes
1398 * @atom_size: allocation atom size
1399 * @cpu_distance_fn: callback to determine distance between cpus, optional
1400 *
1401 * This function determines grouping of units, their mappings to cpus
1402 * and other parameters considering needed percpu size, allocation
1403 * atom size and distances between CPUs.
1404 *
1406 * Groups are always multiples of atom size and CPUs which are of
1406 * LOCAL_DISTANCE both ways are grouped together and share space for
1407 * units in the same group. The returned configuration is guaranteed
1408 * to have CPUs on different nodes on different groups and >=75% usage
1409 * of allocated virtual address space.
1410 *
1411 * RETURNS:
1412 * On success, pointer to the new allocation_info is returned. On
1413 * failure, ERR_PTR value is returned.
1414 */
1415static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1416 size_t reserved_size, size_t dyn_size,
1417 size_t atom_size,
1418 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1419{
1420 static int group_map[NR_CPUS] __initdata;
1421 static int group_cnt[NR_CPUS] __initdata;
1422 const size_t static_size = __per_cpu_end - __per_cpu_start;
1423 int nr_groups = 1, nr_units = 0;
1424 size_t size_sum, min_unit_size, alloc_size;
1425 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1426 int last_allocs, group, unit;
1427 unsigned int cpu, tcpu;
1428 struct pcpu_alloc_info *ai;
1429 unsigned int *cpu_map;
1430
1431 /* this function may be called multiple times */
1432 memset(group_map, 0, sizeof(group_map));
1433 memset(group_cnt, 0, sizeof(group_cnt));
1434
1435 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1436 size_sum = PFN_ALIGN(static_size + reserved_size +
1437 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1438 dyn_size = size_sum - static_size - reserved_size;
1439
1440 /*
1441 * Determine min_unit_size, alloc_size and max_upa such that
1442 * alloc_size is multiple of atom_size and is the smallest
1443 * which can accommodate 4k aligned segments which are equal to
1444 * or larger than min_unit_size.
1445 */
1446 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1447
1448 alloc_size = roundup(min_unit_size, atom_size);
1449 upa = alloc_size / min_unit_size;
1450 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1451 upa--;
1452 max_upa = upa;
1453
1454 /* group cpus according to their proximity */
1455 for_each_possible_cpu(cpu) {
1456 group = 0;
1457 next_group:
1458 for_each_possible_cpu(tcpu) {
1459 if (cpu == tcpu)
1460 break;
1461 if (group_map[tcpu] == group && cpu_distance_fn &&
1462 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1463 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1464 group++;
1465 nr_groups = max(nr_groups, group + 1);
1466 goto next_group;
1467 }
1468 }
1469 group_map[cpu] = group;
1470 group_cnt[group]++;
1471 }
1472
1473 /*
1474 * Expand unit size until address space usage goes over 75%
1475 * and then as much as possible without using more address
1476 * space.
1477 */
1478 last_allocs = INT_MAX;
1479 for (upa = max_upa; upa; upa--) {
1480 int allocs = 0, wasted = 0;
1481
1482 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1483 continue;
1484
1485 for (group = 0; group < nr_groups; group++) {
1486 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1487 allocs += this_allocs;
1488 wasted += this_allocs * upa - group_cnt[group];
1489 }
1490
1491 /*
1492 * Don't accept if wastage is over 1/3. The
1493 * greater-than comparison ensures upa==1 always
1494 * passes the following check.
1495 */
1496 if (wasted > num_possible_cpus() / 3)
1497 continue;
1498
1499 /* and then don't consume more memory */
1500 if (allocs > last_allocs)
1501 break;
1502 last_allocs = allocs;
1503 best_upa = upa;
1504 }
1505 upa = best_upa;
1506
1507 /* allocate and fill alloc_info */
1508 for (group = 0; group < nr_groups; group++)
1509 nr_units += roundup(group_cnt[group], upa);
1510
1511 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1512 if (!ai)
1513 return ERR_PTR(-ENOMEM);
1514 cpu_map = ai->groups[0].cpu_map;
1515
1516 for (group = 0; group < nr_groups; group++) {
1517 ai->groups[group].cpu_map = cpu_map;
1518 cpu_map += roundup(group_cnt[group], upa);
1519 }
1520
1521 ai->static_size = static_size;
1522 ai->reserved_size = reserved_size;
1523 ai->dyn_size = dyn_size;
1524 ai->unit_size = alloc_size / upa;
1525 ai->atom_size = atom_size;
1526 ai->alloc_size = alloc_size;
1527
1528 for (group = 0, unit = 0; group_cnt[group]; group++) {
1529 struct pcpu_group_info *gi = &ai->groups[group];
1530
1531 /*
1532 * Initialize base_offset as if all groups are located
1533 * back-to-back. The caller should update this to
1534 * reflect actual allocation.
1535 */
1536 gi->base_offset = unit * ai->unit_size;
1537
1538 for_each_possible_cpu(cpu)
1539 if (group_map[cpu] == group)
1540 gi->cpu_map[gi->nr_units++] = cpu;
1541 gi->nr_units = roundup(gi->nr_units, upa);
1542 unit += gi->nr_units;
1543 }
1544 BUG_ON(unit != nr_units);
1545
1546 return ai;
1547}
1548#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
1549
1550#if defined(BUILD_EMBED_FIRST_CHUNK)
1532/** 1551/**
1533 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1552 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1534 * @reserved_size: the size of reserved percpu area in bytes 1553 * @reserved_size: the size of reserved percpu area in bytes
1535 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1554 * @dyn_size: minimum free size for dynamic allocation in bytes
1536 * @atom_size: allocation atom size 1555 * @atom_size: allocation atom size
1537 * @cpu_distance_fn: callback to determine distance between cpus, optional 1556 * @cpu_distance_fn: callback to determine distance between cpus, optional
1538 * @alloc_fn: function to allocate percpu page 1557 * @alloc_fn: function to allocate percpu page
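pcpu_build_alloc_info(), moved above and now built only when an embed or page first chunk is needed, picks the number of units packed into one allocation (upa) by starting from the largest page-aligned candidate, rejecting any that would waste more than a third of the possible CPUs, and then shrinking upa only while the allocation count does not grow. Below is a stand-alone walk-through of that selection; the sizes and the two NUMA groups are made up for the example.

#include <limits.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long min_unit_size = 20 * 1024;         /* static + reserved + dyn */
	unsigned long atom_size     = 2UL * 1024 * 1024; /* e.g. one 2M mapping     */
	int group_cnt[] = { 6, 2 };                      /* CPUs per NUMA group     */
	int nr_groups = 2, nr_cpus = 8;

	/* smallest atom_size multiple that holds one min_unit_size unit */
	unsigned long alloc_size =
		((min_unit_size + atom_size - 1) / atom_size) * atom_size;

	int upa = alloc_size / min_unit_size;
	while (alloc_size % upa || (alloc_size / upa) % PAGE_SIZE)
		upa--;
	int max_upa = upa;

	int best_upa = -1, last_allocs = INT_MAX;
	for (upa = max_upa; upa; upa--) {
		if (alloc_size % upa || (alloc_size / upa) % PAGE_SIZE)
			continue;

		int allocs = 0, wasted = 0;
		for (int g = 0; g < nr_groups; g++) {
			int this_allocs = (group_cnt[g] + upa - 1) / upa;
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[g];
		}
		if (wasted > nr_cpus / 3)
			continue;            /* wastes more than 1/3 of the CPUs */
		if (allocs > last_allocs)
			break;               /* would need more allocations */
		last_allocs = allocs;
		best_upa = upa;
	}

	printf("max_upa=%d best_upa=%d unit_size=%lu\n",
	       max_upa, best_upa, alloc_size / best_upa);
	return 0;
}
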
@@ -1553,10 +1572,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
1553 * vmalloc space is not orders of magnitude larger than distances 1572 * vmalloc space is not orders of magnitude larger than distances
1554 * between node memory addresses (ie. 32bit NUMA machines). 1573 * between node memory addresses (ie. 32bit NUMA machines).
1555 * 1574 *
1556 * When @dyn_size is positive, dynamic area might be larger than 1575 * @dyn_size specifies the minimum dynamic area size.
1557 * specified to fill page alignment. When @dyn_size is auto,
1558 * @dyn_size is just big enough to fill page alignment after static
1559 * and reserved areas.
1560 * 1576 *
1561 * If the needed size is smaller than the minimum or specified unit 1577 * If the needed size is smaller than the minimum or specified unit
1562 * size, the leftover is returned using @free_fn. 1578 * size, the leftover is returned using @free_fn.
@@ -1564,7 +1580,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
1564 * RETURNS: 1580 * RETURNS:
1565 * 0 on success, -errno on failure. 1581 * 0 on success, -errno on failure.
1566 */ 1582 */
1567int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, 1583int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1568 size_t atom_size, 1584 size_t atom_size,
1569 pcpu_fc_cpu_distance_fn_t cpu_distance_fn, 1585 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1570 pcpu_fc_alloc_fn_t alloc_fn, 1586 pcpu_fc_alloc_fn_t alloc_fn,
@@ -1660,10 +1676,9 @@ out_free:
1660 free_bootmem(__pa(areas), areas_size); 1676 free_bootmem(__pa(areas), areas_size);
1661 return rc; 1677 return rc;
1662} 1678}
1663#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || 1679#endif /* BUILD_EMBED_FIRST_CHUNK */
1664 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
1665 1680
1666#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1681#ifdef BUILD_PAGE_FIRST_CHUNK
1667/** 1682/**
1668 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1683 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1669 * @reserved_size: the size of reserved percpu area in bytes 1684 * @reserved_size: the size of reserved percpu area in bytes
@@ -1695,7 +1710,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
1695 1710
1696 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); 1711 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
1697 1712
1698 ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); 1713 ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
1699 if (IS_ERR(ai)) 1714 if (IS_ERR(ai))
1700 return PTR_ERR(ai); 1715 return PTR_ERR(ai);
1701 BUG_ON(ai->nr_groups != 1); 1716 BUG_ON(ai->nr_groups != 1);
@@ -1771,10 +1786,11 @@ out_free_ar:
1771 pcpu_free_alloc_info(ai); 1786 pcpu_free_alloc_info(ai);
1772 return rc; 1787 return rc;
1773} 1788}
1774#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ 1789#endif /* BUILD_PAGE_FIRST_CHUNK */
1775 1790
1791#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1776/* 1792/*
1777 * Generic percpu area setup. 1793 * Generic SMP percpu area setup.
1778 * 1794 *
1779 * The embedding helper is used because its behavior closely resembles 1795 * The embedding helper is used because its behavior closely resembles
1780 * the original non-dynamic generic percpu area setup. This is 1796 * the original non-dynamic generic percpu area setup. This is
@@ -1785,7 +1801,6 @@ out_free_ar:
1785 * on the physical linear memory mapping which uses large page 1801 * on the physical linear memory mapping which uses large page
1786 * mappings on applicable archs. 1802 * mappings on applicable archs.
1787 */ 1803 */
1788#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1789unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 1804unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1790EXPORT_SYMBOL(__per_cpu_offset); 1805EXPORT_SYMBOL(__per_cpu_offset);
1791 1806
@@ -1814,10 +1829,75 @@ void __init setup_per_cpu_areas(void)
1814 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, 1829 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
1815 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); 1830 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
1816 if (rc < 0) 1831 if (rc < 0)
1817 panic("Failed to initialized percpu areas."); 1832 panic("Failed to initialize percpu areas.");
1818 1833
1819 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 1834 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1820 for_each_possible_cpu(cpu) 1835 for_each_possible_cpu(cpu)
1821 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; 1836 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1822} 1837}
1823#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ 1838#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
1839
1840#else /* CONFIG_SMP */
1841
1842/*
1843 * UP percpu area setup.
1844 *
1845 * UP always uses km-based percpu allocator with identity mapping.
1846 * Static percpu variables are indistinguishable from the usual static
1847 * variables and don't require any special preparation.
1848 */
1849void __init setup_per_cpu_areas(void)
1850{
1851 const size_t unit_size =
1852 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
1853 PERCPU_DYNAMIC_RESERVE));
1854 struct pcpu_alloc_info *ai;
1855 void *fc;
1856
1857 ai = pcpu_alloc_alloc_info(1, 1);
1858 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1859 if (!ai || !fc)
1860 panic("Failed to allocate memory for percpu areas.");
1861
1862 ai->dyn_size = unit_size;
1863 ai->unit_size = unit_size;
1864 ai->atom_size = unit_size;
1865 ai->alloc_size = unit_size;
1866 ai->groups[0].nr_units = 1;
1867 ai->groups[0].cpu_map[0] = 0;
1868
1869 if (pcpu_setup_first_chunk(ai, fc) < 0)
1870 panic("Failed to initialize percpu areas.");
1871}
1872
1873#endif /* CONFIG_SMP */
1874
1875/*
1876 * First and reserved chunks are initialized with temporary allocation
1877 * map in initdata so that they can be used before slab is online.
1878 * This function is called after slab is brought up and replaces those
1879 * with properly allocated maps.
1880 */
1881void __init percpu_init_late(void)
1882{
1883 struct pcpu_chunk *target_chunks[] =
1884 { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
1885 struct pcpu_chunk *chunk;
1886 unsigned long flags;
1887 int i;
1888
1889 for (i = 0; (chunk = target_chunks[i]); i++) {
1890 int *map;
1891 const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
1892
1893 BUILD_BUG_ON(size > PAGE_SIZE);
1894
1895 map = pcpu_mem_alloc(size);
1896 BUG_ON(!map);
1897
1898 spin_lock_irqsave(&pcpu_lock, flags);
1899 memcpy(map, chunk->map, size);
1900 chunk->map = map;
1901 spin_unlock_irqrestore(&pcpu_lock, flags);
1902 }
1903}
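percpu_init_late() above finishes the bootstrap: the first (and reserved) chunk starts out with PERCPU_DYNAMIC_EARLY_SLOTS-sized maps in initdata, and once slab is up each map is copied into a properly allocated buffer and swapped in under pcpu_lock. A user-space sketch of that swap pattern; the chunk structure and slot count here are invented.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define EARLY_SLOTS 8

struct chunk {
	int *map;
	int  map_alloc;
};

static int early_map[EARLY_SLOTS] = { 64, -32, 128 };  /* pretend bootstrap state */
static struct chunk first_chunk = { .map = early_map, .map_alloc = EARLY_SLOTS };
static pthread_mutex_t pcpu_lock = PTHREAD_MUTEX_INITIALIZER;

static void init_late(struct chunk *chunk)
{
	int *map = malloc(EARLY_SLOTS * sizeof(*map));
	if (!map)
		abort();

	pthread_mutex_lock(&pcpu_lock);
	memcpy(map, chunk->map, EARLY_SLOTS * sizeof(*map));
	chunk->map = map;               /* the static early map is abandoned */
	pthread_mutex_unlock(&pcpu_lock);
}

int main(void)
{
	init_late(&first_chunk);
	printf("map now on the heap: %d %d %d\n",
	       first_chunk.map[0], first_chunk.map[1], first_chunk.map[2]);
	free(first_chunk.map);
	return 0;
}
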
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
deleted file mode 100644
index c4351c7f57d..00000000000
--- a/mm/percpu_up.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(p);
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea..1a8bf76bfd0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/memcontrol.h> 56#include <linux/memcontrol.h>
57#include <linux/mmu_notifier.h> 57#include <linux/mmu_notifier.h>
58#include <linux/migrate.h> 58#include <linux/migrate.h>
59#include <linux/hugetlb.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -79,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
79 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
80} 81}
81 82
82void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 83static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
83{ 84{
84 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
85} 86}
@@ -132,9 +133,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
132 if (unlikely(!anon_vma)) 133 if (unlikely(!anon_vma))
133 goto out_enomem_free_avc; 134 goto out_enomem_free_avc;
134 allocated = anon_vma; 135 allocated = anon_vma;
136 /*
137 * This VMA had no anon_vma yet. This anon_vma is
138 * the root of any anon_vma tree that might form.
139 */
140 anon_vma->root = anon_vma;
135 } 141 }
136 142
137 spin_lock(&anon_vma->lock); 143 anon_vma_lock(anon_vma);
138 /* page_table_lock to protect against threads */ 144 /* page_table_lock to protect against threads */
139 spin_lock(&mm->page_table_lock); 145 spin_lock(&mm->page_table_lock);
140 if (likely(!vma->anon_vma)) { 146 if (likely(!vma->anon_vma)) {
@@ -142,12 +148,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
142 avc->anon_vma = anon_vma; 148 avc->anon_vma = anon_vma;
143 avc->vma = vma; 149 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain); 150 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head); 151 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
146 allocated = NULL; 152 allocated = NULL;
147 avc = NULL; 153 avc = NULL;
148 } 154 }
149 spin_unlock(&mm->page_table_lock); 155 spin_unlock(&mm->page_table_lock);
150 spin_unlock(&anon_vma->lock); 156 anon_vma_unlock(anon_vma);
151 157
152 if (unlikely(allocated)) 158 if (unlikely(allocated))
153 anon_vma_free(allocated); 159 anon_vma_free(allocated);
@@ -170,9 +176,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
170 avc->anon_vma = anon_vma; 176 avc->anon_vma = anon_vma;
171 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
172 178
173 spin_lock(&anon_vma->lock); 179 anon_vma_lock(anon_vma);
174 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 180 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
175 spin_unlock(&anon_vma->lock); 181 anon_vma_unlock(anon_vma);
176} 182}
177 183
178/* 184/*
@@ -224,9 +230,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
224 avc = anon_vma_chain_alloc(); 230 avc = anon_vma_chain_alloc();
225 if (!avc) 231 if (!avc)
226 goto out_error_free_anon_vma; 232 goto out_error_free_anon_vma;
227 anon_vma_chain_link(vma, avc, anon_vma); 233
234 /*
235 * The root anon_vma's spinlock is the lock actually used when we
236 * lock any of the anon_vmas in this anon_vma tree.
237 */
238 anon_vma->root = pvma->anon_vma->root;
239 /*
240 * With KSM refcounts, an anon_vma can stay around longer than the
241 * process it belongs to. The root anon_vma needs to be pinned
242 * until this anon_vma is freed, because the lock lives in the root.
243 */
244 get_anon_vma(anon_vma->root);
228 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 245 /* Mark this anon_vma as the one where our new (COWed) pages go. */
229 vma->anon_vma = anon_vma; 246 vma->anon_vma = anon_vma;
247 anon_vma_chain_link(vma, avc, anon_vma);
230 248
231 return 0; 249 return 0;
232 250
@@ -246,22 +264,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
246 if (!anon_vma) 264 if (!anon_vma)
247 return; 265 return;
248 266
249 spin_lock(&anon_vma->lock); 267 anon_vma_lock(anon_vma);
250 list_del(&anon_vma_chain->same_anon_vma); 268 list_del(&anon_vma_chain->same_anon_vma);
251 269
252 /* We must garbage collect the anon_vma if it's empty */ 270 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 271 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 272 anon_vma_unlock(anon_vma);
255 273
256 if (empty) 274 if (empty) {
275 /* We no longer need the root anon_vma */
276 if (anon_vma->root != anon_vma)
277 drop_anon_vma(anon_vma->root);
257 anon_vma_free(anon_vma); 278 anon_vma_free(anon_vma);
279 }
258} 280}
259 281
260void unlink_anon_vmas(struct vm_area_struct *vma) 282void unlink_anon_vmas(struct vm_area_struct *vma)
261{ 283{
262 struct anon_vma_chain *avc, *next; 284 struct anon_vma_chain *avc, *next;
263 285
264 /* Unlink each anon_vma chained to the VMA. */ 286 /*
287 * Unlink each anon_vma chained to the VMA. This list is ordered
288 * from newest to oldest, ensuring the root anon_vma gets freed last.
289 */
265 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 290 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
266 anon_vma_unlink(avc); 291 anon_vma_unlink(avc);
267 list_del(&avc->same_vma); 292 list_del(&avc->same_vma);
@@ -289,9 +314,9 @@ void __init anon_vma_init(void)
289 * Getting a lock on a stable anon_vma from a page off the LRU is 314 * Getting a lock on a stable anon_vma from a page off the LRU is
290 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 315 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
291 */ 316 */
292struct anon_vma *page_lock_anon_vma(struct page *page) 317struct anon_vma *__page_lock_anon_vma(struct page *page)
293{ 318{
294 struct anon_vma *anon_vma; 319 struct anon_vma *anon_vma, *root_anon_vma;
295 unsigned long anon_mapping; 320 unsigned long anon_mapping;
296 321
297 rcu_read_lock(); 322 rcu_read_lock();
@@ -302,16 +327,31 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
302 goto out; 327 goto out;
303 328
304 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 329 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
305 spin_lock(&anon_vma->lock); 330 root_anon_vma = ACCESS_ONCE(anon_vma->root);
306 return anon_vma; 331 spin_lock(&root_anon_vma->lock);
332
333 /*
334 * If this page is still mapped, then its anon_vma cannot have been
335 * freed. But if it has been unmapped, we have no security against
336 * the anon_vma structure being freed and reused (for another anon_vma:
337 * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
338 * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
339 * anon_vma->root before page_unlock_anon_vma() is called to unlock.
340 */
341 if (page_mapped(page))
342 return anon_vma;
343
344 spin_unlock(&root_anon_vma->lock);
307out: 345out:
308 rcu_read_unlock(); 346 rcu_read_unlock();
309 return NULL; 347 return NULL;
310} 348}
311 349
312void page_unlock_anon_vma(struct anon_vma *anon_vma) 350void page_unlock_anon_vma(struct anon_vma *anon_vma)
351 __releases(&anon_vma->root->lock)
352 __releases(RCU)
313{ 353{
314 spin_unlock(&anon_vma->lock); 354 anon_vma_unlock(anon_vma);
315 rcu_read_unlock(); 355 rcu_read_unlock();
316} 356}
317 357
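The rmap changes above all follow one convention: an anon_vma now records the root of its tree, and anon_vma_lock()/anon_vma_unlock() always take the root's spinlock, which is why page_lock_anon_vma() grabs root_anon_vma->lock and why anon_vma_fork() pins the root. A user-space sketch of that "lock lives in the root" pattern, with invented structure names.

#include <pthread.h>
#include <stdio.h>

struct avma {
	struct avma    *root;   /* self for the root, else the tree root */
	pthread_mutex_t lock;   /* only the root's lock is ever taken    */
};

static struct avma root  = { .root = &root, .lock = PTHREAD_MUTEX_INITIALIZER };
static struct avma child = { .root = &root };   /* as anon_vma_fork() sets ->root */

static void avma_lock(struct avma *a)   { pthread_mutex_lock(&a->root->lock); }
static void avma_unlock(struct avma *a) { pthread_mutex_unlock(&a->root->lock); }

int main(void)
{
	avma_lock(&child);      /* actually serializes on root.lock */
	puts("holding the root lock via the child");
	avma_unlock(&child);
	return 0;
}
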
@@ -326,6 +366,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
326 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 366 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
327 unsigned long address; 367 unsigned long address;
328 368
369 if (unlikely(is_vm_hugetlb_page(vma)))
370 pgoff = page->index << huge_page_order(page_hstate(page));
329 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 371 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
330 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 372 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
331 /* page should be within @vma mapping range */ 373 /* page should be within @vma mapping range */
@@ -340,9 +382,16 @@ vma_address(struct page *page, struct vm_area_struct *vma)
340 */ 382 */
341unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 383unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
342{ 384{
343 if (PageAnon(page)) 385 if (PageAnon(page)) {
344 ; 386 struct anon_vma *page__anon_vma = page_anon_vma(page);
345 else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 387 /*
388 * Note: swapoff's unuse_vma() is more efficient with this
389 * check, and needs it to match anon_vma when KSM is active.
390 */
391 if (!vma->anon_vma || !page__anon_vma ||
392 vma->anon_vma->root != page__anon_vma->root)
393 return -EFAULT;
394 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
346 if (!vma->vm_file || 395 if (!vma->vm_file ||
347 vma->vm_file->f_mapping != page->mapping) 396 vma->vm_file->f_mapping != page->mapping)
348 return -EFAULT; 397 return -EFAULT;
@@ -360,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
360 * 409 *
361 * On success returns with pte mapped and locked. 410 * On success returns with pte mapped and locked.
362 */ 411 */
363pte_t *page_check_address(struct page *page, struct mm_struct *mm, 412pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
364 unsigned long address, spinlock_t **ptlp, int sync) 413 unsigned long address, spinlock_t **ptlp, int sync)
365{ 414{
366 pgd_t *pgd; 415 pgd_t *pgd;
@@ -369,6 +418,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
369 pte_t *pte; 418 pte_t *pte;
370 spinlock_t *ptl; 419 spinlock_t *ptl;
371 420
421 if (unlikely(PageHuge(page))) {
422 pte = huge_pte_offset(mm, address);
423 ptl = &mm->page_table_lock;
424 goto check;
425 }
426
372 pgd = pgd_offset(mm, address); 427 pgd = pgd_offset(mm, address);
373 if (!pgd_present(*pgd)) 428 if (!pgd_present(*pgd))
374 return NULL; 429 return NULL;
@@ -389,6 +444,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
389 } 444 }
390 445
391 ptl = pte_lockptr(mm, pmd); 446 ptl = pte_lockptr(mm, pmd);
447check:
392 spin_lock(ptl); 448 spin_lock(ptl);
393 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 449 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
394 *ptlp = ptl; 450 *ptlp = ptl;
@@ -691,7 +747,7 @@ int page_mkclean(struct page *page)
691 if (mapping) { 747 if (mapping) {
692 ret = page_mkclean_file(mapping, page); 748 ret = page_mkclean_file(mapping, page);
693 if (page_test_dirty(page)) { 749 if (page_test_dirty(page)) {
694 page_clear_dirty(page); 750 page_clear_dirty(page, 1);
695 ret = 1; 751 ret = 1;
696 } 752 }
697 } 753 }
@@ -726,10 +782,10 @@ void page_move_anon_rmap(struct page *page,
726} 782}
727 783
728/** 784/**
729 * __page_set_anon_rmap - setup new anonymous rmap 785 * __page_set_anon_rmap - set up new anonymous rmap
730 * @page: the page to add the mapping to 786 * @page: Page to add to rmap
731 * @vma: the vm area in which the mapping is added 787 * @vma: VM area to add page to.
732 * @address: the user virtual address mapped 788 * @address: User virtual address of the mapping
733 * @exclusive: the page is exclusively owned by the current process 789 * @exclusive: the page is exclusively owned by the current process
734 */ 790 */
735static void __page_set_anon_rmap(struct page *page, 791static void __page_set_anon_rmap(struct page *page,
@@ -739,19 +795,16 @@ static void __page_set_anon_rmap(struct page *page,
739 795
740 BUG_ON(!anon_vma); 796 BUG_ON(!anon_vma);
741 797
798 if (PageAnon(page))
799 return;
800
742 /* 801 /*
743 * If the page isn't exclusively mapped into this vma, 802 * If the page isn't exclusively mapped into this vma,
744 * we must use the _oldest_ possible anon_vma for the 803 * we must use the _oldest_ possible anon_vma for the
745 * page mapping! 804 * page mapping!
746 *
747 * So take the last AVC chain entry in the vma, which is
748 * the deepest ancestor, and use the anon_vma from that.
749 */ 805 */
750 if (!exclusive) { 806 if (!exclusive)
751 struct anon_vma_chain *avc; 807 anon_vma = anon_vma->root;
752 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
753 anon_vma = avc->anon_vma;
754 }
755 808
756 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 809 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
757 page->mapping = (struct address_space *) anon_vma; 810 page->mapping = (struct address_space *) anon_vma;
@@ -780,6 +833,7 @@ static void __page_check_anon_rmap(struct page *page,
780 * are initially only visible via the pagetables, and the pte is locked 833 * are initially only visible via the pagetables, and the pte is locked
781 * over the call to page_add_new_anon_rmap. 834 * over the call to page_add_new_anon_rmap.
782 */ 835 */
836 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
783 BUG_ON(page->index != linear_page_index(vma, address)); 837 BUG_ON(page->index != linear_page_index(vma, address));
784#endif 838#endif
785} 839}
@@ -798,6 +852,17 @@ static void __page_check_anon_rmap(struct page *page,
798void page_add_anon_rmap(struct page *page, 852void page_add_anon_rmap(struct page *page,
799 struct vm_area_struct *vma, unsigned long address) 853 struct vm_area_struct *vma, unsigned long address)
800{ 854{
855 do_page_add_anon_rmap(page, vma, address, 0);
856}
857
858/*
859 * Special version of the above for do_swap_page, which often runs
860 * into pages that are exclusively owned by the current process.
861 * Everybody else should continue to use page_add_anon_rmap above.
862 */
863void do_page_add_anon_rmap(struct page *page,
864 struct vm_area_struct *vma, unsigned long address, int exclusive)
865{
801 int first = atomic_inc_and_test(&page->_mapcount); 866 int first = atomic_inc_and_test(&page->_mapcount);
802 if (first) 867 if (first)
803 __inc_zone_page_state(page, NR_ANON_PAGES); 868 __inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +872,7 @@ void page_add_anon_rmap(struct page *page,
807 VM_BUG_ON(!PageLocked(page)); 872 VM_BUG_ON(!PageLocked(page));
808 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 873 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
809 if (first) 874 if (first)
810 __page_set_anon_rmap(page, vma, address, 0); 875 __page_set_anon_rmap(page, vma, address, exclusive);
811 else 876 else
812 __page_check_anon_rmap(page, vma, address); 877 __page_check_anon_rmap(page, vma, address);
813} 878}
@@ -870,9 +935,15 @@ void page_remove_rmap(struct page *page)
870 * containing the swap entry, but page not yet written to swap. 935 * containing the swap entry, but page not yet written to swap.
871 */ 936 */
872 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 937 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
873 page_clear_dirty(page); 938 page_clear_dirty(page, 1);
874 set_page_dirty(page); 939 set_page_dirty(page);
875 } 940 }
941 /*
942 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
943 * and not charged by memcg for now.
944 */
945 if (unlikely(PageHuge(page)))
946 return;
876 if (PageAnon(page)) { 947 if (PageAnon(page)) {
877 mem_cgroup_uncharge_page(page); 948 mem_cgroup_uncharge_page(page);
878 __dec_zone_page_state(page, NR_ANON_PAGES); 949 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1368,6 +1439,42 @@ int try_to_munlock(struct page *page)
1368 return try_to_unmap_file(page, TTU_MUNLOCK); 1439 return try_to_unmap_file(page, TTU_MUNLOCK);
1369} 1440}
1370 1441
1442#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
1443/*
1444 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1445 * if necessary. Be careful to do all the tests under the lock. Once
1446 * we know we are the last user, nobody else can get a reference and we
1447 * can do the freeing without the lock.
1448 */
1449void drop_anon_vma(struct anon_vma *anon_vma)
1450{
1451 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
1452 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1453 struct anon_vma *root = anon_vma->root;
1454 int empty = list_empty(&anon_vma->head);
1455 int last_root_user = 0;
1456 int root_empty = 0;
1457
1458 /*
1459 * The refcount on a non-root anon_vma got dropped. Drop
1460 * the refcount on the root and check if we need to free it.
1461 */
1462 if (empty && anon_vma != root) {
1463 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1464 last_root_user = atomic_dec_and_test(&root->external_refcount);
1465 root_empty = list_empty(&root->head);
1466 }
1467 anon_vma_unlock(anon_vma);
1468
1469 if (empty) {
1470 anon_vma_free(anon_vma);
1471 if (root_empty && last_root_user)
1472 anon_vma_free(root);
1473 }
1474 }
1475}
1476#endif
1477
1371#ifdef CONFIG_MIGRATION 1478#ifdef CONFIG_MIGRATION
1372/* 1479/*
1373 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): 1480 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1389,7 +1496,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1389 anon_vma = page_anon_vma(page); 1496 anon_vma = page_anon_vma(page);
1390 if (!anon_vma) 1497 if (!anon_vma)
1391 return ret; 1498 return ret;
1392 spin_lock(&anon_vma->lock); 1499 anon_vma_lock(anon_vma);
1393 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1500 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1394 struct vm_area_struct *vma = avc->vma; 1501 struct vm_area_struct *vma = avc->vma;
1395 unsigned long address = vma_address(page, vma); 1502 unsigned long address = vma_address(page, vma);
@@ -1399,7 +1506,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1399 if (ret != SWAP_AGAIN) 1506 if (ret != SWAP_AGAIN)
1400 break; 1507 break;
1401 } 1508 }
1402 spin_unlock(&anon_vma->lock); 1509 anon_vma_unlock(anon_vma);
1403 return ret; 1510 return ret;
1404} 1511}
1405 1512
@@ -1445,3 +1552,49 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1445 return rmap_walk_file(page, rmap_one, arg); 1552 return rmap_walk_file(page, rmap_one, arg);
1446} 1553}
1447#endif /* CONFIG_MIGRATION */ 1554#endif /* CONFIG_MIGRATION */
1555
1556#ifdef CONFIG_HUGETLB_PAGE
1557/*
1558 * The following three functions are for anonymous (private mapped) hugepages.
1559 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1560 * and no lru code, because we handle hugepages differently from common pages.
1561 */
1562static void __hugepage_set_anon_rmap(struct page *page,
1563 struct vm_area_struct *vma, unsigned long address, int exclusive)
1564{
1565 struct anon_vma *anon_vma = vma->anon_vma;
1566
1567 BUG_ON(!anon_vma);
1568
1569 if (PageAnon(page))
1570 return;
1571 if (!exclusive)
1572 anon_vma = anon_vma->root;
1573
1574 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1575 page->mapping = (struct address_space *) anon_vma;
1576 page->index = linear_page_index(vma, address);
1577}
1578
1579void hugepage_add_anon_rmap(struct page *page,
1580 struct vm_area_struct *vma, unsigned long address)
1581{
1582 struct anon_vma *anon_vma = vma->anon_vma;
1583 int first;
1584
1585 BUG_ON(!PageLocked(page));
1586 BUG_ON(!anon_vma);
1587 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1588 first = atomic_inc_and_test(&page->_mapcount);
1589 if (first)
1590 __hugepage_set_anon_rmap(page, vma, address, 0);
1591}
1592
1593void hugepage_add_new_anon_rmap(struct page *page,
1594 struct vm_area_struct *vma, unsigned long address)
1595{
1596 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1597 atomic_set(&page->_mapcount, 0);
1598 __hugepage_set_anon_rmap(page, vma, address, 1);
1599}
1600#endif /* CONFIG_HUGETLB_PAGE */
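The hugepage rmap helpers above store an anon_vma pointer in page->mapping with the PAGE_MAPPING_ANON bit set, so anonymous pages can be told apart from file pages by a single bit test on a tagged pointer. Below is a small userspace sketch of that low-bit pointer tagging, with hypothetical names (fake_page, anon_ctx, MAPPING_ANON) and the assumption that the tagged pointers are at least 2-byte aligned.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAPPING_ANON 0x1UL              /* stand-in for PAGE_MAPPING_ANON */

struct anon_ctx { int dummy; };         /* stand-in for struct anon_vma */

struct fake_page {
        void *mapping;                  /* tagged pointer, as in struct page */
};

static void set_anon(struct fake_page *p, struct anon_ctx *a)
{
        /* Bit 0 must be free for use as a tag. */
        assert(((uintptr_t)a & MAPPING_ANON) == 0);
        p->mapping = (void *)((uintptr_t)a | MAPPING_ANON);
}

static int page_is_anon(const struct fake_page *p)
{
        return ((uintptr_t)p->mapping & MAPPING_ANON) != 0;
}

static struct anon_ctx *page_anon_ctx(const struct fake_page *p)
{
        return (struct anon_ctx *)((uintptr_t)p->mapping & ~MAPPING_ANON);
}

int main(void)
{
        struct anon_ctx a;
        struct fake_page pg = { 0 };

        set_anon(&pg, &a);
        printf("anon=%d ctx=%p\n", page_is_anon(&pg), (void *)page_anon_ctx(&pg));
        return 0;
}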
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db..47fdeeb9d63 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
28#include <linux/file.h> 28#include <linux/file.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/percpu_counter.h>
31#include <linux/swap.h> 32#include <linux/swap.h>
32 33
33static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
233{ 234{
234 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 235 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
235 if (sbinfo->max_blocks) { 236 if (sbinfo->max_blocks) {
236 spin_lock(&sbinfo->stat_lock); 237 percpu_counter_add(&sbinfo->used_blocks, -pages);
237 sbinfo->free_blocks += pages; 238 spin_lock(&inode->i_lock);
238 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 239 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
239 spin_unlock(&sbinfo->stat_lock); 240 spin_unlock(&inode->i_lock);
240 } 241 }
241} 242}
242 243
@@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
416 if (sgp == SGP_READ) 417 if (sgp == SGP_READ)
417 return shmem_swp_map(ZERO_PAGE(0)); 418 return shmem_swp_map(ZERO_PAGE(0));
418 /* 419 /*
 419 * Test free_blocks against 1 not 0, since we have 1 data 420 * Test used_blocks against 1 less than max_blocks, since we have 1 data
420 * page (and perhaps indirect index pages) yet to allocate: 421 * page (and perhaps indirect index pages) yet to allocate:
421 * a waste to allocate index if we cannot allocate data. 422 * a waste to allocate index if we cannot allocate data.
422 */ 423 */
423 if (sbinfo->max_blocks) { 424 if (sbinfo->max_blocks) {
424 spin_lock(&sbinfo->stat_lock); 425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
425 if (sbinfo->free_blocks <= 1) {
426 spin_unlock(&sbinfo->stat_lock);
427 return ERR_PTR(-ENOSPC); 426 return ERR_PTR(-ENOSPC);
428 } 427 percpu_counter_inc(&sbinfo->used_blocks);
429 sbinfo->free_blocks--; 428 spin_lock(&inode->i_lock);
430 inode->i_blocks += BLOCKS_PER_PAGE; 429 inode->i_blocks += BLOCKS_PER_PAGE;
431 spin_unlock(&sbinfo->stat_lock); 430 spin_unlock(&inode->i_lock);
432 } 431 }
433 432
434 spin_unlock(&info->lock); 433 spin_unlock(&info->lock);
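The shmem hunks above replace the stat_lock-protected free_blocks counter with a per-CPU used_blocks counter: hot-path updates go to per-CPU deltas that are only occasionally folded into shared state, while limit checks use percpu_counter_compare() and statfs uses percpu_counter_sum(). A rough userspace sketch of the batching idea follows; it is not the kernel implementation, and approx_counter, counter_add and BATCH are invented names and values.

#include <pthread.h>
#include <stdio.h>

#define BATCH 32

struct approx_counter {
        long count;                     /* shared, approximate total */
        pthread_mutex_t lock;
};

static _Thread_local long local_delta;  /* per-thread, unsynchronized */

static void counter_add(struct approx_counter *c, long amount)
{
        local_delta += amount;
        if (local_delta >= BATCH || local_delta <= -BATCH) {
                pthread_mutex_lock(&c->lock);
                c->count += local_delta;        /* flush the batch */
                pthread_mutex_unlock(&c->lock);
                local_delta = 0;
        }
}

/* Cheap read: may be off by up to BATCH per thread, which is acceptable for
 * an "are we near the block limit?" style check. */
static long counter_read_approx(struct approx_counter *c)
{
        return c->count;
}

int main(void)
{
        struct approx_counter used = { 0, PTHREAD_MUTEX_INITIALIZER };

        for (int i = 0; i < 100; i++)
                counter_add(&used, 1);
        printf("approx used blocks: %ld (unflushed delta %ld)\n",
               counter_read_approx(&used), local_delta);
        return 0;
}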
@@ -767,6 +766,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
767 loff_t newsize = attr->ia_size; 766 loff_t newsize = attr->ia_size;
768 int error; 767 int error;
769 768
769 error = inode_change_ok(inode, attr);
770 if (error)
771 return error;
772
770 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) 773 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
771 && newsize != inode->i_size) { 774 && newsize != inode->i_size) {
772 struct page *page = NULL; 775 struct page *page = NULL;
@@ -801,25 +804,22 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
801 } 804 }
802 } 805 }
803 806
804 error = simple_setsize(inode, newsize); 807 /* XXX(truncate): truncate_setsize should be called last */
808 truncate_setsize(inode, newsize);
805 if (page) 809 if (page)
806 page_cache_release(page); 810 page_cache_release(page);
807 if (error)
808 return error;
809 shmem_truncate_range(inode, newsize, (loff_t)-1); 811 shmem_truncate_range(inode, newsize, (loff_t)-1);
810 } 812 }
811 813
812 error = inode_change_ok(inode, attr); 814 setattr_copy(inode, attr);
813 if (!error)
814 generic_setattr(inode, attr);
815#ifdef CONFIG_TMPFS_POSIX_ACL 815#ifdef CONFIG_TMPFS_POSIX_ACL
816 if (!error && (attr->ia_valid & ATTR_MODE)) 816 if (attr->ia_valid & ATTR_MODE)
817 error = generic_acl_chmod(inode); 817 error = generic_acl_chmod(inode);
818#endif 818#endif
819 return error; 819 return error;
820} 820}
821 821
822static void shmem_delete_inode(struct inode *inode) 822static void shmem_evict_inode(struct inode *inode)
823{ 823{
824 struct shmem_inode_info *info = SHMEM_I(inode); 824 struct shmem_inode_info *info = SHMEM_I(inode);
825 825
@@ -836,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode)
836 } 836 }
837 BUG_ON(inode->i_blocks); 837 BUG_ON(inode->i_blocks);
838 shmem_free_inode(inode->i_sb); 838 shmem_free_inode(inode->i_sb);
839 clear_inode(inode); 839 end_writeback(inode);
840} 840}
841 841
842static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 842static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
@@ -933,7 +933,7 @@ found:
933 933
934 /* 934 /*
935 * Move _head_ to start search for next from here. 935 * Move _head_ to start search for next from here.
936 * But be careful: shmem_delete_inode checks list_empty without taking 936 * But be careful: shmem_evict_inode checks list_empty without taking
937 * mutex, and there's an instant in list_move_tail when info->swaplist 937 * mutex, and there's an instant in list_move_tail when info->swaplist
938 * would appear empty, if it were the only one on shmem_swaplist. We 938 * would appear empty, if it were the only one on shmem_swaplist. We
939 * could avoid doing it if inode NULL; or use this minor optimization. 939 * could avoid doing it if inode NULL; or use this minor optimization.
@@ -1223,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1223 struct shmem_sb_info *sbinfo; 1223 struct shmem_sb_info *sbinfo;
1224 struct page *filepage = *pagep; 1224 struct page *filepage = *pagep;
1225 struct page *swappage; 1225 struct page *swappage;
1226 struct page *prealloc_page = NULL;
1226 swp_entry_t *entry; 1227 swp_entry_t *entry;
1227 swp_entry_t swap; 1228 swp_entry_t swap;
1228 gfp_t gfp; 1229 gfp_t gfp;
@@ -1247,7 +1248,6 @@ repeat:
1247 filepage = find_lock_page(mapping, idx); 1248 filepage = find_lock_page(mapping, idx);
1248 if (filepage && PageUptodate(filepage)) 1249 if (filepage && PageUptodate(filepage))
1249 goto done; 1250 goto done;
1250 error = 0;
1251 gfp = mapping_gfp_mask(mapping); 1251 gfp = mapping_gfp_mask(mapping);
1252 if (!filepage) { 1252 if (!filepage) {
1253 /* 1253 /*
@@ -1258,7 +1258,19 @@ repeat:
1258 if (error) 1258 if (error)
1259 goto failed; 1259 goto failed;
1260 radix_tree_preload_end(); 1260 radix_tree_preload_end();
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 /* We don't care if this fails */
1263 prealloc_page = shmem_alloc_page(gfp, info, idx);
1264 if (prealloc_page) {
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1269 }
1270 }
1271 }
1261 } 1272 }
1273 error = 0;
1262 1274
1263 spin_lock(&info->lock); 1275 spin_lock(&info->lock);
1264 shmem_recalc_inode(inode); 1276 shmem_recalc_inode(inode);
@@ -1387,17 +1399,16 @@ repeat:
1387 shmem_swp_unmap(entry); 1399 shmem_swp_unmap(entry);
1388 sbinfo = SHMEM_SB(inode->i_sb); 1400 sbinfo = SHMEM_SB(inode->i_sb);
1389 if (sbinfo->max_blocks) { 1401 if (sbinfo->max_blocks) {
1390 spin_lock(&sbinfo->stat_lock); 1402 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
1391 if (sbinfo->free_blocks == 0 ||
1392 shmem_acct_block(info->flags)) { 1403 shmem_acct_block(info->flags)) {
1393 spin_unlock(&sbinfo->stat_lock);
1394 spin_unlock(&info->lock); 1404 spin_unlock(&info->lock);
1395 error = -ENOSPC; 1405 error = -ENOSPC;
1396 goto failed; 1406 goto failed;
1397 } 1407 }
1398 sbinfo->free_blocks--; 1408 percpu_counter_inc(&sbinfo->used_blocks);
1409 spin_lock(&inode->i_lock);
1399 inode->i_blocks += BLOCKS_PER_PAGE; 1410 inode->i_blocks += BLOCKS_PER_PAGE;
1400 spin_unlock(&sbinfo->stat_lock); 1411 spin_unlock(&inode->i_lock);
1401 } else if (shmem_acct_block(info->flags)) { 1412 } else if (shmem_acct_block(info->flags)) {
1402 spin_unlock(&info->lock); 1413 spin_unlock(&info->lock);
1403 error = -ENOSPC; 1414 error = -ENOSPC;
@@ -1407,28 +1418,38 @@ repeat:
1407 if (!filepage) { 1418 if (!filepage) {
1408 int ret; 1419 int ret;
1409 1420
1410 spin_unlock(&info->lock); 1421 if (!prealloc_page) {
1411 filepage = shmem_alloc_page(gfp, info, idx); 1422 spin_unlock(&info->lock);
1412 if (!filepage) { 1423 filepage = shmem_alloc_page(gfp, info, idx);
1413 shmem_unacct_blocks(info->flags, 1); 1424 if (!filepage) {
1414 shmem_free_blocks(inode, 1); 1425 shmem_unacct_blocks(info->flags, 1);
1415 error = -ENOMEM; 1426 shmem_free_blocks(inode, 1);
1416 goto failed; 1427 error = -ENOMEM;
1417 } 1428 goto failed;
1418 SetPageSwapBacked(filepage); 1429 }
1430 SetPageSwapBacked(filepage);
1419 1431
1420 /* Precharge page while we can wait, compensate after */ 1432 /*
1421 error = mem_cgroup_cache_charge(filepage, current->mm, 1433 * Precharge page while we can wait, compensate
1422 GFP_KERNEL); 1434 * after
1423 if (error) { 1435 */
1424 page_cache_release(filepage); 1436 error = mem_cgroup_cache_charge(filepage,
1425 shmem_unacct_blocks(info->flags, 1); 1437 current->mm, GFP_KERNEL);
1426 shmem_free_blocks(inode, 1); 1438 if (error) {
1427 filepage = NULL; 1439 page_cache_release(filepage);
1428 goto failed; 1440 shmem_unacct_blocks(info->flags, 1);
1441 shmem_free_blocks(inode, 1);
1442 filepage = NULL;
1443 goto failed;
1444 }
1445
1446 spin_lock(&info->lock);
1447 } else {
1448 filepage = prealloc_page;
1449 prealloc_page = NULL;
1450 SetPageSwapBacked(filepage);
1429 } 1451 }
1430 1452
1431 spin_lock(&info->lock);
1432 entry = shmem_swp_alloc(info, idx, sgp); 1453 entry = shmem_swp_alloc(info, idx, sgp);
1433 if (IS_ERR(entry)) 1454 if (IS_ERR(entry))
1434 error = PTR_ERR(entry); 1455 error = PTR_ERR(entry);
@@ -1469,13 +1490,19 @@ repeat:
1469 } 1490 }
1470done: 1491done:
1471 *pagep = filepage; 1492 *pagep = filepage;
1472 return 0; 1493 error = 0;
1494 goto out;
1473 1495
1474failed: 1496failed:
1475 if (*pagep != filepage) { 1497 if (*pagep != filepage) {
1476 unlock_page(filepage); 1498 unlock_page(filepage);
1477 page_cache_release(filepage); 1499 page_cache_release(filepage);
1478 } 1500 }
1501out:
1502 if (prealloc_page) {
1503 mem_cgroup_uncharge_cache_page(prealloc_page);
1504 page_cache_release(prealloc_page);
1505 }
1479 return error; 1506 return error;
1480} 1507}
1481 1508
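The prealloc_page changes above allocate and memcg-charge a page optimistically before taking info->lock, consume it under the lock if a page turns out to be needed, and release it on the common exit path otherwise. A compact userspace sketch of that preallocate-outside-the-lock pattern, with hypothetical names (struct cache, cache_get):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct cache {
        pthread_mutex_t lock;
        void *slot;                     /* filled lazily, under the lock */
};

static void *cache_get(struct cache *c, size_t size)
{
        /* Optimistic preallocation: we do not care if this fails; the
         * locked section below can still succeed without it. */
        void *prealloc = malloc(size);
        void *ret;

        pthread_mutex_lock(&c->lock);
        if (!c->slot && prealloc) {
                c->slot = prealloc;     /* consumed */
                memset(c->slot, 0, size);
                prealloc = NULL;
        }
        ret = c->slot;
        pthread_mutex_unlock(&c->lock);

        free(prealloc);                 /* unused preallocation: discard */
        return ret;
}

int main(void)
{
        struct cache c = { PTHREAD_MUTEX_INITIALIZER, NULL };

        return cache_get(&c, 128) ? 0 : 1;
}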
@@ -1559,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1559 1586
1560 inode = new_inode(sb); 1587 inode = new_inode(sb);
1561 if (inode) { 1588 if (inode) {
1589 inode->i_ino = get_next_ino();
1562 inode_init_owner(inode, dir, mode); 1590 inode_init_owner(inode, dir, mode);
1563 inode->i_blocks = 0; 1591 inode->i_blocks = 0;
1564 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1592 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1791,17 +1819,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1791 buf->f_type = TMPFS_MAGIC; 1819 buf->f_type = TMPFS_MAGIC;
1792 buf->f_bsize = PAGE_CACHE_SIZE; 1820 buf->f_bsize = PAGE_CACHE_SIZE;
1793 buf->f_namelen = NAME_MAX; 1821 buf->f_namelen = NAME_MAX;
1794 spin_lock(&sbinfo->stat_lock);
1795 if (sbinfo->max_blocks) { 1822 if (sbinfo->max_blocks) {
1796 buf->f_blocks = sbinfo->max_blocks; 1823 buf->f_blocks = sbinfo->max_blocks;
1797 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 1824 buf->f_bavail = buf->f_bfree =
1825 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
1798 } 1826 }
1799 if (sbinfo->max_inodes) { 1827 if (sbinfo->max_inodes) {
1800 buf->f_files = sbinfo->max_inodes; 1828 buf->f_files = sbinfo->max_inodes;
1801 buf->f_ffree = sbinfo->free_inodes; 1829 buf->f_ffree = sbinfo->free_inodes;
1802 } 1830 }
1803 /* else leave those fields 0 like simple_statfs */ 1831 /* else leave those fields 0 like simple_statfs */
1804 spin_unlock(&sbinfo->stat_lock);
1805 return 0; 1832 return 0;
1806} 1833}
1807 1834
@@ -1877,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1877 dir->i_size += BOGO_DIRENT_SIZE; 1904 dir->i_size += BOGO_DIRENT_SIZE;
1878 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1905 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1879 inc_nlink(inode); 1906 inc_nlink(inode);
1880 atomic_inc(&inode->i_count); /* New dentry reference */ 1907 ihold(inode); /* New dentry reference */
1881 dget(dentry); /* Extra pinning count for the created dentry */ 1908 dget(dentry); /* Extra pinning count for the created dentry */
1882 d_instantiate(dentry, inode); 1909 d_instantiate(dentry, inode);
1883out: 1910out:
@@ -2120,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2120 if (*len < 3) 2147 if (*len < 3)
2121 return 255; 2148 return 255;
2122 2149
2123 if (hlist_unhashed(&inode->i_hash)) { 2150 if (inode_unhashed(inode)) {
2124 /* Unfortunately insert_inode_hash is not idempotent, 2151 /* Unfortunately insert_inode_hash is not idempotent,
2125 * so as we hash inodes here rather than at creation 2152 * so as we hash inodes here rather than at creation
2126 * time, we need a lock to ensure we only try 2153 * time, we need a lock to ensure we only try
@@ -2128,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2128 */ 2155 */
2129 static DEFINE_SPINLOCK(lock); 2156 static DEFINE_SPINLOCK(lock);
2130 spin_lock(&lock); 2157 spin_lock(&lock);
2131 if (hlist_unhashed(&inode->i_hash)) 2158 if (inode_unhashed(inode))
2132 __insert_inode_hash(inode, 2159 __insert_inode_hash(inode,
2133 inode->i_ino + inode->i_generation); 2160 inode->i_ino + inode->i_generation);
2134 spin_unlock(&lock); 2161 spin_unlock(&lock);
@@ -2242,7 +2269,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2242{ 2269{
2243 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2270 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2244 struct shmem_sb_info config = *sbinfo; 2271 struct shmem_sb_info config = *sbinfo;
2245 unsigned long blocks;
2246 unsigned long inodes; 2272 unsigned long inodes;
2247 int error = -EINVAL; 2273 int error = -EINVAL;
2248 2274
@@ -2250,9 +2276,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2250 return error; 2276 return error;
2251 2277
2252 spin_lock(&sbinfo->stat_lock); 2278 spin_lock(&sbinfo->stat_lock);
2253 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2254 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2279 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2255 if (config.max_blocks < blocks) 2280 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2256 goto out; 2281 goto out;
2257 if (config.max_inodes < inodes) 2282 if (config.max_inodes < inodes)
2258 goto out; 2283 goto out;
@@ -2269,7 +2294,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2269 2294
2270 error = 0; 2295 error = 0;
2271 sbinfo->max_blocks = config.max_blocks; 2296 sbinfo->max_blocks = config.max_blocks;
2272 sbinfo->free_blocks = config.max_blocks - blocks;
2273 sbinfo->max_inodes = config.max_inodes; 2297 sbinfo->max_inodes = config.max_inodes;
2274 sbinfo->free_inodes = config.max_inodes - inodes; 2298 sbinfo->free_inodes = config.max_inodes - inodes;
2275 2299
@@ -2302,7 +2326,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
2302 2326
2303static void shmem_put_super(struct super_block *sb) 2327static void shmem_put_super(struct super_block *sb)
2304{ 2328{
2305 kfree(sb->s_fs_info); 2329 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2330
2331 percpu_counter_destroy(&sbinfo->used_blocks);
2332 kfree(sbinfo);
2306 sb->s_fs_info = NULL; 2333 sb->s_fs_info = NULL;
2307} 2334}
2308 2335
@@ -2344,7 +2371,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2344#endif 2371#endif
2345 2372
2346 spin_lock_init(&sbinfo->stat_lock); 2373 spin_lock_init(&sbinfo->stat_lock);
2347 sbinfo->free_blocks = sbinfo->max_blocks; 2374 if (percpu_counter_init(&sbinfo->used_blocks, 0))
2375 goto failed;
2348 sbinfo->free_inodes = sbinfo->max_inodes; 2376 sbinfo->free_inodes = sbinfo->max_inodes;
2349 2377
2350 sb->s_maxbytes = SHMEM_MAX_BYTES; 2378 sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2496,7 +2524,7 @@ static const struct super_operations shmem_ops = {
2496 .remount_fs = shmem_remount_fs, 2524 .remount_fs = shmem_remount_fs,
2497 .show_options = shmem_show_options, 2525 .show_options = shmem_show_options,
2498#endif 2526#endif
2499 .delete_inode = shmem_delete_inode, 2527 .evict_inode = shmem_evict_inode,
2500 .drop_inode = generic_delete_inode, 2528 .drop_inode = generic_delete_inode,
2501 .put_super = shmem_put_super, 2529 .put_super = shmem_put_super,
2502}; 2530};
@@ -2510,16 +2538,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
2510}; 2538};
2511 2539
2512 2540
2513static int shmem_get_sb(struct file_system_type *fs_type, 2541static struct dentry *shmem_mount(struct file_system_type *fs_type,
2514 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 2542 int flags, const char *dev_name, void *data)
2515{ 2543{
2516 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); 2544 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2517} 2545}
2518 2546
2519static struct file_system_type tmpfs_fs_type = { 2547static struct file_system_type tmpfs_fs_type = {
2520 .owner = THIS_MODULE, 2548 .owner = THIS_MODULE,
2521 .name = "tmpfs", 2549 .name = "tmpfs",
2522 .get_sb = shmem_get_sb, 2550 .mount = shmem_mount,
2523 .kill_sb = kill_litter_super, 2551 .kill_sb = kill_litter_super,
2524}; 2552};
2525 2553
@@ -2615,7 +2643,7 @@ out:
2615 2643
2616static struct file_system_type tmpfs_fs_type = { 2644static struct file_system_type tmpfs_fs_type = {
2617 .name = "tmpfs", 2645 .name = "tmpfs",
2618 .get_sb = ramfs_get_sb, 2646 .mount = ramfs_mount,
2619 .kill_sb = kill_litter_super, 2647 .kill_sb = kill_litter_super,
2620}; 2648};
2621 2649
diff --git a/mm/slab.c b/mm/slab.c
index e49f8f46f46..b1e40dafbab 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,6 @@
102#include <linux/cpu.h> 102#include <linux/cpu.h>
103#include <linux/sysctl.h> 103#include <linux/sysctl.h>
104#include <linux/module.h> 104#include <linux/module.h>
105#include <linux/kmemtrace.h>
106#include <linux/rcupdate.h> 105#include <linux/rcupdate.h>
107#include <linux/string.h> 106#include <linux/string.h>
108#include <linux/uaccess.h> 107#include <linux/uaccess.h>
@@ -395,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
395#define STATS_DEC_ACTIVE(x) do { } while (0) 394#define STATS_DEC_ACTIVE(x) do { } while (0)
396#define STATS_INC_ALLOCED(x) do { } while (0) 395#define STATS_INC_ALLOCED(x) do { } while (0)
397#define STATS_INC_GROWN(x) do { } while (0) 396#define STATS_INC_GROWN(x) do { } while (0)
398#define STATS_ADD_REAPED(x,y) do { } while (0) 397#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
399#define STATS_SET_HIGH(x) do { } while (0) 398#define STATS_SET_HIGH(x) do { } while (0)
400#define STATS_INC_ERR(x) do { } while (0) 399#define STATS_INC_ERR(x) do { } while (0)
401#define STATS_INC_NODEALLOCS(x) do { } while (0) 400#define STATS_INC_NODEALLOCS(x) do { } while (0)
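The STATS_ADD_REAPED() change above adds (void)(y) so an argument that is only ever fed to the stats macro still counts as used when statistics are compiled out. A tiny standalone example of the idiom; ENABLE_STATS and STATS_ADD are made-up names.

#include <stdio.h>

#ifdef ENABLE_STATS
#define STATS_ADD(total, n)  do { (total) += (n); } while (0)
#else
#define STATS_ADD(total, n)  do { (void)(n); } while (0)  /* keep 'n' "used" */
#endif

int main(void)
{
        long total = 0;
        int freed = 3;          /* only consumed by the macro */

        STATS_ADD(total, freed);
        printf("%ld\n", total);
        return 0;
}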
@@ -861,7 +860,7 @@ static void __cpuinit start_cpu_timer(int cpu)
861 */ 860 */
862 if (keventd_up() && reap_work->work.func == NULL) { 861 if (keventd_up() && reap_work->work.func == NULL) {
863 init_reap_node(cpu); 862 init_reap_node(cpu);
864 INIT_DELAYED_WORK(reap_work, cache_reap); 863 INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
865 schedule_delayed_work_on(cpu, reap_work, 864 schedule_delayed_work_on(cpu, reap_work,
866 __round_jiffies_relative(HZ, cpu)); 865 __round_jiffies_relative(HZ, cpu));
867 } 866 }
@@ -902,7 +901,7 @@ static int transfer_objects(struct array_cache *to,
902 struct array_cache *from, unsigned int max) 901 struct array_cache *from, unsigned int max)
903{ 902{
904 /* Figure out how many entries to transfer */ 903 /* Figure out how many entries to transfer */
905 int nr = min(min(from->avail, max), to->limit - to->avail); 904 int nr = min3(from->avail, max, to->limit - to->avail);
906 905
907 if (!nr) 906 if (!nr)
908 return 0; 907 return 0;
@@ -2331,8 +2330,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2331 } 2330 }
2332#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2331#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2333 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2332 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2334 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { 2333 && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2335 cachep->obj_offset += PAGE_SIZE - size; 2334 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2336 size = PAGE_SIZE; 2335 size = PAGE_SIZE;
2337 } 2336 }
2338#endif 2337#endif
diff --git a/mm/slob.c b/mm/slob.c
index 23631e2bb57..617b6d6c42c 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -66,8 +66,10 @@
66#include <linux/module.h> 66#include <linux/module.h>
67#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/kmemtrace.h>
70#include <linux/kmemleak.h> 69#include <linux/kmemleak.h>
70
71#include <trace/events/kmem.h>
72
71#include <asm/atomic.h> 73#include <asm/atomic.h>
72 74
73/* 75/*
@@ -394,6 +396,7 @@ static void slob_free(void *block, int size)
394 slob_t *prev, *next, *b = (slob_t *)block; 396 slob_t *prev, *next, *b = (slob_t *)block;
395 slobidx_t units; 397 slobidx_t units;
396 unsigned long flags; 398 unsigned long flags;
399 struct list_head *slob_list;
397 400
398 if (unlikely(ZERO_OR_NULL_PTR(block))) 401 if (unlikely(ZERO_OR_NULL_PTR(block)))
399 return; 402 return;
@@ -422,7 +425,13 @@ static void slob_free(void *block, int size)
422 set_slob(b, units, 425 set_slob(b, units,
423 (void *)((unsigned long)(b + 426 (void *)((unsigned long)(b +
424 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 427 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
425 set_slob_page_free(sp, &free_slob_small); 428 if (size < SLOB_BREAK1)
429 slob_list = &free_slob_small;
430 else if (size < SLOB_BREAK2)
431 slob_list = &free_slob_medium;
432 else
433 slob_list = &free_slob_large;
434 set_slob_page_free(sp, slob_list);
426 goto out; 435 goto out;
427 } 436 }
428 437
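The slob_free() hunk above returns a fully reassembled page to the free list matching the size class being freed (small/medium/large, split at SLOB_BREAK1/SLOB_BREAK2) instead of always to free_slob_small, so allocations of that size will find it again. A minimal sketch of the size-class selection; the break values here are illustrative, not necessarily SLOB's.

#include <stddef.h>
#include <stdio.h>

#define BREAK_SMALL     256     /* illustrative break points */
#define BREAK_MEDIUM    1024

enum free_list { LIST_SMALL, LIST_MEDIUM, LIST_LARGE };

static enum free_list pick_list(size_t size)
{
        if (size < BREAK_SMALL)
                return LIST_SMALL;
        if (size < BREAK_MEDIUM)
                return LIST_MEDIUM;
        return LIST_LARGE;
}

int main(void)
{
        size_t sizes[] = { 64, 512, 4096 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("%zu bytes -> list %d\n", sizes[i], (int)pick_list(sizes[i]));
        return 0;
}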
@@ -491,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
491 } else { 500 } else {
492 unsigned int order = get_order(size); 501 unsigned int order = get_order(size);
493 502
494 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); 503 if (likely(order))
504 gfp |= __GFP_COMP;
505 ret = slob_new_pages(gfp, order, node);
495 if (ret) { 506 if (ret) {
496 struct page *page; 507 struct page *page;
497 page = virt_to_page(ret); 508 page = virt_to_page(ret);
@@ -639,7 +650,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
639 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { 650 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
640 struct slob_rcu *slob_rcu; 651 struct slob_rcu *slob_rcu;
641 slob_rcu = b + (c->size - sizeof(struct slob_rcu)); 652 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
642 INIT_RCU_HEAD(&slob_rcu->head);
643 slob_rcu->size = c->size; 653 slob_rcu->size = c->size;
644 call_rcu(&slob_rcu->head, kmem_rcu_free); 654 call_rcu(&slob_rcu->head, kmem_rcu_free);
645 } else { 655 } else {
diff --git a/mm/slub.c b/mm/slub.c
index 578f68f3c51..8fd5401bb07 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,6 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/kmemtrace.h>
21#include <linux/kmemcheck.h> 20#include <linux/kmemcheck.h>
22#include <linux/cpu.h> 21#include <linux/cpu.h>
23#include <linux/cpuset.h> 22#include <linux/cpuset.h>
@@ -107,11 +106,17 @@
107 * the fast path and disables lockless freelists. 106 * the fast path and disables lockless freelists.
108 */ 107 */
109 108
109#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
110 SLAB_TRACE | SLAB_DEBUG_FREE)
111
112static inline int kmem_cache_debug(struct kmem_cache *s)
113{
110#ifdef CONFIG_SLUB_DEBUG 114#ifdef CONFIG_SLUB_DEBUG
111#define SLABDEBUG 1 115 return unlikely(s->flags & SLAB_DEBUG_FLAGS);
112#else 116#else
113#define SLABDEBUG 0 117 return 0;
114#endif 118#endif
119}
115 120
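The new kmem_cache_debug() above replaces the per-page SLABDEBUG/PageSlubDebug test with a single predicate on the cache's flags that collapses to a constant 0 when CONFIG_SLUB_DEBUG is off, letting the compiler drop the debug branches entirely. A standalone sketch of that pattern with invented names (cache_debug, ENABLE_CACHE_DEBUG):

#include <stdio.h>

#define FLAG_RED_ZONE  0x01u
#define FLAG_POISON    0x02u
#define FLAG_TRACE     0x04u
#define DEBUG_FLAGS    (FLAG_RED_ZONE | FLAG_POISON | FLAG_TRACE)

struct cache {
        unsigned int flags;
};

static inline int cache_debug(const struct cache *c)
{
#ifdef ENABLE_CACHE_DEBUG
        return (c->flags & DEBUG_FLAGS) != 0;
#else
        (void)c;
        return 0;               /* debug branches below become dead code */
#endif
}

static void free_object(struct cache *c, void *obj)
{
        if (cache_debug(c))
                fprintf(stderr, "debug checks for %p\n", obj);
        /* ... fast-path free ... */
}

int main(void)
{
        struct cache c = { .flags = FLAG_POISON };
        int dummy;

        free_object(&c, &dummy);
        return 0;
}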
116/* 121/*
117 * Issues still to be resolved: 122 * Issues still to be resolved:
@@ -162,8 +167,7 @@
162#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 167#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
163 168
164/* Internal SLUB flags */ 169/* Internal SLUB flags */
165#define __OBJECT_POISON 0x80000000 /* Poison object */ 170#define __OBJECT_POISON 0x80000000UL /* Poison object */
166#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
167 171
168static int kmem_size = sizeof(struct kmem_cache); 172static int kmem_size = sizeof(struct kmem_cache);
169 173
@@ -173,7 +177,7 @@ static struct notifier_block slab_notifier;
173 177
174static enum { 178static enum {
175 DOWN, /* No slab functionality available */ 179 DOWN, /* No slab functionality available */
176 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 180 PARTIAL, /* Kmem_cache_node works */
177 UP, /* Everything works but does not show up in sysfs */ 181 UP, /* Everything works but does not show up in sysfs */
178 SYSFS /* Sysfs up */ 182 SYSFS /* Sysfs up */
179} slab_state = DOWN; 183} slab_state = DOWN;
@@ -194,7 +198,7 @@ struct track {
194 198
195enum track_item { TRACK_ALLOC, TRACK_FREE }; 199enum track_item { TRACK_ALLOC, TRACK_FREE };
196 200
197#ifdef CONFIG_SLUB_DEBUG 201#ifdef CONFIG_SYSFS
198static int sysfs_slab_add(struct kmem_cache *); 202static int sysfs_slab_add(struct kmem_cache *);
199static int sysfs_slab_alias(struct kmem_cache *, const char *); 203static int sysfs_slab_alias(struct kmem_cache *, const char *);
200static void sysfs_slab_remove(struct kmem_cache *); 204static void sysfs_slab_remove(struct kmem_cache *);
@@ -205,6 +209,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
205 { return 0; } 209 { return 0; }
206static inline void sysfs_slab_remove(struct kmem_cache *s) 210static inline void sysfs_slab_remove(struct kmem_cache *s)
207{ 211{
212 kfree(s->name);
208 kfree(s); 213 kfree(s);
209} 214}
210 215
@@ -228,11 +233,7 @@ int slab_is_available(void)
228 233
229static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 234static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
230{ 235{
231#ifdef CONFIG_NUMA
232 return s->node[node]; 236 return s->node[node];
233#else
234 return &s->local_node;
235#endif
236} 237}
237 238
238/* Verify that a pointer has an address that is valid within a slab page */ 239/* Verify that a pointer has an address that is valid within a slab page */
@@ -489,7 +490,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
489 dump_stack(); 490 dump_stack();
490} 491}
491 492
492static void init_object(struct kmem_cache *s, void *object, int active) 493static void init_object(struct kmem_cache *s, void *object, u8 val)
493{ 494{
494 u8 *p = object; 495 u8 *p = object;
495 496
@@ -499,9 +500,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
499 } 500 }
500 501
501 if (s->flags & SLAB_RED_ZONE) 502 if (s->flags & SLAB_RED_ZONE)
502 memset(p + s->objsize, 503 memset(p + s->objsize, val, s->inuse - s->objsize);
503 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
504 s->inuse - s->objsize);
505} 504}
506 505
507static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 506static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -636,17 +635,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
636} 635}
637 636
638static int check_object(struct kmem_cache *s, struct page *page, 637static int check_object(struct kmem_cache *s, struct page *page,
639 void *object, int active) 638 void *object, u8 val)
640{ 639{
641 u8 *p = object; 640 u8 *p = object;
642 u8 *endobject = object + s->objsize; 641 u8 *endobject = object + s->objsize;
643 642
644 if (s->flags & SLAB_RED_ZONE) { 643 if (s->flags & SLAB_RED_ZONE) {
645 unsigned int red =
646 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
647
648 if (!check_bytes_and_report(s, page, object, "Redzone", 644 if (!check_bytes_and_report(s, page, object, "Redzone",
649 endobject, red, s->inuse - s->objsize)) 645 endobject, val, s->inuse - s->objsize))
650 return 0; 646 return 0;
651 } else { 647 } else {
652 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 648 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -656,7 +652,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
656 } 652 }
657 653
658 if (s->flags & SLAB_POISON) { 654 if (s->flags & SLAB_POISON) {
659 if (!active && (s->flags & __OBJECT_POISON) && 655 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
660 (!check_bytes_and_report(s, page, p, "Poison", p, 656 (!check_bytes_and_report(s, page, p, "Poison", p,
661 POISON_FREE, s->objsize - 1) || 657 POISON_FREE, s->objsize - 1) ||
662 !check_bytes_and_report(s, page, p, "Poison", 658 !check_bytes_and_report(s, page, p, "Poison",
@@ -668,7 +664,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
668 check_pad_bytes(s, page, p); 664 check_pad_bytes(s, page, p);
669 } 665 }
670 666
671 if (!s->offset && active) 667 if (!s->offset && val == SLUB_RED_ACTIVE)
672 /* 668 /*
673 * Object and freepointer overlap. Cannot check 669 * Object and freepointer overlap. Cannot check
674 * freepointer while object is allocated. 670 * freepointer while object is allocated.
@@ -787,6 +783,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
787} 783}
788 784
789/* 785/*
786 * Hooks for other subsystems that check memory allocations. In a typical
787 * production configuration these hooks all should produce no code at all.
788 */
789static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
790{
791 flags &= gfp_allowed_mask;
792 lockdep_trace_alloc(flags);
793 might_sleep_if(flags & __GFP_WAIT);
794
795 return should_failslab(s->objsize, flags, s->flags);
796}
797
798static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
799{
800 flags &= gfp_allowed_mask;
801 kmemcheck_slab_alloc(s, flags, object, s->objsize);
802 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
803}
804
805static inline void slab_free_hook(struct kmem_cache *s, void *x)
806{
807 kmemleak_free_recursive(x, s->flags);
808}
809
810static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
811{
812 kmemcheck_slab_free(s, object, s->objsize);
813 debug_check_no_locks_freed(object, s->objsize);
814 if (!(s->flags & SLAB_DEBUG_OBJECTS))
815 debug_check_no_obj_freed(object, s->objsize);
816}
817
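The slab_pre_alloc_hook()/slab_post_alloc_hook()/slab_free_hook() additions above gather the kmemcheck, kmemleak, lockdep and failslab calls behind a few hooks, so each fast path makes one call that compiles to nothing in a production configuration. A small userspace sketch of that hook-consolidation pattern; pre_alloc_hook, post_alloc_hook and ENABLE_ALLOC_HOOKS are made-up names.

#include <stdio.h>
#include <stdlib.h>

#ifdef ENABLE_ALLOC_HOOKS
static int pre_alloc_hook(size_t size)
{
        /* e.g. fault injection: occasionally veto the allocation */
        if (rand() % 1000 == 0) {
                fprintf(stderr, "injected failure for %zu bytes\n", size);
                return 1;
        }
        return 0;
}

static void post_alloc_hook(void *obj, size_t size)
{
        fprintf(stderr, "tracking %p (%zu bytes)\n", obj, size);
}
#else
static inline int pre_alloc_hook(size_t size) { (void)size; return 0; }
static inline void post_alloc_hook(void *obj, size_t size)
{
        (void)obj; (void)size;
}
#endif

static void *tracked_alloc(size_t size)
{
        void *obj;

        if (pre_alloc_hook(size))
                return NULL;
        obj = malloc(size);
        if (obj)
                post_alloc_hook(obj, size);
        return obj;
}

int main(void)
{
        void *p = tracked_alloc(64);

        free(p);
        return 0;
}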
818/*
790 * Tracking of fully allocated slabs for debugging purposes. 819 * Tracking of fully allocated slabs for debugging purposes.
791 */ 820 */
792static void add_full(struct kmem_cache_node *n, struct page *page) 821static void add_full(struct kmem_cache_node *n, struct page *page)
@@ -833,7 +862,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
833 * dilemma by deferring the increment of the count during 862 * dilemma by deferring the increment of the count during
834 * bootstrap (see early_kmem_cache_node_alloc). 863 * bootstrap (see early_kmem_cache_node_alloc).
835 */ 864 */
836 if (!NUMA_BUILD || n) { 865 if (n) {
837 atomic_long_inc(&n->nr_slabs); 866 atomic_long_inc(&n->nr_slabs);
838 atomic_long_add(objects, &n->total_objects); 867 atomic_long_add(objects, &n->total_objects);
839 } 868 }
@@ -853,11 +882,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
853 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 882 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
854 return; 883 return;
855 884
856 init_object(s, object, 0); 885 init_object(s, object, SLUB_RED_INACTIVE);
857 init_tracking(s, object); 886 init_tracking(s, object);
858} 887}
859 888
860static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 889static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
861 void *object, unsigned long addr) 890 void *object, unsigned long addr)
862{ 891{
863 if (!check_slab(s, page)) 892 if (!check_slab(s, page))
@@ -873,14 +902,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
873 goto bad; 902 goto bad;
874 } 903 }
875 904
876 if (!check_object(s, page, object, 0)) 905 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
877 goto bad; 906 goto bad;
878 907
 879 /* Success. Perform special debug activities for allocs */ 908 /* Success. Perform special debug activities for allocs */
880 if (s->flags & SLAB_STORE_USER) 909 if (s->flags & SLAB_STORE_USER)
881 set_track(s, object, TRACK_ALLOC, addr); 910 set_track(s, object, TRACK_ALLOC, addr);
882 trace(s, page, object, 1); 911 trace(s, page, object, 1);
883 init_object(s, object, 1); 912 init_object(s, object, SLUB_RED_ACTIVE);
884 return 1; 913 return 1;
885 914
886bad: 915bad:
@@ -897,8 +926,8 @@ bad:
897 return 0; 926 return 0;
898} 927}
899 928
900static int free_debug_processing(struct kmem_cache *s, struct page *page, 929static noinline int free_debug_processing(struct kmem_cache *s,
901 void *object, unsigned long addr) 930 struct page *page, void *object, unsigned long addr)
902{ 931{
903 if (!check_slab(s, page)) 932 if (!check_slab(s, page))
904 goto fail; 933 goto fail;
@@ -913,7 +942,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
913 goto fail; 942 goto fail;
914 } 943 }
915 944
916 if (!check_object(s, page, object, 1)) 945 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
917 return 0; 946 return 0;
918 947
919 if (unlikely(s != page->slab)) { 948 if (unlikely(s != page->slab)) {
@@ -937,7 +966,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
937 if (s->flags & SLAB_STORE_USER) 966 if (s->flags & SLAB_STORE_USER)
938 set_track(s, object, TRACK_FREE, addr); 967 set_track(s, object, TRACK_FREE, addr);
939 trace(s, page, object, 0); 968 trace(s, page, object, 0);
940 init_object(s, object, 0); 969 init_object(s, object, SLUB_RED_INACTIVE);
941 return 1; 970 return 1;
942 971
943fail: 972fail:
@@ -1041,7 +1070,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
1041static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1070static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1042 { return 1; } 1071 { return 1; }
1043static inline int check_object(struct kmem_cache *s, struct page *page, 1072static inline int check_object(struct kmem_cache *s, struct page *page,
1044 void *object, int active) { return 1; } 1073 void *object, u8 val) { return 1; }
1045static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1074static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1046static inline unsigned long kmem_cache_flags(unsigned long objsize, 1075static inline unsigned long kmem_cache_flags(unsigned long objsize,
1047 unsigned long flags, const char *name, 1076 unsigned long flags, const char *name,
@@ -1061,7 +1090,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1061 int objects) {} 1090 int objects) {}
1062static inline void dec_slabs_node(struct kmem_cache *s, int node, 1091static inline void dec_slabs_node(struct kmem_cache *s, int node,
1063 int objects) {} 1092 int objects) {}
1064#endif 1093
1094static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1095 { return 0; }
1096
1097static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1098 void *object) {}
1099
1100static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1101
1102static inline void slab_free_hook_irq(struct kmem_cache *s,
1103 void *object) {}
1104
1105#endif /* CONFIG_SLUB_DEBUG */
1065 1106
1066/* 1107/*
1067 * Slab allocation and freeing 1108 * Slab allocation and freeing
@@ -1073,7 +1114,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1073 1114
1074 flags |= __GFP_NOTRACK; 1115 flags |= __GFP_NOTRACK;
1075 1116
1076 if (node == -1) 1117 if (node == NUMA_NO_NODE)
1077 return alloc_pages(flags, order); 1118 return alloc_pages(flags, order);
1078 else 1119 else
1079 return alloc_pages_exact_node(node, flags, order); 1120 return alloc_pages_exact_node(node, flags, order);
@@ -1157,9 +1198,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 inc_slabs_node(s, page_to_nid(page), page->objects); 1198 inc_slabs_node(s, page_to_nid(page), page->objects);
1158 page->slab = s; 1199 page->slab = s;
1159 page->flags |= 1 << PG_slab; 1200 page->flags |= 1 << PG_slab;
1160 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1161 SLAB_STORE_USER | SLAB_TRACE))
1162 __SetPageSlubDebug(page);
1163 1201
1164 start = page_address(page); 1202 start = page_address(page);
1165 1203
@@ -1186,14 +1224,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1186 int order = compound_order(page); 1224 int order = compound_order(page);
1187 int pages = 1 << order; 1225 int pages = 1 << order;
1188 1226
1189 if (unlikely(SLABDEBUG && PageSlubDebug(page))) { 1227 if (kmem_cache_debug(s)) {
1190 void *p; 1228 void *p;
1191 1229
1192 slab_pad_check(s, page); 1230 slab_pad_check(s, page);
1193 for_each_object(p, s, page_address(page), 1231 for_each_object(p, s, page_address(page),
1194 page->objects) 1232 page->objects)
1195 check_object(s, page, p, 0); 1233 check_object(s, page, p, SLUB_RED_INACTIVE);
1196 __ClearPageSlubDebug(page);
1197 } 1234 }
1198 1235
1199 kmemcheck_free_shadow(page, compound_order(page)); 1236 kmemcheck_free_shadow(page, compound_order(page));
@@ -1273,13 +1310,19 @@ static void add_partial(struct kmem_cache_node *n,
1273 spin_unlock(&n->list_lock); 1310 spin_unlock(&n->list_lock);
1274} 1311}
1275 1312
1313static inline void __remove_partial(struct kmem_cache_node *n,
1314 struct page *page)
1315{
1316 list_del(&page->lru);
1317 n->nr_partial--;
1318}
1319
1276static void remove_partial(struct kmem_cache *s, struct page *page) 1320static void remove_partial(struct kmem_cache *s, struct page *page)
1277{ 1321{
1278 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1322 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1279 1323
1280 spin_lock(&n->list_lock); 1324 spin_lock(&n->list_lock);
1281 list_del(&page->lru); 1325 __remove_partial(n, page);
1282 n->nr_partial--;
1283 spin_unlock(&n->list_lock); 1326 spin_unlock(&n->list_lock);
1284} 1327}
1285 1328
@@ -1292,8 +1335,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1292 struct page *page) 1335 struct page *page)
1293{ 1336{
1294 if (slab_trylock(page)) { 1337 if (slab_trylock(page)) {
1295 list_del(&page->lru); 1338 __remove_partial(n, page);
1296 n->nr_partial--;
1297 __SetPageSlubFrozen(page); 1339 __SetPageSlubFrozen(page);
1298 return 1; 1340 return 1;
1299 } 1341 }
@@ -1387,10 +1429,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1387static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1429static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1388{ 1430{
1389 struct page *page; 1431 struct page *page;
1390 int searchnode = (node == -1) ? numa_node_id() : node; 1432 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1391 1433
1392 page = get_partial_node(get_node(s, searchnode)); 1434 page = get_partial_node(get_node(s, searchnode));
1393 if (page || (flags & __GFP_THISNODE)) 1435 if (page || node != -1)
1394 return page; 1436 return page;
1395 1437
1396 return get_any_partial(s, flags); 1438 return get_any_partial(s, flags);
@@ -1404,6 +1446,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1404 * On exit the slab lock will have been dropped. 1446 * On exit the slab lock will have been dropped.
1405 */ 1447 */
1406static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1448static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1449 __releases(bitlock)
1407{ 1450{
1408 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1451 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1409 1452
@@ -1415,8 +1458,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1415 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1458 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1416 } else { 1459 } else {
1417 stat(s, DEACTIVATE_FULL); 1460 stat(s, DEACTIVATE_FULL);
1418 if (SLABDEBUG && PageSlubDebug(page) && 1461 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1419 (s->flags & SLAB_STORE_USER))
1420 add_full(n, page); 1462 add_full(n, page);
1421 } 1463 }
1422 slab_unlock(page); 1464 slab_unlock(page);
@@ -1447,6 +1489,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1447 * Remove the cpu slab 1489 * Remove the cpu slab
1448 */ 1490 */
1449static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1491static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1492 __releases(bitlock)
1450{ 1493{
1451 struct page *page = c->page; 1494 struct page *page = c->page;
1452 int tail = 1; 1495 int tail = 1;
@@ -1515,7 +1558,7 @@ static void flush_all(struct kmem_cache *s)
1515static inline int node_match(struct kmem_cache_cpu *c, int node) 1558static inline int node_match(struct kmem_cache_cpu *c, int node)
1516{ 1559{
1517#ifdef CONFIG_NUMA 1560#ifdef CONFIG_NUMA
1518 if (node != -1 && c->node != node) 1561 if (node != NUMA_NO_NODE && c->node != node)
1519 return 0; 1562 return 0;
1520#endif 1563#endif
1521 return 1; 1564 return 1;
@@ -1624,7 +1667,7 @@ load_freelist:
1624 object = c->page->freelist; 1667 object = c->page->freelist;
1625 if (unlikely(!object)) 1668 if (unlikely(!object))
1626 goto another_slab; 1669 goto another_slab;
1627 if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) 1670 if (kmem_cache_debug(s))
1628 goto debug; 1671 goto debug;
1629 1672
1630 c->freelist = get_freepointer(s, object); 1673 c->freelist = get_freepointer(s, object);
@@ -1647,6 +1690,7 @@ new_slab:
1647 goto load_freelist; 1690 goto load_freelist;
1648 } 1691 }
1649 1692
1693 gfpflags &= gfp_allowed_mask;
1650 if (gfpflags & __GFP_WAIT) 1694 if (gfpflags & __GFP_WAIT)
1651 local_irq_enable(); 1695 local_irq_enable();
1652 1696
@@ -1674,7 +1718,7 @@ debug:
1674 1718
1675 c->page->inuse++; 1719 c->page->inuse++;
1676 c->page->freelist = get_freepointer(s, object); 1720 c->page->freelist = get_freepointer(s, object);
1677 c->node = -1; 1721 c->node = NUMA_NO_NODE;
1678 goto unlock_out; 1722 goto unlock_out;
1679} 1723}
1680 1724
@@ -1695,12 +1739,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1695 struct kmem_cache_cpu *c; 1739 struct kmem_cache_cpu *c;
1696 unsigned long flags; 1740 unsigned long flags;
1697 1741
1698 gfpflags &= gfp_allowed_mask; 1742 if (slab_pre_alloc_hook(s, gfpflags))
1699
1700 lockdep_trace_alloc(gfpflags);
1701 might_sleep_if(gfpflags & __GFP_WAIT);
1702
1703 if (should_failslab(s->objsize, gfpflags, s->flags))
1704 return NULL; 1743 return NULL;
1705 1744
1706 local_irq_save(flags); 1745 local_irq_save(flags);
@@ -1719,15 +1758,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1719 if (unlikely(gfpflags & __GFP_ZERO) && object) 1758 if (unlikely(gfpflags & __GFP_ZERO) && object)
1720 memset(object, 0, s->objsize); 1759 memset(object, 0, s->objsize);
1721 1760
1722 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); 1761 slab_post_alloc_hook(s, gfpflags, object);
1723 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1724 1762
1725 return object; 1763 return object;
1726} 1764}
1727 1765
1728void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1766void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1729{ 1767{
1730 void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); 1768 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1731 1769
1732 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 1770 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
1733 1771
@@ -1738,7 +1776,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
1738#ifdef CONFIG_TRACING 1776#ifdef CONFIG_TRACING
1739void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1777void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1740{ 1778{
1741 return slab_alloc(s, gfpflags, -1, _RET_IP_); 1779 return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1742} 1780}
1743EXPORT_SYMBOL(kmem_cache_alloc_notrace); 1781EXPORT_SYMBOL(kmem_cache_alloc_notrace);
1744#endif 1782#endif
@@ -1754,7 +1792,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1754 return ret; 1792 return ret;
1755} 1793}
1756EXPORT_SYMBOL(kmem_cache_alloc_node); 1794EXPORT_SYMBOL(kmem_cache_alloc_node);
1757#endif
1758 1795
1759#ifdef CONFIG_TRACING 1796#ifdef CONFIG_TRACING
1760void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1797void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
@@ -1765,6 +1802,7 @@ void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1765} 1802}
1766EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 1803EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1767#endif 1804#endif
1805#endif
1768 1806
1769/* 1807/*
 1770 * Slow path handling. This may still be called frequently since objects 1808 * Slow path handling. This may still be called frequently since objects
@@ -1783,7 +1821,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1783 stat(s, FREE_SLOWPATH); 1821 stat(s, FREE_SLOWPATH);
1784 slab_lock(page); 1822 slab_lock(page);
1785 1823
1786 if (unlikely(SLABDEBUG && PageSlubDebug(page))) 1824 if (kmem_cache_debug(s))
1787 goto debug; 1825 goto debug;
1788 1826
1789checks_ok: 1827checks_ok:
@@ -1850,14 +1888,14 @@ static __always_inline void slab_free(struct kmem_cache *s,
1850 struct kmem_cache_cpu *c; 1888 struct kmem_cache_cpu *c;
1851 unsigned long flags; 1889 unsigned long flags;
1852 1890
1853 kmemleak_free_recursive(x, s->flags); 1891 slab_free_hook(s, x);
1892
1854 local_irq_save(flags); 1893 local_irq_save(flags);
1855 c = __this_cpu_ptr(s->cpu_slab); 1894 c = __this_cpu_ptr(s->cpu_slab);
1856 kmemcheck_slab_free(s, object, s->objsize); 1895
1857 debug_check_no_locks_freed(object, s->objsize); 1896 slab_free_hook_irq(s, x);
1858 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1897
1859 debug_check_no_obj_freed(object, s->objsize); 1898 if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
1860 if (likely(page == c->page && c->node >= 0)) {
1861 set_freepointer(s, object, c->freelist); 1899 set_freepointer(s, object, c->freelist);
1862 c->freelist = object; 1900 c->freelist = object;
1863 stat(s, FREE_FASTPATH); 1901 stat(s, FREE_FASTPATH);
@@ -2062,26 +2100,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2062#endif 2100#endif
2063} 2101}
2064 2102
2065static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); 2103static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2066
2067static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2068{ 2104{
2069 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) 2105 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2070 /* 2106 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2071 * Boot time creation of the kmalloc array. Use static per cpu data
2072 * since the per cpu allocator is not available yet.
2073 */
2074 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
2075 else
2076 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2077 2107
2078 if (!s->cpu_slab) 2108 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2079 return 0;
2080 2109
2081 return 1; 2110 return s->cpu_slab != NULL;
2082} 2111}
2083 2112
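alloc_kmem_cache_cpus() above drops the static kmalloc_percpu[] fallback array and instead asserts at build time that the early per-CPU reserve can hold the boot caches' kmem_cache_cpu state. A minimal C11 sketch of that compile-time check; the sizes, counts and struct below are invented, not the kernel's.

#include <stddef.h>

#define EARLY_RESERVE_BYTES   (12 * 1024)       /* hypothetical early pool */
#define NR_BOOT_CACHES        26                /* hypothetical cache count */

struct percpu_cache_state {
        void **freelist;
        unsigned int node;
        unsigned long tid;
};

/* Fails the build, not the boot, if the reserve is ever too small. */
_Static_assert(EARLY_RESERVE_BYTES >=
               NR_BOOT_CACHES * sizeof(struct percpu_cache_state),
               "early per-CPU reserve too small for boot caches");

int main(void) { return 0; }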
2084#ifdef CONFIG_NUMA 2113static struct kmem_cache *kmem_cache_node;
2114
2085/* 2115/*
2086 * No kmalloc_node yet so do it by hand. We know that this is the first 2116 * No kmalloc_node yet so do it by hand. We know that this is the first
2087 * slab on the node for this slabcache. There are no concurrent accesses 2117 * slab on the node for this slabcache. There are no concurrent accesses
@@ -2091,15 +2121,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2091 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2121 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2092 * memory on a fresh node that has no slab structures yet. 2122 * memory on a fresh node that has no slab structures yet.
2093 */ 2123 */
2094static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) 2124static void early_kmem_cache_node_alloc(int node)
2095{ 2125{
2096 struct page *page; 2126 struct page *page;
2097 struct kmem_cache_node *n; 2127 struct kmem_cache_node *n;
2098 unsigned long flags; 2128 unsigned long flags;
2099 2129
2100 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2130 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2101 2131
2102 page = new_slab(kmalloc_caches, gfpflags, node); 2132 page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2103 2133
2104 BUG_ON(!page); 2134 BUG_ON(!page);
2105 if (page_to_nid(page) != node) { 2135 if (page_to_nid(page) != node) {
@@ -2111,15 +2141,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
2111 2141
2112 n = page->freelist; 2142 n = page->freelist;
2113 BUG_ON(!n); 2143 BUG_ON(!n);
2114 page->freelist = get_freepointer(kmalloc_caches, n); 2144 page->freelist = get_freepointer(kmem_cache_node, n);
2115 page->inuse++; 2145 page->inuse++;
2116 kmalloc_caches->node[node] = n; 2146 kmem_cache_node->node[node] = n;
2117#ifdef CONFIG_SLUB_DEBUG 2147#ifdef CONFIG_SLUB_DEBUG
2118 init_object(kmalloc_caches, n, 1); 2148 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2119 init_tracking(kmalloc_caches, n); 2149 init_tracking(kmem_cache_node, n);
2120#endif 2150#endif
2121 init_kmem_cache_node(n, kmalloc_caches); 2151 init_kmem_cache_node(n, kmem_cache_node);
2122 inc_slabs_node(kmalloc_caches, node, page->objects); 2152 inc_slabs_node(kmem_cache_node, node, page->objects);
2123 2153
2124 /* 2154 /*
2125 * lockdep requires consistent irq usage for each lock 2155 * lockdep requires consistent irq usage for each lock
@@ -2137,13 +2167,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2137 2167
2138 for_each_node_state(node, N_NORMAL_MEMORY) { 2168 for_each_node_state(node, N_NORMAL_MEMORY) {
2139 struct kmem_cache_node *n = s->node[node]; 2169 struct kmem_cache_node *n = s->node[node];
2170
2140 if (n) 2171 if (n)
2141 kmem_cache_free(kmalloc_caches, n); 2172 kmem_cache_free(kmem_cache_node, n);
2173
2142 s->node[node] = NULL; 2174 s->node[node] = NULL;
2143 } 2175 }
2144} 2176}
2145 2177
2146static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2178static int init_kmem_cache_nodes(struct kmem_cache *s)
2147{ 2179{
2148 int node; 2180 int node;
2149 2181
@@ -2151,11 +2183,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2151 struct kmem_cache_node *n; 2183 struct kmem_cache_node *n;
2152 2184
2153 if (slab_state == DOWN) { 2185 if (slab_state == DOWN) {
2154 early_kmem_cache_node_alloc(gfpflags, node); 2186 early_kmem_cache_node_alloc(node);
2155 continue; 2187 continue;
2156 } 2188 }
2157 n = kmem_cache_alloc_node(kmalloc_caches, 2189 n = kmem_cache_alloc_node(kmem_cache_node,
2158 gfpflags, node); 2190 GFP_KERNEL, node);
2159 2191
2160 if (!n) { 2192 if (!n) {
2161 free_kmem_cache_nodes(s); 2193 free_kmem_cache_nodes(s);
@@ -2167,17 +2199,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2167 } 2199 }
2168 return 1; 2200 return 1;
2169} 2201}
2170#else
2171static void free_kmem_cache_nodes(struct kmem_cache *s)
2172{
2173}
2174
2175static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2176{
2177 init_kmem_cache_node(&s->local_node, s);
2178 return 1;
2179}
2180#endif
2181 2202
2182static void set_min_partial(struct kmem_cache *s, unsigned long min) 2203static void set_min_partial(struct kmem_cache *s, unsigned long min)
2183{ 2204{
@@ -2312,7 +2333,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2312 2333
2313} 2334}
2314 2335
2315static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2336static int kmem_cache_open(struct kmem_cache *s,
2316 const char *name, size_t size, 2337 const char *name, size_t size,
2317 size_t align, unsigned long flags, 2338 size_t align, unsigned long flags,
2318 void (*ctor)(void *)) 2339 void (*ctor)(void *))
@@ -2348,10 +2369,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2348#ifdef CONFIG_NUMA 2369#ifdef CONFIG_NUMA
2349 s->remote_node_defrag_ratio = 1000; 2370 s->remote_node_defrag_ratio = 1000;
2350#endif 2371#endif
2351 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2372 if (!init_kmem_cache_nodes(s))
2352 goto error; 2373 goto error;
2353 2374
2354 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2375 if (alloc_kmem_cache_cpus(s))
2355 return 1; 2376 return 1;
2356 2377
2357 free_kmem_cache_nodes(s); 2378 free_kmem_cache_nodes(s);
@@ -2414,9 +2435,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2414#ifdef CONFIG_SLUB_DEBUG 2435#ifdef CONFIG_SLUB_DEBUG
2415 void *addr = page_address(page); 2436 void *addr = page_address(page);
2416 void *p; 2437 void *p;
2417 long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), 2438 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2418 GFP_ATOMIC); 2439 sizeof(long), GFP_ATOMIC);
2419
2420 if (!map) 2440 if (!map)
2421 return; 2441 return;
2422 slab_err(s, page, "%s", text); 2442 slab_err(s, page, "%s", text);
@@ -2448,9 +2468,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2448 spin_lock_irqsave(&n->list_lock, flags); 2468 spin_lock_irqsave(&n->list_lock, flags);
2449 list_for_each_entry_safe(page, h, &n->partial, lru) { 2469 list_for_each_entry_safe(page, h, &n->partial, lru) {
2450 if (!page->inuse) { 2470 if (!page->inuse) {
2451 list_del(&page->lru); 2471 __remove_partial(n, page);
2452 discard_slab(s, page); 2472 discard_slab(s, page);
2453 n->nr_partial--;
2454 } else { 2473 } else {
2455 list_slab_objects(s, page, 2474 list_slab_objects(s, page,
2456 "Objects remaining on kmem_cache_close()"); 2475 "Objects remaining on kmem_cache_close()");
@@ -2490,7 +2509,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
2490 s->refcount--; 2509 s->refcount--;
2491 if (!s->refcount) { 2510 if (!s->refcount) {
2492 list_del(&s->list); 2511 list_del(&s->list);
2493 up_write(&slub_lock);
2494 if (kmem_cache_close(s)) { 2512 if (kmem_cache_close(s)) {
2495 printk(KERN_ERR "SLUB %s: %s called for cache that " 2513 printk(KERN_ERR "SLUB %s: %s called for cache that "
2496 "still has objects.\n", s->name, __func__); 2514 "still has objects.\n", s->name, __func__);
@@ -2499,8 +2517,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2499 if (s->flags & SLAB_DESTROY_BY_RCU) 2517 if (s->flags & SLAB_DESTROY_BY_RCU)
2500 rcu_barrier(); 2518 rcu_barrier();
2501 sysfs_slab_remove(s); 2519 sysfs_slab_remove(s);
2502 } else 2520 }
2503 up_write(&slub_lock); 2521 up_write(&slub_lock);
2504} 2522}
2505EXPORT_SYMBOL(kmem_cache_destroy); 2523EXPORT_SYMBOL(kmem_cache_destroy);
2506 2524
@@ -2508,9 +2526,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2508 * Kmalloc subsystem 2526 * Kmalloc subsystem
2509 *******************************************************************/ 2527 *******************************************************************/
2510 2528
2511struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; 2529struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
2512EXPORT_SYMBOL(kmalloc_caches); 2530EXPORT_SYMBOL(kmalloc_caches);
2513 2531
2532static struct kmem_cache *kmem_cache;
2533
2534#ifdef CONFIG_ZONE_DMA
2535static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
2536#endif
2537
2514static int __init setup_slub_min_order(char *str) 2538static int __init setup_slub_min_order(char *str)
2515{ 2539{
2516 get_option(&str, &slub_min_order); 2540 get_option(&str, &slub_min_order);
@@ -2547,116 +2571,29 @@ static int __init setup_slub_nomerge(char *str)
2547 2571
2548__setup("slub_nomerge", setup_slub_nomerge); 2572__setup("slub_nomerge", setup_slub_nomerge);
2549 2573
2550static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2574static struct kmem_cache *__init create_kmalloc_cache(const char *name,
2551 const char *name, int size, gfp_t gfp_flags) 2575 int size, unsigned int flags)
2552{ 2576{
2553 unsigned int flags = 0; 2577 struct kmem_cache *s;
2554 2578
2555 if (gfp_flags & SLUB_DMA) 2579 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
2556 flags = SLAB_CACHE_DMA;
2557 2580
2558 /* 2581 /*
2559 * This function is called with IRQs disabled during early-boot on 2582 * This function is called with IRQs disabled during early-boot on
2560 * single CPU so there's no need to take slub_lock here. 2583 * single CPU so there's no need to take slub_lock here.
2561 */ 2584 */
2562 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2585 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
2563 flags, NULL)) 2586 flags, NULL))
2564 goto panic; 2587 goto panic;
2565 2588
2566 list_add(&s->list, &slab_caches); 2589 list_add(&s->list, &slab_caches);
2567
2568 if (sysfs_slab_add(s))
2569 goto panic;
2570 return s; 2590 return s;
2571 2591
2572panic: 2592panic:
2573 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2593 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2594 return NULL;
2574} 2595}
2575 2596
2576#ifdef CONFIG_ZONE_DMA
2577static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2578
2579static void sysfs_add_func(struct work_struct *w)
2580{
2581 struct kmem_cache *s;
2582
2583 down_write(&slub_lock);
2584 list_for_each_entry(s, &slab_caches, list) {
2585 if (s->flags & __SYSFS_ADD_DEFERRED) {
2586 s->flags &= ~__SYSFS_ADD_DEFERRED;
2587 sysfs_slab_add(s);
2588 }
2589 }
2590 up_write(&slub_lock);
2591}
2592
2593static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2594
2595static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2596{
2597 struct kmem_cache *s;
2598 char *text;
2599 size_t realsize;
2600 unsigned long slabflags;
2601 int i;
2602
2603 s = kmalloc_caches_dma[index];
2604 if (s)
2605 return s;
2606
2607 /* Dynamically create dma cache */
2608 if (flags & __GFP_WAIT)
2609 down_write(&slub_lock);
2610 else {
2611 if (!down_write_trylock(&slub_lock))
2612 goto out;
2613 }
2614
2615 if (kmalloc_caches_dma[index])
2616 goto unlock_out;
2617
2618 realsize = kmalloc_caches[index].objsize;
2619 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2620 (unsigned int)realsize);
2621
2622 s = NULL;
2623 for (i = 0; i < KMALLOC_CACHES; i++)
2624 if (!kmalloc_caches[i].size)
2625 break;
2626
2627 BUG_ON(i >= KMALLOC_CACHES);
2628 s = kmalloc_caches + i;
2629
2630 /*
2631 * Must defer sysfs creation to a workqueue because we don't know
2632 * what context we are called from. Before sysfs comes up, we don't
2633 * need to do anything because our sysfs initcall will start by
2634 * adding all existing slabs to sysfs.
2635 */
2636 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2637 if (slab_state >= SYSFS)
2638 slabflags |= __SYSFS_ADD_DEFERRED;
2639
2640 if (!text || !kmem_cache_open(s, flags, text,
2641 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2642 s->size = 0;
2643 kfree(text);
2644 goto unlock_out;
2645 }
2646
2647 list_add(&s->list, &slab_caches);
2648 kmalloc_caches_dma[index] = s;
2649
2650 if (slab_state >= SYSFS)
2651 schedule_work(&sysfs_add_work);
2652
2653unlock_out:
2654 up_write(&slub_lock);
2655out:
2656 return kmalloc_caches_dma[index];
2657}
2658#endif
2659
2660/* 2597/*
2661 * Conversion table for small slabs sizes / 8 to the index in the 2598 * Conversion table for small slabs sizes / 8 to the index in the
2662 * kmalloc array. This is necessary for slabs < 192 since we have non power 2599 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2709,10 +2646,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2709 2646
2710#ifdef CONFIG_ZONE_DMA 2647#ifdef CONFIG_ZONE_DMA
2711 if (unlikely((flags & SLUB_DMA))) 2648 if (unlikely((flags & SLUB_DMA)))
2712 return dma_kmalloc_cache(index, flags); 2649 return kmalloc_dma_caches[index];
2713 2650
2714#endif 2651#endif
2715 return &kmalloc_caches[index]; 2652 return kmalloc_caches[index];
2716} 2653}
2717 2654
2718void *__kmalloc(size_t size, gfp_t flags) 2655void *__kmalloc(size_t size, gfp_t flags)
@@ -2728,7 +2665,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2728 if (unlikely(ZERO_OR_NULL_PTR(s))) 2665 if (unlikely(ZERO_OR_NULL_PTR(s)))
2729 return s; 2666 return s;
2730 2667
2731 ret = slab_alloc(s, flags, -1, _RET_IP_); 2668 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
2732 2669
2733 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 2670 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
2734 2671
@@ -2736,6 +2673,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2736} 2673}
2737EXPORT_SYMBOL(__kmalloc); 2674EXPORT_SYMBOL(__kmalloc);
2738 2675
2676#ifdef CONFIG_NUMA
2739static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2677static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2740{ 2678{
2741 struct page *page; 2679 struct page *page;
@@ -2750,7 +2688,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2750 return ptr; 2688 return ptr;
2751} 2689}
2752 2690
2753#ifdef CONFIG_NUMA
2754void *__kmalloc_node(size_t size, gfp_t flags, int node) 2691void *__kmalloc_node(size_t size, gfp_t flags, int node)
2755{ 2692{
2756 struct kmem_cache *s; 2693 struct kmem_cache *s;
@@ -2890,8 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2890 * may have freed the last object and be 2827 * may have freed the last object and be
2891 * waiting to release the slab. 2828 * waiting to release the slab.
2892 */ 2829 */
2893 list_del(&page->lru); 2830 __remove_partial(n, page);
2894 n->nr_partial--;
2895 slab_unlock(page); 2831 slab_unlock(page);
2896 discard_slab(s, page); 2832 discard_slab(s, page);
2897 } else { 2833 } else {
@@ -2915,7 +2851,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2915} 2851}
2916EXPORT_SYMBOL(kmem_cache_shrink); 2852EXPORT_SYMBOL(kmem_cache_shrink);
2917 2853
2918#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 2854#if defined(CONFIG_MEMORY_HOTPLUG)
2919static int slab_mem_going_offline_callback(void *arg) 2855static int slab_mem_going_offline_callback(void *arg)
2920{ 2856{
2921 struct kmem_cache *s; 2857 struct kmem_cache *s;
@@ -2957,7 +2893,7 @@ static void slab_mem_offline_callback(void *arg)
2957 BUG_ON(slabs_node(s, offline_node)); 2893 BUG_ON(slabs_node(s, offline_node));
2958 2894
2959 s->node[offline_node] = NULL; 2895 s->node[offline_node] = NULL;
2960 kmem_cache_free(kmalloc_caches, n); 2896 kmem_cache_free(kmem_cache_node, n);
2961 } 2897 }
2962 } 2898 }
2963 up_read(&slub_lock); 2899 up_read(&slub_lock);
@@ -2990,7 +2926,7 @@ static int slab_mem_going_online_callback(void *arg)
2990 * since memory is not yet available from the node that 2926 * since memory is not yet available from the node that
2991 * is brought up. 2927 * is brought up.
2992 */ 2928 */
2993 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); 2929 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
2994 if (!n) { 2930 if (!n) {
2995 ret = -ENOMEM; 2931 ret = -ENOMEM;
2996 goto out; 2932 goto out;
@@ -3036,46 +2972,92 @@ static int slab_memory_callback(struct notifier_block *self,
3036 * Basic setup of slabs 2972 * Basic setup of slabs
3037 *******************************************************************/ 2973 *******************************************************************/
3038 2974
2975/*
2976 * Used for early kmem_cache structures that were allocated using
2977 * the page allocator
2978 */
2979
2980static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
2981{
2982 int node;
2983
2984 list_add(&s->list, &slab_caches);
2985 s->refcount = -1;
2986
2987 for_each_node_state(node, N_NORMAL_MEMORY) {
2988 struct kmem_cache_node *n = get_node(s, node);
2989 struct page *p;
2990
2991 if (n) {
2992 list_for_each_entry(p, &n->partial, lru)
2993 p->slab = s;
2994
2995#ifdef CONFIG_SLAB_DEBUG
2996 list_for_each_entry(p, &n->full, lru)
2997 p->slab = s;
2998#endif
2999 }
3000 }
3001}
3002
3039void __init kmem_cache_init(void) 3003void __init kmem_cache_init(void)
3040{ 3004{
3041 int i; 3005 int i;
3042 int caches = 0; 3006 int caches = 0;
3007 struct kmem_cache *temp_kmem_cache;
3008 int order;
3009 struct kmem_cache *temp_kmem_cache_node;
3010 unsigned long kmalloc_size;
3011
3012 kmem_size = offsetof(struct kmem_cache, node) +
3013 nr_node_ids * sizeof(struct kmem_cache_node *);
3014
3015 /* Allocate two kmem_caches from the page allocator */
3016 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3017 order = get_order(2 * kmalloc_size);
3018 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
3043 3019
3044#ifdef CONFIG_NUMA
3045 /* 3020 /*
3046 * Must first have the slab cache available for the allocations of the 3021 * Must first have the slab cache available for the allocations of the
3047 * struct kmem_cache_node's. There is special bootstrap code in 3022 * struct kmem_cache_node's. There is special bootstrap code in
3048 * kmem_cache_open for slab_state == DOWN. 3023 * kmem_cache_open for slab_state == DOWN.
3049 */ 3024 */
3050 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3025 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3051 sizeof(struct kmem_cache_node), GFP_NOWAIT); 3026
3052 kmalloc_caches[0].refcount = -1; 3027 kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3053 caches++; 3028 sizeof(struct kmem_cache_node),
3029 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3054 3030
3055 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3031 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3056#endif
3057 3032
3058 /* Able to allocate the per node structures */ 3033 /* Able to allocate the per node structures */
3059 slab_state = PARTIAL; 3034 slab_state = PARTIAL;
3060 3035
3061 /* Caches that are not of the two-to-the-power-of size */ 3036 temp_kmem_cache = kmem_cache;
3062 if (KMALLOC_MIN_SIZE <= 32) { 3037 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3063 create_kmalloc_cache(&kmalloc_caches[1], 3038 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3064 "kmalloc-96", 96, GFP_NOWAIT); 3039 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3065 caches++; 3040 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3066 }
3067 if (KMALLOC_MIN_SIZE <= 64) {
3068 create_kmalloc_cache(&kmalloc_caches[2],
3069 "kmalloc-192", 192, GFP_NOWAIT);
3070 caches++;
3071 }
3072 3041
3073 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3042 /*
3074 create_kmalloc_cache(&kmalloc_caches[i], 3043 * Allocate kmem_cache_node properly from the kmem_cache slab.
3075 "kmalloc", 1 << i, GFP_NOWAIT); 3044 * kmem_cache_node is separately allocated so no need to
3076 caches++; 3045 * update any list pointers.
3077 } 3046 */
3047 temp_kmem_cache_node = kmem_cache_node;
3048
3049 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3050 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3051
3052 kmem_cache_bootstrap_fixup(kmem_cache_node);
3078 3053
3054 caches++;
3055 kmem_cache_bootstrap_fixup(kmem_cache);
3056 caches++;
3057 /* Free temporary boot structure */
3058 free_pages((unsigned long)temp_kmem_cache, order);
3059
3060 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3079 3061
3080 /* 3062 /*
3081 * Patch up the size_index table if we have strange large alignment 3063 * Patch up the size_index table if we have strange large alignment
@@ -3115,23 +3097,60 @@ void __init kmem_cache_init(void)
3115 size_index[size_index_elem(i)] = 8; 3097 size_index[size_index_elem(i)] = 8;
3116 } 3098 }
3117 3099
3100 /* Caches that are not of the two-to-the-power-of size */
3101 if (KMALLOC_MIN_SIZE <= 32) {
3102 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
3103 caches++;
3104 }
3105
3106 if (KMALLOC_MIN_SIZE <= 64) {
3107 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
3108 caches++;
3109 }
3110
3111 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3112 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
3113 caches++;
3114 }
3115
3118 slab_state = UP; 3116 slab_state = UP;
3119 3117
3120 /* Provide the correct kmalloc names now that the caches are up */ 3118 /* Provide the correct kmalloc names now that the caches are up */
3121 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) 3119 if (KMALLOC_MIN_SIZE <= 32) {
3122 kmalloc_caches[i]. name = 3120 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
3123 kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3121 BUG_ON(!kmalloc_caches[1]->name);
3122 }
3123
3124 if (KMALLOC_MIN_SIZE <= 64) {
3125 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
3126 BUG_ON(!kmalloc_caches[2]->name);
3127 }
3128
3129 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3130 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3131
3132 BUG_ON(!s);
3133 kmalloc_caches[i]->name = s;
3134 }
3124 3135
3125#ifdef CONFIG_SMP 3136#ifdef CONFIG_SMP
3126 register_cpu_notifier(&slab_notifier); 3137 register_cpu_notifier(&slab_notifier);
3127#endif 3138#endif
3128#ifdef CONFIG_NUMA
3129 kmem_size = offsetof(struct kmem_cache, node) +
3130 nr_node_ids * sizeof(struct kmem_cache_node *);
3131#else
3132 kmem_size = sizeof(struct kmem_cache);
3133#endif
3134 3139
3140#ifdef CONFIG_ZONE_DMA
3141 for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
3142 struct kmem_cache *s = kmalloc_caches[i];
3143
3144 if (s && s->size) {
3145 char *name = kasprintf(GFP_NOWAIT,
3146 "dma-kmalloc-%d", s->objsize);
3147
3148 BUG_ON(!name);
3149 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3150 s->objsize, SLAB_CACHE_DMA);
3151 }
3152 }
3153#endif
3135 printk(KERN_INFO 3154 printk(KERN_INFO
3136 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3155 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3137 " CPUs=%d, Nodes=%d\n", 3156 " CPUs=%d, Nodes=%d\n",
@@ -3209,6 +3228,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3209 size_t align, unsigned long flags, void (*ctor)(void *)) 3228 size_t align, unsigned long flags, void (*ctor)(void *))
3210{ 3229{
3211 struct kmem_cache *s; 3230 struct kmem_cache *s;
3231 char *n;
3212 3232
3213 if (WARN_ON(!name)) 3233 if (WARN_ON(!name))
3214 return NULL; 3234 return NULL;
@@ -3223,32 +3243,34 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3223 */ 3243 */
3224 s->objsize = max(s->objsize, (int)size); 3244 s->objsize = max(s->objsize, (int)size);
3225 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3245 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3226 up_write(&slub_lock);
3227 3246
3228 if (sysfs_slab_alias(s, name)) { 3247 if (sysfs_slab_alias(s, name)) {
3229 down_write(&slub_lock);
3230 s->refcount--; 3248 s->refcount--;
3231 up_write(&slub_lock);
3232 goto err; 3249 goto err;
3233 } 3250 }
3251 up_write(&slub_lock);
3234 return s; 3252 return s;
3235 } 3253 }
3236 3254
3255 n = kstrdup(name, GFP_KERNEL);
3256 if (!n)
3257 goto err;
3258
3237 s = kmalloc(kmem_size, GFP_KERNEL); 3259 s = kmalloc(kmem_size, GFP_KERNEL);
3238 if (s) { 3260 if (s) {
3239 if (kmem_cache_open(s, GFP_KERNEL, name, 3261 if (kmem_cache_open(s, n,
3240 size, align, flags, ctor)) { 3262 size, align, flags, ctor)) {
3241 list_add(&s->list, &slab_caches); 3263 list_add(&s->list, &slab_caches);
3242 up_write(&slub_lock);
3243 if (sysfs_slab_add(s)) { 3264 if (sysfs_slab_add(s)) {
3244 down_write(&slub_lock);
3245 list_del(&s->list); 3265 list_del(&s->list);
3246 up_write(&slub_lock); 3266 kfree(n);
3247 kfree(s); 3267 kfree(s);
3248 goto err; 3268 goto err;
3249 } 3269 }
3270 up_write(&slub_lock);
3250 return s; 3271 return s;
3251 } 3272 }
3273 kfree(n);
3252 kfree(s); 3274 kfree(s);
3253 } 3275 }
3254 up_write(&slub_lock); 3276 up_write(&slub_lock);
@@ -3312,7 +3334,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3312 if (unlikely(ZERO_OR_NULL_PTR(s))) 3334 if (unlikely(ZERO_OR_NULL_PTR(s)))
3313 return s; 3335 return s;
3314 3336
3315 ret = slab_alloc(s, gfpflags, -1, caller); 3337 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3316 3338
3317 /* Honor the call site pointer we recieved. */ 3339 /* Honor the call site pointer we recieved. */
3318 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3340 trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -3320,6 +3342,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3320 return ret; 3342 return ret;
3321} 3343}
3322 3344
3345#ifdef CONFIG_NUMA
3323void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3346void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3324 int node, unsigned long caller) 3347 int node, unsigned long caller)
3325{ 3348{
@@ -3348,8 +3371,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3348 3371
3349 return ret; 3372 return ret;
3350} 3373}
3374#endif
3351 3375
3352#ifdef CONFIG_SLUB_DEBUG 3376#ifdef CONFIG_SYSFS
3353static int count_inuse(struct page *page) 3377static int count_inuse(struct page *page)
3354{ 3378{
3355 return page->inuse; 3379 return page->inuse;
@@ -3359,7 +3383,9 @@ static int count_total(struct page *page)
3359{ 3383{
3360 return page->objects; 3384 return page->objects;
3361} 3385}
3386#endif
3362 3387
3388#ifdef CONFIG_SLUB_DEBUG
3363static int validate_slab(struct kmem_cache *s, struct page *page, 3389static int validate_slab(struct kmem_cache *s, struct page *page,
3364 unsigned long *map) 3390 unsigned long *map)
3365{ 3391{
@@ -3395,16 +3421,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3395 } else 3421 } else
3396 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", 3422 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3397 s->name, page); 3423 s->name, page);
3398
3399 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3400 if (!PageSlubDebug(page))
3401 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3402 "on slab 0x%p\n", s->name, page);
3403 } else {
3404 if (PageSlubDebug(page))
3405 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3406 "slab 0x%p\n", s->name, page);
3407 }
3408} 3424}
3409 3425
3410static int validate_slab_node(struct kmem_cache *s, 3426static int validate_slab_node(struct kmem_cache *s,
@@ -3460,65 +3476,6 @@ static long validate_slab_cache(struct kmem_cache *s)
3460 kfree(map); 3476 kfree(map);
3461 return count; 3477 return count;
3462} 3478}
3463
3464#ifdef SLUB_RESILIENCY_TEST
3465static void resiliency_test(void)
3466{
3467 u8 *p;
3468
3469 printk(KERN_ERR "SLUB resiliency testing\n");
3470 printk(KERN_ERR "-----------------------\n");
3471 printk(KERN_ERR "A. Corruption after allocation\n");
3472
3473 p = kzalloc(16, GFP_KERNEL);
3474 p[16] = 0x12;
3475 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3476 " 0x12->0x%p\n\n", p + 16);
3477
3478 validate_slab_cache(kmalloc_caches + 4);
3479
3480 /* Hmmm... The next two are dangerous */
3481 p = kzalloc(32, GFP_KERNEL);
3482 p[32 + sizeof(void *)] = 0x34;
3483 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3484 " 0x34 -> -0x%p\n", p);
3485 printk(KERN_ERR
3486 "If allocated object is overwritten then not detectable\n\n");
3487
3488 validate_slab_cache(kmalloc_caches + 5);
3489 p = kzalloc(64, GFP_KERNEL);
3490 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3491 *p = 0x56;
3492 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3493 p);
3494 printk(KERN_ERR
3495 "If allocated object is overwritten then not detectable\n\n");
3496 validate_slab_cache(kmalloc_caches + 6);
3497
3498 printk(KERN_ERR "\nB. Corruption after free\n");
3499 p = kzalloc(128, GFP_KERNEL);
3500 kfree(p);
3501 *p = 0x78;
3502 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3503 validate_slab_cache(kmalloc_caches + 7);
3504
3505 p = kzalloc(256, GFP_KERNEL);
3506 kfree(p);
3507 p[50] = 0x9a;
3508 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3509 p);
3510 validate_slab_cache(kmalloc_caches + 8);
3511
3512 p = kzalloc(512, GFP_KERNEL);
3513 kfree(p);
3514 p[512] = 0xab;
3515 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3516 validate_slab_cache(kmalloc_caches + 9);
3517}
3518#else
3519static void resiliency_test(void) {};
3520#endif
3521
3522/* 3479/*
3523 * Generate lists of code addresses where slabcache objects are allocated 3480 * Generate lists of code addresses where slabcache objects are allocated
3524 * and freed. 3481 * and freed.
@@ -3647,7 +3604,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3647 3604
3648static void process_slab(struct loc_track *t, struct kmem_cache *s, 3605static void process_slab(struct loc_track *t, struct kmem_cache *s,
3649 struct page *page, enum track_item alloc, 3606 struct page *page, enum track_item alloc,
3650 long *map) 3607 unsigned long *map)
3651{ 3608{
3652 void *addr = page_address(page); 3609 void *addr = page_address(page);
3653 void *p; 3610 void *p;
@@ -3747,7 +3704,71 @@ static int list_locations(struct kmem_cache *s, char *buf,
3747 len += sprintf(buf, "No data\n"); 3704 len += sprintf(buf, "No data\n");
3748 return len; 3705 return len;
3749} 3706}
3707#endif
3708
3709#ifdef SLUB_RESILIENCY_TEST
3710static void resiliency_test(void)
3711{
3712 u8 *p;
3713
3714 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
3715
3716 printk(KERN_ERR "SLUB resiliency testing\n");
3717 printk(KERN_ERR "-----------------------\n");
3718 printk(KERN_ERR "A. Corruption after allocation\n");
3719
3720 p = kzalloc(16, GFP_KERNEL);
3721 p[16] = 0x12;
3722 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3723 " 0x12->0x%p\n\n", p + 16);
3724
3725 validate_slab_cache(kmalloc_caches[4]);
3726
3727 /* Hmmm... The next two are dangerous */
3728 p = kzalloc(32, GFP_KERNEL);
3729 p[32 + sizeof(void *)] = 0x34;
3730 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3731 " 0x34 -> -0x%p\n", p);
3732 printk(KERN_ERR
3733 "If allocated object is overwritten then not detectable\n\n");
3734
3735 validate_slab_cache(kmalloc_caches[5]);
3736 p = kzalloc(64, GFP_KERNEL);
3737 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3738 *p = 0x56;
3739 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3740 p);
3741 printk(KERN_ERR
3742 "If allocated object is overwritten then not detectable\n\n");
3743 validate_slab_cache(kmalloc_caches[6]);
3744
3745 printk(KERN_ERR "\nB. Corruption after free\n");
3746 p = kzalloc(128, GFP_KERNEL);
3747 kfree(p);
3748 *p = 0x78;
3749 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3750 validate_slab_cache(kmalloc_caches[7]);
3751
3752 p = kzalloc(256, GFP_KERNEL);
3753 kfree(p);
3754 p[50] = 0x9a;
3755 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3756 p);
3757 validate_slab_cache(kmalloc_caches[8]);
3758
3759 p = kzalloc(512, GFP_KERNEL);
3760 kfree(p);
3761 p[512] = 0xab;
3762 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3763 validate_slab_cache(kmalloc_caches[9]);
3764}
3765#else
3766#ifdef CONFIG_SYSFS
3767static void resiliency_test(void) {};
3768#endif
3769#endif
3750 3770
3771#ifdef CONFIG_SYSFS
3751enum slab_stat_type { 3772enum slab_stat_type {
3752 SL_ALL, /* All slabs */ 3773 SL_ALL, /* All slabs */
3753 SL_PARTIAL, /* Only partially allocated slabs */ 3774 SL_PARTIAL, /* Only partially allocated slabs */
@@ -3800,6 +3821,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3800 } 3821 }
3801 } 3822 }
3802 3823
3824 down_read(&slub_lock);
3825#ifdef CONFIG_SLUB_DEBUG
3803 if (flags & SO_ALL) { 3826 if (flags & SO_ALL) {
3804 for_each_node_state(node, N_NORMAL_MEMORY) { 3827 for_each_node_state(node, N_NORMAL_MEMORY) {
3805 struct kmem_cache_node *n = get_node(s, node); 3828 struct kmem_cache_node *n = get_node(s, node);
@@ -3816,7 +3839,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3816 nodes[node] += x; 3839 nodes[node] += x;
3817 } 3840 }
3818 3841
3819 } else if (flags & SO_PARTIAL) { 3842 } else
3843#endif
3844 if (flags & SO_PARTIAL) {
3820 for_each_node_state(node, N_NORMAL_MEMORY) { 3845 for_each_node_state(node, N_NORMAL_MEMORY) {
3821 struct kmem_cache_node *n = get_node(s, node); 3846 struct kmem_cache_node *n = get_node(s, node);
3822 3847
@@ -3841,6 +3866,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3841 return x + sprintf(buf + x, "\n"); 3866 return x + sprintf(buf + x, "\n");
3842} 3867}
3843 3868
3869#ifdef CONFIG_SLUB_DEBUG
3844static int any_slab_objects(struct kmem_cache *s) 3870static int any_slab_objects(struct kmem_cache *s)
3845{ 3871{
3846 int node; 3872 int node;
@@ -3856,6 +3882,7 @@ static int any_slab_objects(struct kmem_cache *s)
3856 } 3882 }
3857 return 0; 3883 return 0;
3858} 3884}
3885#endif
3859 3886
3860#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 3887#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3861#define to_slab(n) container_of(n, struct kmem_cache, kobj); 3888#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -3957,12 +3984,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3957} 3984}
3958SLAB_ATTR_RO(aliases); 3985SLAB_ATTR_RO(aliases);
3959 3986
3960static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3961{
3962 return show_slab_objects(s, buf, SO_ALL);
3963}
3964SLAB_ATTR_RO(slabs);
3965
3966static ssize_t partial_show(struct kmem_cache *s, char *buf) 3987static ssize_t partial_show(struct kmem_cache *s, char *buf)
3967{ 3988{
3968 return show_slab_objects(s, buf, SO_PARTIAL); 3989 return show_slab_objects(s, buf, SO_PARTIAL);
@@ -3987,93 +4008,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
3987} 4008}
3988SLAB_ATTR_RO(objects_partial); 4009SLAB_ATTR_RO(objects_partial);
3989 4010
3990static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4011static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3991{
3992 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
3993}
3994SLAB_ATTR_RO(total_objects);
3995
3996static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3997{ 4012{
3998 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4013 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3999} 4014}
4000 4015
4001static ssize_t sanity_checks_store(struct kmem_cache *s, 4016static ssize_t reclaim_account_store(struct kmem_cache *s,
4002 const char *buf, size_t length) 4017 const char *buf, size_t length)
4003{ 4018{
4004 s->flags &= ~SLAB_DEBUG_FREE; 4019 s->flags &= ~SLAB_RECLAIM_ACCOUNT;
4005 if (buf[0] == '1') 4020 if (buf[0] == '1')
4006 s->flags |= SLAB_DEBUG_FREE; 4021 s->flags |= SLAB_RECLAIM_ACCOUNT;
4007 return length; 4022 return length;
4008} 4023}
4009SLAB_ATTR(sanity_checks); 4024SLAB_ATTR(reclaim_account);
4010 4025
4011static ssize_t trace_show(struct kmem_cache *s, char *buf) 4026static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4012{ 4027{
4013 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4028 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4014} 4029}
4030SLAB_ATTR_RO(hwcache_align);
4015 4031
4016static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4032#ifdef CONFIG_ZONE_DMA
4017 size_t length) 4033static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4018{ 4034{
4019 s->flags &= ~SLAB_TRACE; 4035 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4020 if (buf[0] == '1')
4021 s->flags |= SLAB_TRACE;
4022 return length;
4023} 4036}
4024SLAB_ATTR(trace); 4037SLAB_ATTR_RO(cache_dma);
4038#endif
4025 4039
4026#ifdef CONFIG_FAILSLAB 4040static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4027static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4028{ 4041{
4029 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4042 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4030} 4043}
4044SLAB_ATTR_RO(destroy_by_rcu);
4031 4045
4032static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4046#ifdef CONFIG_SLUB_DEBUG
4033 size_t length) 4047static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4034{ 4048{
4035 s->flags &= ~SLAB_FAILSLAB; 4049 return show_slab_objects(s, buf, SO_ALL);
4036 if (buf[0] == '1')
4037 s->flags |= SLAB_FAILSLAB;
4038 return length;
4039} 4050}
4040SLAB_ATTR(failslab); 4051SLAB_ATTR_RO(slabs);
4041#endif
4042 4052
4043static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4053static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4044{ 4054{
4045 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4055 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4046} 4056}
4057SLAB_ATTR_RO(total_objects);
4047 4058
4048static ssize_t reclaim_account_store(struct kmem_cache *s, 4059static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4049 const char *buf, size_t length)
4050{ 4060{
4051 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4061 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4052 if (buf[0] == '1')
4053 s->flags |= SLAB_RECLAIM_ACCOUNT;
4054 return length;
4055} 4062}
4056SLAB_ATTR(reclaim_account);
4057 4063
4058static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4064static ssize_t sanity_checks_store(struct kmem_cache *s,
4065 const char *buf, size_t length)
4059{ 4066{
4060 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4067 s->flags &= ~SLAB_DEBUG_FREE;
4068 if (buf[0] == '1')
4069 s->flags |= SLAB_DEBUG_FREE;
4070 return length;
4061} 4071}
4062SLAB_ATTR_RO(hwcache_align); 4072SLAB_ATTR(sanity_checks);
4063 4073
4064#ifdef CONFIG_ZONE_DMA 4074static ssize_t trace_show(struct kmem_cache *s, char *buf)
4065static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4066{ 4075{
4067 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4076 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4068} 4077}
4069SLAB_ATTR_RO(cache_dma);
4070#endif
4071 4078
4072static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4079static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4080 size_t length)
4073{ 4081{
4074 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4082 s->flags &= ~SLAB_TRACE;
4083 if (buf[0] == '1')
4084 s->flags |= SLAB_TRACE;
4085 return length;
4075} 4086}
4076SLAB_ATTR_RO(destroy_by_rcu); 4087SLAB_ATTR(trace);
4077 4088
4078static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4089static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4079{ 4090{
@@ -4151,6 +4162,40 @@ static ssize_t validate_store(struct kmem_cache *s,
4151} 4162}
4152SLAB_ATTR(validate); 4163SLAB_ATTR(validate);
4153 4164
4165static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4166{
4167 if (!(s->flags & SLAB_STORE_USER))
4168 return -ENOSYS;
4169 return list_locations(s, buf, TRACK_ALLOC);
4170}
4171SLAB_ATTR_RO(alloc_calls);
4172
4173static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4174{
4175 if (!(s->flags & SLAB_STORE_USER))
4176 return -ENOSYS;
4177 return list_locations(s, buf, TRACK_FREE);
4178}
4179SLAB_ATTR_RO(free_calls);
4180#endif /* CONFIG_SLUB_DEBUG */
4181
4182#ifdef CONFIG_FAILSLAB
4183static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4184{
4185 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4186}
4187
4188static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4189 size_t length)
4190{
4191 s->flags &= ~SLAB_FAILSLAB;
4192 if (buf[0] == '1')
4193 s->flags |= SLAB_FAILSLAB;
4194 return length;
4195}
4196SLAB_ATTR(failslab);
4197#endif
4198
4154static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4199static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4155{ 4200{
4156 return 0; 4201 return 0;
@@ -4170,22 +4215,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
4170} 4215}
4171SLAB_ATTR(shrink); 4216SLAB_ATTR(shrink);
4172 4217
4173static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4174{
4175 if (!(s->flags & SLAB_STORE_USER))
4176 return -ENOSYS;
4177 return list_locations(s, buf, TRACK_ALLOC);
4178}
4179SLAB_ATTR_RO(alloc_calls);
4180
4181static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4182{
4183 if (!(s->flags & SLAB_STORE_USER))
4184 return -ENOSYS;
4185 return list_locations(s, buf, TRACK_FREE);
4186}
4187SLAB_ATTR_RO(free_calls);
4188
4189#ifdef CONFIG_NUMA 4218#ifdef CONFIG_NUMA
4190static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4219static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4191{ 4220{
@@ -4291,25 +4320,27 @@ static struct attribute *slab_attrs[] = {
4291 &min_partial_attr.attr, 4320 &min_partial_attr.attr,
4292 &objects_attr.attr, 4321 &objects_attr.attr,
4293 &objects_partial_attr.attr, 4322 &objects_partial_attr.attr,
4294 &total_objects_attr.attr,
4295 &slabs_attr.attr,
4296 &partial_attr.attr, 4323 &partial_attr.attr,
4297 &cpu_slabs_attr.attr, 4324 &cpu_slabs_attr.attr,
4298 &ctor_attr.attr, 4325 &ctor_attr.attr,
4299 &aliases_attr.attr, 4326 &aliases_attr.attr,
4300 &align_attr.attr, 4327 &align_attr.attr,
4301 &sanity_checks_attr.attr,
4302 &trace_attr.attr,
4303 &hwcache_align_attr.attr, 4328 &hwcache_align_attr.attr,
4304 &reclaim_account_attr.attr, 4329 &reclaim_account_attr.attr,
4305 &destroy_by_rcu_attr.attr, 4330 &destroy_by_rcu_attr.attr,
4331 &shrink_attr.attr,
4332#ifdef CONFIG_SLUB_DEBUG
4333 &total_objects_attr.attr,
4334 &slabs_attr.attr,
4335 &sanity_checks_attr.attr,
4336 &trace_attr.attr,
4306 &red_zone_attr.attr, 4337 &red_zone_attr.attr,
4307 &poison_attr.attr, 4338 &poison_attr.attr,
4308 &store_user_attr.attr, 4339 &store_user_attr.attr,
4309 &validate_attr.attr, 4340 &validate_attr.attr,
4310 &shrink_attr.attr,
4311 &alloc_calls_attr.attr, 4341 &alloc_calls_attr.attr,
4312 &free_calls_attr.attr, 4342 &free_calls_attr.attr,
4343#endif
4313#ifdef CONFIG_ZONE_DMA 4344#ifdef CONFIG_ZONE_DMA
4314 &cache_dma_attr.attr, 4345 &cache_dma_attr.attr,
4315#endif 4346#endif
@@ -4389,6 +4420,7 @@ static void kmem_cache_release(struct kobject *kobj)
4389{ 4420{
4390 struct kmem_cache *s = to_slab(kobj); 4421 struct kmem_cache *s = to_slab(kobj);
4391 4422
4423 kfree(s->name);
4392 kfree(s); 4424 kfree(s);
4393} 4425}
4394 4426
@@ -4504,6 +4536,13 @@ static int sysfs_slab_add(struct kmem_cache *s)
4504 4536
4505static void sysfs_slab_remove(struct kmem_cache *s) 4537static void sysfs_slab_remove(struct kmem_cache *s)
4506{ 4538{
4539 if (slab_state < SYSFS)
4540 /*
4541 * Sysfs has not been setup yet so no need to remove the
4542 * cache from sysfs.
4543 */
4544 return;
4545
4507 kobject_uevent(&s->kobj, KOBJ_REMOVE); 4546 kobject_uevent(&s->kobj, KOBJ_REMOVE);
4508 kobject_del(&s->kobj); 4547 kobject_del(&s->kobj);
4509 kobject_put(&s->kobj); 4548 kobject_put(&s->kobj);
@@ -4549,8 +4588,11 @@ static int __init slab_sysfs_init(void)
4549 struct kmem_cache *s; 4588 struct kmem_cache *s;
4550 int err; 4589 int err;
4551 4590
4591 down_write(&slub_lock);
4592
4552 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 4593 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
4553 if (!slab_kset) { 4594 if (!slab_kset) {
4595 up_write(&slub_lock);
4554 printk(KERN_ERR "Cannot register slab subsystem.\n"); 4596 printk(KERN_ERR "Cannot register slab subsystem.\n");
4555 return -ENOSYS; 4597 return -ENOSYS;
4556 } 4598 }
@@ -4575,12 +4617,13 @@ static int __init slab_sysfs_init(void)
4575 kfree(al); 4617 kfree(al);
4576 } 4618 }
4577 4619
4620 up_write(&slub_lock);
4578 resiliency_test(); 4621 resiliency_test();
4579 return 0; 4622 return 0;
4580} 4623}
4581 4624
4582__initcall(slab_sysfs_init); 4625__initcall(slab_sysfs_init);
4583#endif 4626#endif /* CONFIG_SYSFS */
4584 4627
4585/* 4628/*
4586 * The /proc/slabinfo ABI 4629 * The /proc/slabinfo ABI
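Editor's note before the next file: one practical consequence of the mm/slub.c hunks above is that kmalloc_caches becomes an array of struct kmem_cache pointers (with a separate kmalloc_dma_caches[] for ZONE_DMA), so lookups now dereference an entry instead of taking its address. A hedged sketch mirroring the updated get_slab() logic; the helper name is illustrative and the identifiers are SLUB-internal:

/* Hypothetical lookup helper mirroring the patched get_slab():
 * kmalloc_caches[]/kmalloc_dma_caches[] now hold pointers, so the entry
 * is returned directly rather than via &kmalloc_caches[index]. */
static struct kmem_cache *example_kmalloc_cache(int index, gfp_t flags)
{
#ifdef CONFIG_ZONE_DMA
	if (unlikely(flags & SLUB_DMA))
		return kmalloc_dma_caches[index];
#endif
	return kmalloc_caches[index];
}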
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd67fa4..29d6cbffb28 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
220 220
221 if (vmemmap_buf_start) { 221 if (vmemmap_buf_start) {
222 /* need to free left buf */ 222 /* need to free left buf */
223#ifdef CONFIG_NO_BOOTMEM
224 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
225 if (vmemmap_buf_start < vmemmap_buf) {
226 char name[15];
227
228 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
229 reserve_early_without_check(__pa(vmemmap_buf_start),
230 __pa(vmemmap_buf), name);
231 }
232#else
233 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 223 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
234#endif
235 vmemmap_buf = NULL; 224 vmemmap_buf = NULL;
236 vmemmap_buf_end = NULL; 225 vmemmap_buf_end = NULL;
237 } 226 }
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a5..3f4854205b1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
378 378
379 pagevec_free(&pages_to_free); 379 pagevec_free(&pages_to_free);
380} 380}
381EXPORT_SYMBOL(release_pages);
381 382
382/* 383/*
383 * The pages which we're about to release may be in the deferred lru-addition 384 * The pages which we're about to release may be in the deferred lru-addition
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 03aa2d55f1a..67ddaaf98c7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h>
33 34
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
@@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
58 59
59static DEFINE_MUTEX(swapon_mutex); 60static DEFINE_MUTEX(swapon_mutex);
60 61
62static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
63/* Activity counter to indicate that a swapon or swapoff has occurred */
64static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
61static inline unsigned char swap_count(unsigned char ent) 66static inline unsigned char swap_count(unsigned char ent)
62{ 67{
63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 68 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -139,8 +144,7 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 144 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 145 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 146 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, 147 nr_blocks, GFP_KERNEL, 0);
143 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
144 if (err) 148 if (err)
145 return err; 149 return err;
146 cond_resched(); 150 cond_resched();
@@ -151,8 +155,7 @@ static int discard_swap(struct swap_info_struct *si)
151 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 155 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
152 156
153 err = blkdev_issue_discard(si->bdev, start_block, 157 err = blkdev_issue_discard(si->bdev, start_block,
154 nr_blocks, GFP_KERNEL, 158 nr_blocks, GFP_KERNEL, 0);
155 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
156 if (err) 159 if (err)
157 break; 160 break;
158 161
@@ -191,8 +194,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
191 start_block <<= PAGE_SHIFT - 9; 194 start_block <<= PAGE_SHIFT - 9;
192 nr_blocks <<= PAGE_SHIFT - 9; 195 nr_blocks <<= PAGE_SHIFT - 9;
193 if (blkdev_issue_discard(si->bdev, start_block, 196 if (blkdev_issue_discard(si->bdev, start_block,
194 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | 197 nr_blocks, GFP_NOIO, 0))
195 BLKDEV_IFL_BARRIER))
196 break; 198 break;
197 } 199 }
198 200
@@ -686,6 +688,24 @@ int try_to_free_swap(struct page *page)
686 if (page_swapcount(page)) 688 if (page_swapcount(page))
687 return 0; 689 return 0;
688 690
691 /*
692 * Once hibernation has begun to create its image of memory,
693 * there's a danger that one of the calls to try_to_free_swap()
694 * - most probably a call from __try_to_reclaim_swap() while
695 * hibernation is allocating its own swap pages for the image,
696 * but conceivably even a call from memory reclaim - will free
697 * the swap from a page which has already been recorded in the
698 * image as a clean swapcache page, and then reuse its swap for
699 * another page of the image. On waking from hibernation, the
700 * original page might be freed under memory pressure, then
701 * later read back in from swap, now with the wrong data.
702 *
703 * Hibernation clears bits from gfp_allowed_mask to prevent
704 * memory reclaim from writing to disk, so check that here.
705 */
706 if (!(gfp_allowed_mask & __GFP_IO))
707 return 0;
708
689 delete_from_swap_cache(page); 709 delete_from_swap_cache(page);
690 SetPageDirty(page); 710 SetPageDirty(page);
691 return 1; 711 return 1;
@@ -1665,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1665 } 1685 }
1666 filp_close(swap_file, NULL); 1686 filp_close(swap_file, NULL);
1667 err = 0; 1687 err = 0;
1688 atomic_inc(&proc_poll_event);
1689 wake_up_interruptible(&proc_poll_wait);
1668 1690
1669out_dput: 1691out_dput:
1670 filp_close(victim, NULL); 1692 filp_close(victim, NULL);
@@ -1673,6 +1695,25 @@ out:
1673} 1695}
1674 1696
1675#ifdef CONFIG_PROC_FS 1697#ifdef CONFIG_PROC_FS
1698struct proc_swaps {
1699 struct seq_file seq;
1700 int event;
1701};
1702
1703static unsigned swaps_poll(struct file *file, poll_table *wait)
1704{
1705 struct proc_swaps *s = file->private_data;
1706
1707 poll_wait(file, &proc_poll_wait, wait);
1708
1709 if (s->event != atomic_read(&proc_poll_event)) {
1710 s->event = atomic_read(&proc_poll_event);
1711 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1712 }
1713
1714 return POLLIN | POLLRDNORM;
1715}
1716
1676/* iterator */ 1717/* iterator */
1677static void *swap_start(struct seq_file *swap, loff_t *pos) 1718static void *swap_start(struct seq_file *swap, loff_t *pos)
1678{ 1719{
@@ -1756,7 +1797,24 @@ static const struct seq_operations swaps_op = {
1756 1797
1757static int swaps_open(struct inode *inode, struct file *file) 1798static int swaps_open(struct inode *inode, struct file *file)
1758{ 1799{
1759 return seq_open(file, &swaps_op); 1800 struct proc_swaps *s;
1801 int ret;
1802
1803 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1804 if (!s)
1805 return -ENOMEM;
1806
1807 file->private_data = s;
1808
1809 ret = seq_open(file, &swaps_op);
1810 if (ret) {
1811 kfree(s);
1812 return ret;
1813 }
1814
1815 s->seq.private = s;
1816 s->event = atomic_read(&proc_poll_event);
1817 return ret;
1760} 1818}
1761 1819
1762static const struct file_operations proc_swaps_operations = { 1820static const struct file_operations proc_swaps_operations = {
@@ -1764,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = {
1764 .read = seq_read, 1822 .read = seq_read,
1765 .llseek = seq_lseek, 1823 .llseek = seq_lseek,
1766 .release = seq_release, 1824 .release = seq_release,
1825 .poll = swaps_poll,
1767}; 1826};
1768 1827
1769static int __init procswaps_init(void) 1828static int __init procswaps_init(void)
@@ -2032,7 +2091,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2032 p->flags |= SWP_SOLIDSTATE; 2091 p->flags |= SWP_SOLIDSTATE;
2033 p->cluster_next = 1 + (random32() % p->highest_bit); 2092 p->cluster_next = 1 + (random32() % p->highest_bit);
2034 } 2093 }
2035 if (discard_swap(p) == 0) 2094 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2036 p->flags |= SWP_DISCARDABLE; 2095 p->flags |= SWP_DISCARDABLE;
2037 } 2096 }
2038 2097
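Editor's note: the hunk above makes swap discard opt-in; SWP_DISCARDABLE is only set when the caller passes SWAP_FLAG_DISCARD to swapon(2). A hedged userspace sketch, not part of the patch; the flag value is taken from the kernel's swap.h and defined locally in case the libc headers lack it:

#include <stdio.h>
#include <sys/swap.h>

#ifndef SWAP_FLAG_DISCARD
#define SWAP_FLAG_DISCARD 0x10000	/* value from the kernel's swap.h */
#endif

int main(int argc, char **argv)
{
	/* Explicitly request discard; without this flag the kernel no
	 * longer marks the device SWP_DISCARDABLE. */
	if (argc < 2 || swapon(argv[1], SWAP_FLAG_DISCARD) != 0) {
		perror("swapon");
		return 1;
	}
	return 0;
}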
@@ -2069,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2069 swap_info[prev]->next = type; 2128 swap_info[prev]->next = type;
2070 spin_unlock(&swap_lock); 2129 spin_unlock(&swap_lock);
2071 mutex_unlock(&swapon_mutex); 2130 mutex_unlock(&swapon_mutex);
2131 atomic_inc(&proc_poll_event);
2132 wake_up_interruptible(&proc_poll_wait);
2133
2072 error = 0; 2134 error = 0;
2073 goto out; 2135 goto out;
2074bad_swap: 2136bad_swap:
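Editor's note: the new proc_poll_event counter and swaps_poll() hook let userspace block until a swapon or swapoff occurs. A minimal monitor sketch (an editor's illustration, not code from this patch), using the usual procfs pattern of polling for POLLERR|POLLPRI:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/proc/swaps", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };

	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		/* Read the current table so the next poll() blocks until
		 * swaps_poll() reports a change via POLLERR|POLLPRI. */
		lseek(fd, 0, SEEK_SET);
		while (read(fd, buf, sizeof(buf)) > 0)
			;
		if (poll(&pfd, 1, -1) > 0 && (pfd.revents & (POLLERR | POLLPRI)))
			printf("swap configuration changed\n");
	}
}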
diff --git a/mm/truncate.c b/mm/truncate.c
index 937571b8b23..ba887bff48c 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -541,28 +541,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
541EXPORT_SYMBOL(truncate_pagecache); 541EXPORT_SYMBOL(truncate_pagecache);
542 542
543/** 543/**
544 * truncate_setsize - update inode and pagecache for a new file size
545 * @inode: inode
546 * @newsize: new file size
547 *
 548 * truncate_setsize updates i_size and performs pagecache
 549 * truncation (if necessary) for a new file size. It will
 550 * typically be called from the filesystem's setattr function when
551 * ATTR_SIZE is passed in.
552 *
553 * Must be called with inode_mutex held and after all filesystem
554 * specific block truncation has been performed.
555 */
556void truncate_setsize(struct inode *inode, loff_t newsize)
557{
558 loff_t oldsize;
559
560 oldsize = inode->i_size;
561 i_size_write(inode, newsize);
562
563 truncate_pagecache(inode, oldsize, newsize);
564}
565EXPORT_SYMBOL(truncate_setsize);
566
567/**
544 * vmtruncate - unmap mappings "freed" by truncate() syscall 568 * vmtruncate - unmap mappings "freed" by truncate() syscall
545 * @inode: inode of the file used 569 * @inode: inode of the file used
546 * @offset: file offset to start truncating 570 * @offset: file offset to start truncating
547 * 571 *
548 * NOTE! We have to be ready to update the memory sharing 572 * This function is deprecated and truncate_setsize or truncate_pagecache
549 * between the file and the memory map for a potential last 573 * should be used instead, together with filesystem specific block truncation.
550 * incomplete page. Ugly, but necessary.
551 *
552 * This function is deprecated and simple_setsize or truncate_pagecache
553 * should be used instead.
554 */ 574 */
555int vmtruncate(struct inode *inode, loff_t offset) 575int vmtruncate(struct inode *inode, loff_t offset)
556{ 576{
557 int error; 577 int error;
558 578
559 error = simple_setsize(inode, offset); 579 error = inode_newsize_ok(inode, offset);
560 if (error) 580 if (error)
561 return error; 581 return error;
562 582
583 truncate_setsize(inode, offset);
563 if (inode->i_op->truncate) 584 if (inode->i_op->truncate)
564 inode->i_op->truncate(inode); 585 inode->i_op->truncate(inode);
565 586 return 0;
566 return error;
567} 587}
568EXPORT_SYMBOL(vmtruncate); 588EXPORT_SYMBOL(vmtruncate);
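Editor's note: truncate_setsize() is meant to replace open-coded i_size/pagecache updates in filesystems. A hedged sketch of a hypothetical ->setattr using it; the examplefs_* name is illustrative, and inode_change_ok()/setattr_copy()/mark_inode_dirty() are the usual VFS helpers of this era:

#include <linux/fs.h>
#include <linux/mm.h>	/* truncate_setsize() */

/* Hypothetical filesystem ->setattr illustrating truncate_setsize():
 * block truncation is filesystem specific and happens first, then
 * i_size and the pagecache are updated in one call. */
static int examplefs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error = inode_change_ok(inode, attr);

	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
		/* filesystem-specific block truncation would go here */
		truncate_setsize(inode, attr->ia_size);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}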
diff --git a/mm/util.c b/mm/util.c
index f5712e8964b..73dac81e9f7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n)
225 if (length > n) 225 if (length > n)
226 return ERR_PTR(-EINVAL); 226 return ERR_PTR(-EINVAL);
227 227
228 p = kmalloc(length, GFP_KERNEL); 228 p = memdup_user(s, length);
229 229
230 if (!p) 230 if (IS_ERR(p))
231 return ERR_PTR(-ENOMEM); 231 return p;
232
233 if (copy_from_user(p, s, length)) {
234 kfree(p);
235 return ERR_PTR(-EFAULT);
236 }
237 232
238 p[length - 1] = '\0'; 233 p[length - 1] = '\0';
239 234
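Editor's note: the strndup_user() cleanup above switches to memdup_user(), which reports failure as an ERR_PTR rather than NULL. A hedged sketch of the same pattern in a hypothetical caller:

#include <linux/err.h>
#include <linux/string.h>

/* Hypothetical caller showing the memdup_user() pattern used above:
 * failure comes back as ERR_PTR(-ENOMEM) or ERR_PTR(-EFAULT), so it is
 * checked with IS_ERR(), not against NULL. */
static char *example_copy_name(const char __user *uname, size_t len)
{
	char *name = memdup_user(uname, len);

	if (IS_ERR(name))
		return name;

	name[len - 1] = '\0';
	return name;
}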
@@ -250,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
250} 245}
251#endif 246#endif
252 247
248/*
 249 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
 250 * back to the regular GUP.
 251 * If the architecture does not support this function, simply return with no
 252 * page pinned.
253 */
254int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
255 int nr_pages, int write, struct page **pages)
256{
257 return 0;
258}
259EXPORT_SYMBOL_GPL(__get_user_pages_fast);
260
253/** 261/**
254 * get_user_pages_fast() - pin user pages in memory 262 * get_user_pages_fast() - pin user pages in memory
255 * @start: starting user address 263 * @start: starting user address
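Editor's note: the weak __get_user_pages_fast() added above returns 0 pinned pages on architectures without a fast-GUP implementation, so a caller in atomic context must treat a short return as "skip" rather than falling back to the sleeping GUP path. A hedged, illustrative sketch of such a caller:

#include <linux/mm.h>

/* Hypothetical IRQ-safe helper: try to pin one user page without
 * sleeping; returns true only if the fast path pinned it. The caller
 * must put_page() the page when done with it. */
static bool pin_user_page_atomic(unsigned long addr, struct page **page)
{
	return __get_user_pages_fast(addr, 1, 0, page) == 1;
}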
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae007462b7f..a3d66b3dc5c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
33 33
34bool vmap_lazy_unmap __read_mostly = true;
34 35
35/*** Page table manipulation functions ***/ 36/*** Page table manipulation functions ***/
36 37
@@ -292,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va)
292 struct rb_node *tmp; 293 struct rb_node *tmp;
293 294
294 while (*p) { 295 while (*p) {
295 struct vmap_area *tmp; 296 struct vmap_area *tmp_va;
296 297
297 parent = *p; 298 parent = *p;
298 tmp = rb_entry(parent, struct vmap_area, rb_node); 299 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
299 if (va->va_start < tmp->va_end) 300 if (va->va_start < tmp_va->va_end)
300 p = &(*p)->rb_left; 301 p = &(*p)->rb_left;
301 else if (va->va_end > tmp->va_start) 302 else if (va->va_end > tmp_va->va_start)
302 p = &(*p)->rb_right; 303 p = &(*p)->rb_right;
303 else 304 else
304 BUG(); 305 BUG();
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
502{ 503{
503 unsigned int log; 504 unsigned int log;
504 505
506 if (!vmap_lazy_unmap)
507 return 0;
508
505 log = fls(num_online_cpus()); 509 log = fls(num_online_cpus());
506 510
507 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 511 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -513,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
513static void purge_fragmented_blocks_allcpus(void); 517static void purge_fragmented_blocks_allcpus(void);
514 518
515/* 519/*
520 * called before a call to iounmap() if the caller wants vm_area_struct's
521 * immediately freed.
522 */
523void set_iounmap_nonlazy(void)
524{
525 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
526}
527
528/*
516 * Purges all lazily-freed vmap areas. 529 * Purges all lazily-freed vmap areas.
517 * 530 *
518 * If sync is 0 then don't purge if there is already a purge in progress. 531 * If sync is 0 then don't purge if there is already a purge in progress.
@@ -732,7 +745,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
732 node, gfp_mask); 745 node, gfp_mask);
733 if (unlikely(IS_ERR(va))) { 746 if (unlikely(IS_ERR(va))) {
734 kfree(vb); 747 kfree(vb);
735 return ERR_PTR(PTR_ERR(va)); 748 return ERR_CAST(va);
736 } 749 }
737 750
738 err = radix_tree_preload(gfp_mask); 751 err = radix_tree_preload(gfp_mask);
@@ -1583,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1583} 1596}
1584EXPORT_SYMBOL(__vmalloc); 1597EXPORT_SYMBOL(__vmalloc);
1585 1598
1599static inline void *__vmalloc_node_flags(unsigned long size,
1600 int node, gfp_t flags)
1601{
1602 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1603 node, __builtin_return_address(0));
1604}
1605
1586/** 1606/**
1587 * vmalloc - allocate virtually contiguous memory 1607 * vmalloc - allocate virtually contiguous memory
1588 * @size: allocation size 1608 * @size: allocation size
@@ -1594,12 +1614,28 @@ EXPORT_SYMBOL(__vmalloc);
1594 */ 1614 */
1595void *vmalloc(unsigned long size) 1615void *vmalloc(unsigned long size)
1596{ 1616{
1597 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1617 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
1598 -1, __builtin_return_address(0));
1599} 1618}
1600EXPORT_SYMBOL(vmalloc); 1619EXPORT_SYMBOL(vmalloc);
1601 1620
1602/** 1621/**
1622 * vzalloc - allocate virtually contiguous memory with zero fill
1623 * @size: allocation size
1624 * Allocate enough pages to cover @size from the page level
1625 * allocator and map them into contiguous kernel virtual space.
1626 * The memory allocated is set to zero.
1627 *
1628 * For tight control over page level allocator and protection flags
1629 * use __vmalloc() instead.
1630 */
1631void *vzalloc(unsigned long size)
1632{
1633 return __vmalloc_node_flags(size, -1,
1634 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1635}
1636EXPORT_SYMBOL(vzalloc);
1637
1638/**
1603 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1639 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1604 * @size: allocation size 1640 * @size: allocation size
1605 * 1641 *
@@ -1640,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node)
1640} 1676}
1641EXPORT_SYMBOL(vmalloc_node); 1677EXPORT_SYMBOL(vmalloc_node);
1642 1678
1679/**
1680 * vzalloc_node - allocate memory on a specific node with zero fill
1681 * @size: allocation size
1682 * @node: numa node
1683 *
1684 * Allocate enough pages to cover @size from the page level
1685 * allocator and map them into contiguous kernel virtual space.
1686 * The memory allocated is set to zero.
1687 *
1688 * For tight control over page level allocator and protection flags
1689 * use __vmalloc_node() instead.
1690 */
1691void *vzalloc_node(unsigned long size, int node)
1692{
1693 return __vmalloc_node_flags(size, node,
1694 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1695}
1696EXPORT_SYMBOL(vzalloc_node);
1697
1643#ifndef PAGE_KERNEL_EXEC 1698#ifndef PAGE_KERNEL_EXEC
1644# define PAGE_KERNEL_EXEC PAGE_KERNEL 1699# define PAGE_KERNEL_EXEC PAGE_KERNEL
1645#endif 1700#endif
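Editor's note: vzalloc()/vzalloc_node() fold the common vmalloc-then-memset pattern into a single call. A small hedged sketch of a caller being converted (hypothetical code, not from this patch):

#include <linux/vmalloc.h>

/* Hypothetical allocation helper: previously this would have been
 *	table = vmalloc(size);
 *	if (table)
 *		memset(table, 0, size);
 * vzalloc() collapses that into one call that returns zeroed memory. */
static void *example_alloc_table(unsigned long size)
{
	return vzalloc(size);
}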
@@ -2052,6 +2107,7 @@ void free_vm_area(struct vm_struct *area)
2052} 2107}
2053EXPORT_SYMBOL_GPL(free_vm_area); 2108EXPORT_SYMBOL_GPL(free_vm_area);
2054 2109
2110#ifdef CONFIG_SMP
2055static struct vmap_area *node_to_va(struct rb_node *n) 2111static struct vmap_area *node_to_va(struct rb_node *n)
2056{ 2112{
2057 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; 2113 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2332,9 +2388,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2332 free_vm_area(vms[i]); 2388 free_vm_area(vms[i]);
2333 kfree(vms); 2389 kfree(vms);
2334} 2390}
2391#endif /* CONFIG_SMP */
2335 2392
2336#ifdef CONFIG_PROC_FS 2393#ifdef CONFIG_PROC_FS
2337static void *s_start(struct seq_file *m, loff_t *pos) 2394static void *s_start(struct seq_file *m, loff_t *pos)
2395 __acquires(&vmlist_lock)
2338{ 2396{
2339 loff_t n = *pos; 2397 loff_t n = *pos;
2340 struct vm_struct *v; 2398 struct vm_struct *v;
@@ -2361,6 +2419,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2361} 2419}
2362 2420
2363static void s_stop(struct seq_file *m, void *p) 2421static void s_stop(struct seq_file *m, void *p)
2422 __releases(&vmlist_lock)
2364{ 2423{
2365 read_unlock(&vmlist_lock); 2424 read_unlock(&vmlist_lock);
2366} 2425}
@@ -2403,7 +2462,7 @@ static int s_show(struct seq_file *m, void *p)
2403 seq_printf(m, " pages=%d", v->nr_pages); 2462 seq_printf(m, " pages=%d", v->nr_pages);
2404 2463
2405 if (v->phys_addr) 2464 if (v->phys_addr)
2406 seq_printf(m, " phys=%lx", v->phys_addr); 2465 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
2407 2466
2408 if (v->flags & VM_IOREMAP) 2467 if (v->flags & VM_IOREMAP)
2409 seq_printf(m, " ioremap"); 2468 seq_printf(m, " ioremap");
@@ -2437,8 +2496,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2437 unsigned int *ptr = NULL; 2496 unsigned int *ptr = NULL;
2438 int ret; 2497 int ret;
2439 2498
2440 if (NUMA_BUILD) 2499 if (NUMA_BUILD) {
2441 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2500 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2501 if (ptr == NULL)
2502 return -ENOMEM;
2503 }
2442 ret = seq_open(file, &vmalloc_op); 2504 ret = seq_open(file, &vmalloc_op);
2443 if (!ret) { 2505 if (!ret) {
2444 struct seq_file *m = file->private_data; 2506 struct seq_file *m = file->private_data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da4..b8a6fdc2131 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,15 @@
48 48
49#include "internal.h" 49#include "internal.h"
50 50
51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h>
53
54enum lumpy_mode {
55 LUMPY_MODE_NONE,
56 LUMPY_MODE_ASYNC,
57 LUMPY_MODE_SYNC,
58};
59
51struct scan_control { 60struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 61 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 62 unsigned long nr_scanned;
@@ -76,10 +85,10 @@ struct scan_control {
76 int order; 85 int order;
77 86
78 /* 87 /*
79 * Intend to reclaim enough contenious memory rather than to reclaim 88 * Intend to reclaim enough continuous memory rather than reclaim
80 * enough amount memory. I.e, it's the mode for high order allocation. 89 * enough amount of memory. i.e, mode for high order allocation.
81 */ 90 */
82 bool lumpy_reclaim_mode; 91 enum lumpy_mode lumpy_reclaim_mode;
83 92
84 /* Which cgroup do we reclaim from */ 93 /* Which cgroup do we reclaim from */
85 struct mem_cgroup *mem_cgroup; 94 struct mem_cgroup *mem_cgroup;
@@ -262,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 return ret; 271 return ret;
263} 272}
264 273
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync)
276{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
278
279 /*
 280 * Some reclaim has already failed. It is not worth trying synchronous
281 * lumpy reclaim.
282 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
284 return;
285
286 /*
287 * If we need a large contiguous chunk of memory, or have
288 * trouble getting a small set of contiguous pages, we
289 * will reclaim both active and inactive pages.
290 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode;
293 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode;
295 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
297}
298
299static void disable_lumpy_reclaim_mode(struct scan_control *sc)
300{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
302}
303
265static inline int is_page_cache_freeable(struct page *page) 304static inline int is_page_cache_freeable(struct page *page)
266{ 305{
267 /* 306 /*
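set_lumpy_reclaim_mode() above reduces to two thresholds: an order above PAGE_ALLOC_COSTLY_ORDER switches lumpy reclaim on immediately, while smaller high-order requests only get it once the scan priority drops below DEF_PRIORITY - 2; the sync flavour is additionally refused when a previous pass already gave up on lumpy reclaim. A standalone sketch of the core decision (the constants are assumed values matching common configs, and the stateful sync check is left out):

/* Standalone sketch of the lumpy-mode decision; DEF_PRIORITY and
 * PAGE_ALLOC_COSTLY_ORDER are copied here as assumptions. */
#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY            12
#define PAGE_ALLOC_COSTLY_ORDER 3

enum lumpy_mode { LUMPY_MODE_NONE, LUMPY_MODE_ASYNC, LUMPY_MODE_SYNC };

static enum lumpy_mode pick_lumpy_mode(int order, int priority, bool sync)
{
    enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;

    if (order > PAGE_ALLOC_COSTLY_ORDER)
        return mode;            /* huge allocation: always lumpy */
    if (order && priority < DEF_PRIORITY - 2)
        return mode;            /* small high-order, real pressure */
    return LUMPY_MODE_NONE;     /* order-0 or little pressure */
}

int main(void)
{
    printf("order 9, prio 12 -> %d\n", pick_lumpy_mode(9, 12, false)); /* 1: ASYNC */
    printf("order 2, prio 12 -> %d\n", pick_lumpy_mode(2, 12, false)); /* 0: NONE  */
    printf("order 2, prio  8 -> %d\n", pick_lumpy_mode(2, 8, true));   /* 2: SYNC  */
    printf("order 0, prio  0 -> %d\n", pick_lumpy_mode(0, 0, false));  /* 0: NONE  */
    return 0;
}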
@@ -272,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page)
272 return page_count(page) - page_has_private(page) == 2; 311 return page_count(page) - page_has_private(page) == 2;
273} 312}
274 313
275static int may_write_to_queue(struct backing_dev_info *bdi) 314static int may_write_to_queue(struct backing_dev_info *bdi,
315 struct scan_control *sc)
276{ 316{
277 if (current->flags & PF_SWAPWRITE) 317 if (current->flags & PF_SWAPWRITE)
278 return 1; 318 return 1;
@@ -280,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
280 return 1; 320 return 1;
281 if (bdi == current->backing_dev_info) 321 if (bdi == current->backing_dev_info)
282 return 1; 322 return 1;
323
 324 /* lumpy reclaim for hugepages often needs a lot of writes */
325 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
326 return 1;
283 return 0; 327 return 0;
284} 328}
285 329
@@ -304,12 +348,6 @@ static void handle_write_error(struct address_space *mapping,
304 unlock_page(page); 348 unlock_page(page);
305} 349}
306 350
307/* Request for sync pageout. */
308enum pageout_io {
309 PAGEOUT_IO_ASYNC,
310 PAGEOUT_IO_SYNC,
311};
312
313/* possible outcome of pageout() */ 351/* possible outcome of pageout() */
314typedef enum { 352typedef enum {
315 /* failed to write page out, page is locked */ 353 /* failed to write page out, page is locked */
@@ -327,7 +365,7 @@ typedef enum {
327 * Calls ->writepage(). 365 * Calls ->writepage().
328 */ 366 */
329static pageout_t pageout(struct page *page, struct address_space *mapping, 367static pageout_t pageout(struct page *page, struct address_space *mapping,
330 enum pageout_io sync_writeback) 368 struct scan_control *sc)
331{ 369{
332 /* 370 /*
333 * If the page is dirty, only perform writeback if that write 371 * If the page is dirty, only perform writeback if that write
@@ -363,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
363 } 401 }
364 if (mapping->a_ops->writepage == NULL) 402 if (mapping->a_ops->writepage == NULL)
365 return PAGE_ACTIVATE; 403 return PAGE_ACTIVATE;
366 if (!may_write_to_queue(mapping->backing_dev_info)) 404 if (!may_write_to_queue(mapping->backing_dev_info, sc))
367 return PAGE_KEEP; 405 return PAGE_KEEP;
368 406
369 if (clear_page_dirty_for_io(page)) { 407 if (clear_page_dirty_for_io(page)) {
@@ -373,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
373 .nr_to_write = SWAP_CLUSTER_MAX, 411 .nr_to_write = SWAP_CLUSTER_MAX,
374 .range_start = 0, 412 .range_start = 0,
375 .range_end = LLONG_MAX, 413 .range_end = LLONG_MAX,
376 .nonblocking = 1,
377 .for_reclaim = 1, 414 .for_reclaim = 1,
378 }; 415 };
379 416
@@ -391,13 +428,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
391 * direct reclaiming a large contiguous area and the 428 * direct reclaiming a large contiguous area and the
392 * first attempt to free a range of pages fails. 429 * first attempt to free a range of pages fails.
393 */ 430 */
394 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 431 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
395 wait_on_page_writeback(page); 433 wait_on_page_writeback(page);
396 434
397 if (!PageWriteback(page)) { 435 if (!PageWriteback(page)) {
398 /* synchronous write or broken a_ops? */ 436 /* synchronous write or broken a_ops? */
399 ClearPageReclaim(page); 437 ClearPageReclaim(page);
400 } 438 }
439 trace_mm_vmscan_writepage(page,
440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
401 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
402 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
403 } 443 }
@@ -575,7 +615,7 @@ static enum page_references page_check_references(struct page *page,
575 referenced_page = TestClearPageReferenced(page); 615 referenced_page = TestClearPageReferenced(page);
576 616
577 /* Lumpy reclaim - ignore references */ 617 /* Lumpy reclaim - ignore references */
578 if (sc->lumpy_reclaim_mode) 618 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
579 return PAGEREF_RECLAIM; 619 return PAGEREF_RECLAIM;
580 620
581 /* 621 /*
@@ -611,27 +651,46 @@ static enum page_references page_check_references(struct page *page,
611 } 651 }
612 652
613 /* Reclaim if clean, defer dirty pages to writeback */ 653 /* Reclaim if clean, defer dirty pages to writeback */
614 if (referenced_page) 654 if (referenced_page && !PageSwapBacked(page))
615 return PAGEREF_RECLAIM_CLEAN; 655 return PAGEREF_RECLAIM_CLEAN;
616 656
617 return PAGEREF_RECLAIM; 657 return PAGEREF_RECLAIM;
618} 658}
619 659
660static noinline_for_stack void free_page_list(struct list_head *free_pages)
661{
662 struct pagevec freed_pvec;
663 struct page *page, *tmp;
664
665 pagevec_init(&freed_pvec, 1);
666
667 list_for_each_entry_safe(page, tmp, free_pages, lru) {
668 list_del(&page->lru);
669 if (!pagevec_add(&freed_pvec, page)) {
670 __pagevec_free(&freed_pvec);
671 pagevec_reinit(&freed_pvec);
672 }
673 }
674
675 pagevec_free(&freed_pvec);
676}
677
620/* 678/*
621 * shrink_page_list() returns the number of reclaimed pages 679 * shrink_page_list() returns the number of reclaimed pages
622 */ 680 */
623static unsigned long shrink_page_list(struct list_head *page_list, 681static unsigned long shrink_page_list(struct list_head *page_list,
624 struct scan_control *sc, 682 struct zone *zone,
625 enum pageout_io sync_writeback) 683 struct scan_control *sc)
626{ 684{
627 LIST_HEAD(ret_pages); 685 LIST_HEAD(ret_pages);
628 struct pagevec freed_pvec; 686 LIST_HEAD(free_pages);
629 int pgactivate = 0; 687 int pgactivate = 0;
688 unsigned long nr_dirty = 0;
689 unsigned long nr_congested = 0;
630 unsigned long nr_reclaimed = 0; 690 unsigned long nr_reclaimed = 0;
631 691
632 cond_resched(); 692 cond_resched();
633 693
634 pagevec_init(&freed_pvec, 1);
635 while (!list_empty(page_list)) { 694 while (!list_empty(page_list)) {
636 enum page_references references; 695 enum page_references references;
637 struct address_space *mapping; 696 struct address_space *mapping;
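free_page_list(), introduced above, drains the reclaimed pages through a pagevec so they are handed back to the page allocator in batches instead of one call per page. The batching pattern itself is generic; here is a standalone sketch with a plain array standing in for struct pagevec (all names below are invented for illustration):

/* Sketch of pagevec-style batched release: collect items into a small
 * fixed-size buffer and flush it whenever it fills.  PVEC_SIZE and the
 * flush callback are stand-ins, not kernel interfaces. */
#include <stdio.h>

#define PVEC_SIZE 14    /* same order of magnitude as the kernel pagevec */

struct batch {
    int nr;
    int slot[PVEC_SIZE];
};

static void flush(struct batch *b)
{
    if (!b->nr)
        return;
    printf("releasing %d items in one call\n", b->nr);
    b->nr = 0;
}

static void batch_add(struct batch *b, int item)
{
    b->slot[b->nr++] = item;
    if (b->nr == PVEC_SIZE)
        flush(b);    /* buffer full: release the whole batch */
}

int main(void)
{
    struct batch b = { 0 };
    int i;

    for (i = 0; i < 40; i++)    /* 40 "pages" -> releases of 14, 14 and 12 */
        batch_add(&b, i);
    flush(&b);                  /* final partial batch */
    return 0;
}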
@@ -647,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
647 goto keep; 706 goto keep;
648 707
649 VM_BUG_ON(PageActive(page)); 708 VM_BUG_ON(PageActive(page));
709 VM_BUG_ON(page_zone(page) != zone);
650 710
651 sc->nr_scanned++; 711 sc->nr_scanned++;
652 712
@@ -672,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
672 * for any page for which writeback has already 732 * for any page for which writeback has already
673 * started. 733 * started.
674 */ 734 */
675 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 735 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
736 may_enter_fs)
676 wait_on_page_writeback(page); 737 wait_on_page_writeback(page);
677 else 738 else {
678 goto keep_locked; 739 unlock_page(page);
740 goto keep_lumpy;
741 }
679 } 742 }
680 743
681 references = page_check_references(page, sc); 744 references = page_check_references(page, sc);
@@ -721,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
721 } 784 }
722 785
723 if (PageDirty(page)) { 786 if (PageDirty(page)) {
787 nr_dirty++;
788
724 if (references == PAGEREF_RECLAIM_CLEAN) 789 if (references == PAGEREF_RECLAIM_CLEAN)
725 goto keep_locked; 790 goto keep_locked;
726 if (!may_enter_fs) 791 if (!may_enter_fs)
@@ -729,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
729 goto keep_locked; 794 goto keep_locked;
730 795
731 /* Page is dirty, try to write it out here */ 796 /* Page is dirty, try to write it out here */
732 switch (pageout(page, mapping, sync_writeback)) { 797 switch (pageout(page, mapping, sc)) {
733 case PAGE_KEEP: 798 case PAGE_KEEP:
799 nr_congested++;
734 goto keep_locked; 800 goto keep_locked;
735 case PAGE_ACTIVATE: 801 case PAGE_ACTIVATE:
736 goto activate_locked; 802 goto activate_locked;
737 case PAGE_SUCCESS: 803 case PAGE_SUCCESS:
738 if (PageWriteback(page) || PageDirty(page)) 804 if (PageWriteback(page))
805 goto keep_lumpy;
806 if (PageDirty(page))
739 goto keep; 807 goto keep;
808
740 /* 809 /*
741 * A synchronous write - probably a ramdisk. Go 810 * A synchronous write - probably a ramdisk. Go
742 * ahead and try to reclaim the page. 811 * ahead and try to reclaim the page.
@@ -806,10 +875,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 __clear_page_locked(page); 875 __clear_page_locked(page);
807free_it: 876free_it:
808 nr_reclaimed++; 877 nr_reclaimed++;
809 if (!pagevec_add(&freed_pvec, page)) { 878
810 __pagevec_free(&freed_pvec); 879 /*
 811 pagevec_reinit(&freed_pvec); 880 * Is there a need to periodically call free_page_list()? It would
 812 } 881 * appear not, as the counts should be low
882 */
883 list_add(&page->lru, &free_pages);
813 continue; 884 continue;
814 885
815cull_mlocked: 886cull_mlocked:
@@ -817,6 +888,7 @@ cull_mlocked:
817 try_to_free_swap(page); 888 try_to_free_swap(page);
818 unlock_page(page); 889 unlock_page(page);
819 putback_lru_page(page); 890 putback_lru_page(page);
891 disable_lumpy_reclaim_mode(sc);
820 continue; 892 continue;
821 893
822activate_locked: 894activate_locked:
@@ -829,12 +901,24 @@ activate_locked:
829keep_locked: 901keep_locked:
830 unlock_page(page); 902 unlock_page(page);
831keep: 903keep:
904 disable_lumpy_reclaim_mode(sc);
905keep_lumpy:
832 list_add(&page->lru, &ret_pages); 906 list_add(&page->lru, &ret_pages);
833 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 907 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
834 } 908 }
909
910 /*
911 * Tag a zone as congested if all the dirty pages encountered were
912 * backed by a congested BDI. In this case, reclaimers should just
913 * back off and wait for congestion to clear because further reclaim
914 * will encounter the same problem
915 */
916 if (nr_dirty == nr_congested)
917 zone_set_flag(zone, ZONE_CONGESTED);
918
919 free_page_list(&free_pages);
920
835 list_splice(&ret_pages, page_list); 921 list_splice(&ret_pages, page_list);
836 if (pagevec_count(&freed_pvec))
837 __pagevec_free(&freed_pvec);
838 count_vm_events(PGACTIVATE, pgactivate); 922 count_vm_events(PGACTIVATE, pgactivate);
839 return nr_reclaimed; 923 return nr_reclaimed;
840} 924}
@@ -916,6 +1000,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
916 unsigned long *scanned, int order, int mode, int file) 1000 unsigned long *scanned, int order, int mode, int file)
917{ 1001{
918 unsigned long nr_taken = 0; 1002 unsigned long nr_taken = 0;
1003 unsigned long nr_lumpy_taken = 0;
1004 unsigned long nr_lumpy_dirty = 0;
1005 unsigned long nr_lumpy_failed = 0;
919 unsigned long scan; 1006 unsigned long scan;
920 1007
921 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1008 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -978,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
978 1065
979 /* Check that we have not crossed a zone boundary. */ 1066 /* Check that we have not crossed a zone boundary. */
980 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1067 if (unlikely(page_zone_id(cursor_page) != zone_id))
981 continue; 1068 break;
982 1069
983 /* 1070 /*
984 * If we don't have enough swap space, reclaiming of 1071 * If we don't have enough swap space, reclaiming of
@@ -986,19 +1073,37 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
986 * pointless. 1073 * pointless.
987 */ 1074 */
988 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1075 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
989 !PageSwapCache(cursor_page)) 1076 !PageSwapCache(cursor_page))
990 continue; 1077 break;
991 1078
992 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1079 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
993 list_move(&cursor_page->lru, dst); 1080 list_move(&cursor_page->lru, dst);
994 mem_cgroup_del_lru(cursor_page); 1081 mem_cgroup_del_lru(cursor_page);
995 nr_taken++; 1082 nr_taken++;
1083 nr_lumpy_taken++;
1084 if (PageDirty(cursor_page))
1085 nr_lumpy_dirty++;
996 scan++; 1086 scan++;
1087 } else {
1088 /* the page is freed already. */
1089 if (!page_count(cursor_page))
1090 continue;
1091 break;
997 } 1092 }
998 } 1093 }
1094
1095 /* If we break out of the loop above, lumpy reclaim failed */
1096 if (pfn < end_pfn)
1097 nr_lumpy_failed++;
999 } 1098 }
1000 1099
1001 *scanned = scan; 1100 *scanned = scan;
1101
1102 trace_mm_vmscan_lru_isolate(order,
1103 nr_to_scan, scan,
1104 nr_taken,
1105 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1106 mode);
1002 return nr_taken; 1107 return nr_taken;
1003} 1108}
1004 1109
@@ -1035,7 +1140,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1035 ClearPageActive(page); 1140 ClearPageActive(page);
1036 nr_active++; 1141 nr_active++;
1037 } 1142 }
1038 count[lru]++; 1143 if (count)
1144 count[lru]++;
1039 } 1145 }
1040 1146
1041 return nr_active; 1147 return nr_active;
@@ -1112,174 +1218,209 @@ static int too_many_isolated(struct zone *zone, int file,
1112} 1218}
1113 1219
1114/* 1220/*
1115 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1221 * TODO: Try merging with migrations version of putback_lru_pages
1116 * of reclaimed pages
1117 */ 1222 */
1118static unsigned long shrink_inactive_list(unsigned long max_scan, 1223static noinline_for_stack void
1119 struct zone *zone, struct scan_control *sc, 1224putback_lru_pages(struct zone *zone, struct scan_control *sc,
1120 int priority, int file) 1225 unsigned long nr_anon, unsigned long nr_file,
1226 struct list_head *page_list)
1121{ 1227{
1122 LIST_HEAD(page_list); 1228 struct page *page;
1123 struct pagevec pvec; 1229 struct pagevec pvec;
1124 unsigned long nr_scanned = 0;
1125 unsigned long nr_reclaimed = 0;
1126 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1230 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1127 1231
1128 while (unlikely(too_many_isolated(zone, file, sc))) { 1232 pagevec_init(&pvec, 1);
1129 congestion_wait(BLK_RW_ASYNC, HZ/10);
1130 1233
1131 /* We are about to die and free our memory. Return now. */ 1234 /*
1132 if (fatal_signal_pending(current)) 1235 * Put back any unfreeable pages.
1133 return SWAP_CLUSTER_MAX; 1236 */
1237 spin_lock(&zone->lru_lock);
1238 while (!list_empty(page_list)) {
1239 int lru;
1240 page = lru_to_page(page_list);
1241 VM_BUG_ON(PageLRU(page));
1242 list_del(&page->lru);
1243 if (unlikely(!page_evictable(page, NULL))) {
1244 spin_unlock_irq(&zone->lru_lock);
1245 putback_lru_page(page);
1246 spin_lock_irq(&zone->lru_lock);
1247 continue;
1248 }
1249 SetPageLRU(page);
1250 lru = page_lru(page);
1251 add_page_to_lru_list(zone, page, lru);
1252 if (is_active_lru(lru)) {
1253 int file = is_file_lru(lru);
1254 reclaim_stat->recent_rotated[file]++;
1255 }
1256 if (!pagevec_add(&pvec, page)) {
1257 spin_unlock_irq(&zone->lru_lock);
1258 __pagevec_release(&pvec);
1259 spin_lock_irq(&zone->lru_lock);
1260 }
1134 } 1261 }
1262 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1263 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1264
1265 spin_unlock_irq(&zone->lru_lock);
1266 pagevec_release(&pvec);
1267}
1135 1268
1269static noinline_for_stack void update_isolated_counts(struct zone *zone,
1270 struct scan_control *sc,
1271 unsigned long *nr_anon,
1272 unsigned long *nr_file,
1273 struct list_head *isolated_list)
1274{
1275 unsigned long nr_active;
1276 unsigned int count[NR_LRU_LISTS] = { 0, };
1277 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1136 1278
1137 pagevec_init(&pvec, 1); 1279 nr_active = clear_active_flags(isolated_list, count);
1280 __count_vm_events(PGDEACTIVATE, nr_active);
1138 1281
1139 lru_add_drain(); 1282 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1140 spin_lock_irq(&zone->lru_lock); 1283 -count[LRU_ACTIVE_FILE]);
1141 do { 1284 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1142 struct page *page; 1285 -count[LRU_INACTIVE_FILE]);
1143 unsigned long nr_taken; 1286 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1144 unsigned long nr_scan; 1287 -count[LRU_ACTIVE_ANON]);
1145 unsigned long nr_freed; 1288 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1146 unsigned long nr_active; 1289 -count[LRU_INACTIVE_ANON]);
1147 unsigned int count[NR_LRU_LISTS] = { 0, };
1148 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1149 unsigned long nr_anon;
1150 unsigned long nr_file;
1151 1290
1152 if (scanning_global_lru(sc)) { 1291 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1153 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, 1292 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1154 &page_list, &nr_scan, 1293 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1155 sc->order, mode, 1294 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1156 zone, 0, file);
1157 zone->pages_scanned += nr_scan;
1158 if (current_is_kswapd())
1159 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1160 nr_scan);
1161 else
1162 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1163 nr_scan);
1164 } else {
1165 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1166 &page_list, &nr_scan,
1167 sc->order, mode,
1168 zone, sc->mem_cgroup,
1169 0, file);
1170 /*
1171 * mem_cgroup_isolate_pages() keeps track of
1172 * scanned pages on its own.
1173 */
1174 }
1175 1295
1176 if (nr_taken == 0) 1296 reclaim_stat->recent_scanned[0] += *nr_anon;
1177 goto done; 1297 reclaim_stat->recent_scanned[1] += *nr_file;
1298}
1178 1299
1179 nr_active = clear_active_flags(&page_list, count); 1300/*
1180 __count_vm_events(PGDEACTIVATE, nr_active); 1301 * Returns true if the caller should wait to clean dirty/writeback pages.
1302 *
1303 * If we are direct reclaiming for contiguous pages and we do not reclaim
1304 * everything in the list, try again and wait for writeback IO to complete.
 1305 * This will stall high-order allocations noticeably. Only do that when we really
1306 * need to free the pages under high memory pressure.
1307 */
1308static inline bool should_reclaim_stall(unsigned long nr_taken,
1309 unsigned long nr_freed,
1310 int priority,
1311 struct scan_control *sc)
1312{
1313 int lumpy_stall_priority;
1181 1314
1182 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1315 /* kswapd should not stall on sync IO */
1183 -count[LRU_ACTIVE_FILE]); 1316 if (current_is_kswapd())
1184 __mod_zone_page_state(zone, NR_INACTIVE_FILE, 1317 return false;
1185 -count[LRU_INACTIVE_FILE]);
1186 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1187 -count[LRU_ACTIVE_ANON]);
1188 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1189 -count[LRU_INACTIVE_ANON]);
1190 1318
1191 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1319 /* Only stall on lumpy reclaim */
1192 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1320 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
1193 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1321 return false;
1194 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1195 1322
 1196 reclaim_stat->recent_scanned[0] += nr_anon; 1323 /* If we have reclaimed everything on the isolated list, no stall */
1197 reclaim_stat->recent_scanned[1] += nr_file; 1324 if (nr_freed == nr_taken)
1325 return false;
1198 1326
1199 spin_unlock_irq(&zone->lru_lock); 1327 /*
1328 * For high-order allocations, there are two stall thresholds.
 1329 * High-cost allocations stall immediately, whereas lower
1330 * order allocations such as stacks require the scanning
1331 * priority to be much higher before stalling.
1332 */
1333 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1334 lumpy_stall_priority = DEF_PRIORITY;
1335 else
1336 lumpy_stall_priority = DEF_PRIORITY / 3;
1200 1337
1201 nr_scanned += nr_scan; 1338 return priority <= lumpy_stall_priority;
1202 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1339}
1203 1340
1204 /* 1341/*
1205 * If we are direct reclaiming for contiguous pages and we do 1342 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1206 * not reclaim everything in the list, try again and wait 1343 * of reclaimed pages
1207 * for IO to complete. This will stall high-order allocations 1344 */
1208 * but that should be acceptable to the caller 1345static noinline_for_stack unsigned long
1209 */ 1346shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1210 if (nr_freed < nr_taken && !current_is_kswapd() && 1347 struct scan_control *sc, int priority, int file)
1211 sc->lumpy_reclaim_mode) { 1348{
1212 congestion_wait(BLK_RW_ASYNC, HZ/10); 1349 LIST_HEAD(page_list);
1350 unsigned long nr_scanned;
1351 unsigned long nr_reclaimed = 0;
1352 unsigned long nr_taken;
1353 unsigned long nr_anon;
1354 unsigned long nr_file;
1213 1355
1214 /* 1356 while (unlikely(too_many_isolated(zone, file, sc))) {
1215 * The attempt at page out may have made some 1357 congestion_wait(BLK_RW_ASYNC, HZ/10);
1216 * of the pages active, mark them inactive again.
1217 */
1218 nr_active = clear_active_flags(&page_list, count);
1219 count_vm_events(PGDEACTIVATE, nr_active);
1220 1358
1221 nr_freed += shrink_page_list(&page_list, sc, 1359 /* We are about to die and free our memory. Return now. */
1222 PAGEOUT_IO_SYNC); 1360 if (fatal_signal_pending(current))
1223 } 1361 return SWAP_CLUSTER_MAX;
1362 }
1224 1363
1225 nr_reclaimed += nr_freed; 1364 set_lumpy_reclaim_mode(priority, sc, false);
1365 lru_add_drain();
1366 spin_lock_irq(&zone->lru_lock);
1226 1367
1227 local_irq_disable(); 1368 if (scanning_global_lru(sc)) {
1369 nr_taken = isolate_pages_global(nr_to_scan,
1370 &page_list, &nr_scanned, sc->order,
1371 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1372 ISOLATE_INACTIVE : ISOLATE_BOTH,
1373 zone, 0, file);
1374 zone->pages_scanned += nr_scanned;
1228 if (current_is_kswapd()) 1375 if (current_is_kswapd())
1229 __count_vm_events(KSWAPD_STEAL, nr_freed); 1376 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1230 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1377 nr_scanned);
1231 1378 else
1232 spin_lock(&zone->lru_lock); 1379 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1380 nr_scanned);
1381 } else {
1382 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1383 &page_list, &nr_scanned, sc->order,
1384 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1385 ISOLATE_INACTIVE : ISOLATE_BOTH,
1386 zone, sc->mem_cgroup,
1387 0, file);
1233 /* 1388 /*
1234 * Put back any unfreeable pages. 1389 * mem_cgroup_isolate_pages() keeps track of
1390 * scanned pages on its own.
1235 */ 1391 */
1236 while (!list_empty(&page_list)) { 1392 }
1237 int lru;
1238 page = lru_to_page(&page_list);
1239 VM_BUG_ON(PageLRU(page));
1240 list_del(&page->lru);
1241 if (unlikely(!page_evictable(page, NULL))) {
1242 spin_unlock_irq(&zone->lru_lock);
1243 putback_lru_page(page);
1244 spin_lock_irq(&zone->lru_lock);
1245 continue;
1246 }
1247 SetPageLRU(page);
1248 lru = page_lru(page);
1249 add_page_to_lru_list(zone, page, lru);
1250 if (is_active_lru(lru)) {
1251 int file = is_file_lru(lru);
1252 reclaim_stat->recent_rotated[file]++;
1253 }
1254 if (!pagevec_add(&pvec, page)) {
1255 spin_unlock_irq(&zone->lru_lock);
1256 __pagevec_release(&pvec);
1257 spin_lock_irq(&zone->lru_lock);
1258 }
1259 }
1260 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1261 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1262 1393
1263 } while (nr_scanned < max_scan); 1394 if (nr_taken == 0) {
1395 spin_unlock_irq(&zone->lru_lock);
1396 return 0;
1397 }
1398
1399 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1264 1400
1265done:
1266 spin_unlock_irq(&zone->lru_lock); 1401 spin_unlock_irq(&zone->lru_lock);
1267 pagevec_release(&pvec);
1268 return nr_reclaimed;
1269}
1270 1402
1271/* 1403 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1272 * We are about to scan this zone at a certain priority level. If that priority 1404
 1273 * level is smaller (ie: more urgent) than the previous priority, then note 1405 /* Check if we should synchronously wait for writeback */
1274 * that priority level within the zone. This is done so that when the next 1406 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1275 * process comes in to scan this zone, it will immediately start out at this 1407 set_lumpy_reclaim_mode(priority, sc, true);
1276 * priority level rather than having to build up its own scanning priority. 1408 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1277 * Here, this priority affects only the reclaim-mapped threshold. 1409 }
1278 */ 1410
1279static inline void note_zone_scanning_priority(struct zone *zone, int priority) 1411 local_irq_disable();
1280{ 1412 if (current_is_kswapd())
1281 if (priority < zone->prev_priority) 1413 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1282 zone->prev_priority = priority; 1414 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1415
1416 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1417
1418 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1419 zone_idx(zone),
1420 nr_scanned, nr_reclaimed,
1421 priority,
1422 trace_shrink_flags(file, sc->lumpy_reclaim_mode));
1423 return nr_reclaimed;
1283} 1424}
1284 1425
1285/* 1426/*
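Within the rewritten shrink_inactive_list() above, should_reclaim_stall() decides whether a direct reclaimer may wait synchronously for writeback: never for kswapd, never outside lumpy reclaim, never when the isolated pages were all freed, and otherwise only below an order-dependent priority threshold. The same decision as a standalone function (constants are assumptions copied from typical configs):

/* Standalone sketch of the "stall for writeback?" decision used after
 * shrink_page_list(); the constants are assumptions, not taken from headers. */
#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY            12
#define PAGE_ALLOC_COSTLY_ORDER 3

static bool should_stall(unsigned long nr_taken, unsigned long nr_freed,
                         int priority, int order, bool is_kswapd, bool lumpy)
{
    int stall_priority;

    if (is_kswapd)
        return false;    /* kswapd never waits on sync IO */
    if (!lumpy)
        return false;    /* only lumpy reclaim stalls */
    if (nr_freed == nr_taken)
        return false;    /* everything was reclaimed: no need */

    /* costly orders stall immediately, smaller orders only once the
     * scanning priority shows real pressure */
    stall_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
                     DEF_PRIORITY : DEF_PRIORITY / 3;
    return priority <= stall_priority;
}

int main(void)
{
    printf("%d\n", should_stall(32, 10, 12, 9, false, true)); /* 1: costly order */
    printf("%d\n", should_stall(32, 10, 12, 2, false, true)); /* 0: priority still high */
    printf("%d\n", should_stall(32, 10, 3, 2, false, true));  /* 1: priority <= 4 */
    printf("%d\n", should_stall(32, 32, 0, 9, false, true));  /* 0: all pages freed */
    return 0;
}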
@@ -1426,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1426 spin_unlock_irq(&zone->lru_lock); 1567 spin_unlock_irq(&zone->lru_lock);
1427} 1568}
1428 1569
1570#ifdef CONFIG_SWAP
1429static int inactive_anon_is_low_global(struct zone *zone) 1571static int inactive_anon_is_low_global(struct zone *zone)
1430{ 1572{
1431 unsigned long active, inactive; 1573 unsigned long active, inactive;
@@ -1451,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1451{ 1593{
1452 int low; 1594 int low;
1453 1595
1596 /*
1597 * If we don't have swap space, anonymous page deactivation
1598 * is pointless.
1599 */
1600 if (!total_swap_pages)
1601 return 0;
1602
1454 if (scanning_global_lru(sc)) 1603 if (scanning_global_lru(sc))
1455 low = inactive_anon_is_low_global(zone); 1604 low = inactive_anon_is_low_global(zone);
1456 else 1605 else
1457 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1606 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1458 return low; 1607 return low;
1459} 1608}
1609#else
1610static inline int inactive_anon_is_low(struct zone *zone,
1611 struct scan_control *sc)
1612{
1613 return 0;
1614}
1615#endif
1460 1616
1461static int inactive_file_is_low_global(struct zone *zone) 1617static int inactive_file_is_low_global(struct zone *zone)
1462{ 1618{
@@ -1583,6 +1739,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1583 } 1739 }
1584 1740
1585 /* 1741 /*
1742 * With swappiness at 100, anonymous and file have the same priority.
1743 * This scanning priority is essentially the inverse of IO cost.
1744 */
1745 anon_prio = sc->swappiness;
1746 file_prio = 200 - sc->swappiness;
1747
1748 /*
1586 * OK, so we have swap space and a fair amount of page cache 1749 * OK, so we have swap space and a fair amount of page cache
1587 * pages. We use the recently rotated / recently scanned 1750 * pages. We use the recently rotated / recently scanned
1588 * ratios to determine how valuable each cache is. 1751 * ratios to determine how valuable each cache is.
@@ -1593,28 +1756,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1593 * 1756 *
1594 * anon in [0], file in [1] 1757 * anon in [0], file in [1]
1595 */ 1758 */
1759 spin_lock_irq(&zone->lru_lock);
1596 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1760 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1597 spin_lock_irq(&zone->lru_lock);
1598 reclaim_stat->recent_scanned[0] /= 2; 1761 reclaim_stat->recent_scanned[0] /= 2;
1599 reclaim_stat->recent_rotated[0] /= 2; 1762 reclaim_stat->recent_rotated[0] /= 2;
1600 spin_unlock_irq(&zone->lru_lock);
1601 } 1763 }
1602 1764
1603 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1765 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1604 spin_lock_irq(&zone->lru_lock);
1605 reclaim_stat->recent_scanned[1] /= 2; 1766 reclaim_stat->recent_scanned[1] /= 2;
1606 reclaim_stat->recent_rotated[1] /= 2; 1767 reclaim_stat->recent_rotated[1] /= 2;
1607 spin_unlock_irq(&zone->lru_lock);
1608 } 1768 }
1609 1769
1610 /* 1770 /*
1611 * With swappiness at 100, anonymous and file have the same priority.
1612 * This scanning priority is essentially the inverse of IO cost.
1613 */
1614 anon_prio = sc->swappiness;
1615 file_prio = 200 - sc->swappiness;
1616
1617 /*
1618 * The amount of pressure on anon vs file pages is inversely 1771 * The amount of pressure on anon vs file pages is inversely
1619 * proportional to the fraction of recently scanned pages on 1772 * proportional to the fraction of recently scanned pages on
1620 * each list that were recently referenced and in active use. 1773 * each list that were recently referenced and in active use.
@@ -1624,6 +1777,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1624 1777
1625 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1778 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1626 fp /= reclaim_stat->recent_rotated[1] + 1; 1779 fp /= reclaim_stat->recent_rotated[1] + 1;
1780 spin_unlock_irq(&zone->lru_lock);
1627 1781
1628 fraction[0] = ap; 1782 fraction[0] = ap;
1629 fraction[1] = fp; 1783 fraction[1] = fp;
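get_scan_count() turns swappiness into two weights, anon_prio = swappiness and file_prio = 200 - swappiness, then scales each by recent_scanned / recent_rotated so a list whose pages keep getting re-referenced receives proportionally less scanning pressure; the hunks above merely move that setup ahead of the locked section and take the lru_lock once around both adjustments. A worked example of the arithmetic with invented statistics:

/* Worked example of the anon/file pressure split in get_scan_count();
 * the recent_scanned/recent_rotated numbers are invented. */
#include <stdio.h>

int main(void)
{
    unsigned int swappiness = 60;    /* common default for vm.swappiness */
    unsigned long anon_prio = swappiness;
    unsigned long file_prio = 200 - swappiness;

    /* pretend stats: file pages were rotated (re-referenced) far more often */
    unsigned long anon_scanned = 1000, anon_rotated = 50;
    unsigned long file_scanned = 1000, file_rotated = 600;

    unsigned long ap = anon_prio * (anon_scanned + 1) / (anon_rotated + 1);
    unsigned long fp = file_prio * (file_scanned + 1) / (file_rotated + 1);

    /* share of scanning pressure each list receives */
    printf("anon weight %lu, file weight %lu\n", ap, fp);
    printf("anon share %.1f%%, file share %.1f%%\n",
           100.0 * ap / (ap + fp), 100.0 * fp / (ap + fp));
    return 0;
}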
@@ -1643,21 +1797,6 @@ out:
1643 } 1797 }
1644} 1798}
1645 1799
1646static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
1647{
1648 /*
1649 * If we need a large contiguous chunk of memory, or have
1650 * trouble getting a small set of contiguous pages, we
1651 * will reclaim both active and inactive pages.
1652 */
1653 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1654 sc->lumpy_reclaim_mode = 1;
1655 else if (sc->order && priority < DEF_PRIORITY - 2)
1656 sc->lumpy_reclaim_mode = 1;
1657 else
1658 sc->lumpy_reclaim_mode = 0;
1659}
1660
1661/* 1800/*
1662 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1801 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1663 */ 1802 */
@@ -1672,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone,
1672 1811
1673 get_scan_count(zone, sc, nr, priority); 1812 get_scan_count(zone, sc, nr, priority);
1674 1813
1675 set_lumpy_reclaim_mode(priority, sc);
1676
1677 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1814 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1678 nr[LRU_INACTIVE_FILE]) { 1815 nr[LRU_INACTIVE_FILE]) {
1679 for_each_evictable_lru(l) { 1816 for_each_evictable_lru(l) {
@@ -1704,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone,
1704 * Even if we did not try to evict anon pages at all, we want to 1841 * Even if we did not try to evict anon pages at all, we want to
1705 * rebalance the anon lru active/inactive ratio. 1842 * rebalance the anon lru active/inactive ratio.
1706 */ 1843 */
1707 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1844 if (inactive_anon_is_low(zone, sc))
1708 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1845 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1709 1846
1710 throttle_vm_writeout(sc->gfp_mask); 1847 throttle_vm_writeout(sc->gfp_mask);
@@ -1726,16 +1863,14 @@ static void shrink_zone(int priority, struct zone *zone,
1726 * If a zone is deemed to be full of pinned pages then just give it a light 1863 * If a zone is deemed to be full of pinned pages then just give it a light
1727 * scan then give up on it. 1864 * scan then give up on it.
1728 */ 1865 */
1729static bool shrink_zones(int priority, struct zonelist *zonelist, 1866static void shrink_zones(int priority, struct zonelist *zonelist,
1730 struct scan_control *sc) 1867 struct scan_control *sc)
1731{ 1868{
1732 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1733 struct zoneref *z; 1869 struct zoneref *z;
1734 struct zone *zone; 1870 struct zone *zone;
1735 bool all_unreclaimable = true;
1736 1871
1737 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1872 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1738 sc->nodemask) { 1873 gfp_zone(sc->gfp_mask), sc->nodemask) {
1739 if (!populated_zone(zone)) 1874 if (!populated_zone(zone))
1740 continue; 1875 continue;
1741 /* 1876 /*
@@ -1745,22 +1880,43 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
1745 if (scanning_global_lru(sc)) { 1880 if (scanning_global_lru(sc)) {
1746 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1881 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1747 continue; 1882 continue;
1748 note_zone_scanning_priority(zone, priority);
1749
1750 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1883 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1751 continue; /* Let kswapd poll it */ 1884 continue; /* Let kswapd poll it */
1752 } else {
1753 /*
1754 * Ignore cpuset limitation here. We just want to reduce
1755 * # of used pages by us regardless of memory shortage.
1756 */
1757 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1758 priority);
1759 } 1885 }
1760 1886
1761 shrink_zone(priority, zone, sc); 1887 shrink_zone(priority, zone, sc);
1762 all_unreclaimable = false;
1763 } 1888 }
1889}
1890
1891static bool zone_reclaimable(struct zone *zone)
1892{
1893 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1894}
1895
1896/*
 1897 * While hibernation is in progress, kswapd is frozen, so it cannot mark
 1898 * the zone all_unreclaimable and cannot handle OOM during hibernation.
 1899 * So check whether the zone is unreclaimable in direct reclaim as well as in kswapd.
1900 */
1901static bool all_unreclaimable(struct zonelist *zonelist,
1902 struct scan_control *sc)
1903{
1904 struct zoneref *z;
1905 struct zone *zone;
1906 bool all_unreclaimable = true;
1907
1908 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1909 gfp_zone(sc->gfp_mask), sc->nodemask) {
1910 if (!populated_zone(zone))
1911 continue;
1912 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1913 continue;
1914 if (zone_reclaimable(zone)) {
1915 all_unreclaimable = false;
1916 break;
1917 }
1918 }
1919
1764 return all_unreclaimable; 1920 return all_unreclaimable;
1765} 1921}
1766 1922
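zone_reclaimable() above is a simple ratio test - a zone still counts as reclaimable while it has been scanned less than six times the number of pages that could plausibly be reclaimed from it - and all_unreclaimable() reports true only when every eligible zone fails that test. A compact standalone rendering of the check with sample zones:

/* Sketch of the reclaimability test used to decide whether direct reclaim
 * should give up; zone names and numbers are made up. */
#include <stdio.h>
#include <stdbool.h>

struct zone_sample {
    const char *name;
    unsigned long pages_scanned;
    unsigned long reclaimable_pages;
};

static bool zone_reclaimable(const struct zone_sample *z)
{
    return z->pages_scanned < z->reclaimable_pages * 6;
}

int main(void)
{
    struct zone_sample zones[] = {
        { "DMA",    900,  100 },    /* scanned 9x its pages: hopeless */
        { "Normal", 500, 2000 },    /* plenty left to try */
    };
    bool all_unreclaimable = true;
    size_t i;

    for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
        bool ok = zone_reclaimable(&zones[i]);
        printf("%-6s reclaimable: %s\n", zones[i].name, ok ? "yes" : "no");
        if (ok)
            all_unreclaimable = false;
    }
    printf("all_unreclaimable: %s\n", all_unreclaimable ? "yes" : "no");
    return 0;
}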
@@ -1784,13 +1940,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1784 struct scan_control *sc) 1940 struct scan_control *sc)
1785{ 1941{
1786 int priority; 1942 int priority;
1787 bool all_unreclaimable;
1788 unsigned long total_scanned = 0; 1943 unsigned long total_scanned = 0;
1789 struct reclaim_state *reclaim_state = current->reclaim_state; 1944 struct reclaim_state *reclaim_state = current->reclaim_state;
1790 unsigned long lru_pages = 0;
1791 struct zoneref *z; 1945 struct zoneref *z;
1792 struct zone *zone; 1946 struct zone *zone;
1793 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1794 unsigned long writeback_threshold; 1947 unsigned long writeback_threshold;
1795 1948
1796 get_mems_allowed(); 1949 get_mems_allowed();
@@ -1798,29 +1951,26 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1798 1951
1799 if (scanning_global_lru(sc)) 1952 if (scanning_global_lru(sc))
1800 count_vm_event(ALLOCSTALL); 1953 count_vm_event(ALLOCSTALL);
1801 /*
1802 * mem_cgroup will not do shrink_slab.
1803 */
1804 if (scanning_global_lru(sc)) {
1805 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1806
1807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1808 continue;
1809
1810 lru_pages += zone_reclaimable_pages(zone);
1811 }
1812 }
1813 1954
1814 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1955 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1815 sc->nr_scanned = 0; 1956 sc->nr_scanned = 0;
1816 if (!priority) 1957 if (!priority)
1817 disable_swap_token(); 1958 disable_swap_token();
1818 all_unreclaimable = shrink_zones(priority, zonelist, sc); 1959 shrink_zones(priority, zonelist, sc);
1819 /* 1960 /*
1820 * Don't shrink slabs when reclaiming memory from 1961 * Don't shrink slabs when reclaiming memory from
1821 * over limit cgroups 1962 * over limit cgroups
1822 */ 1963 */
1823 if (scanning_global_lru(sc)) { 1964 if (scanning_global_lru(sc)) {
1965 unsigned long lru_pages = 0;
1966 for_each_zone_zonelist(zone, z, zonelist,
1967 gfp_zone(sc->gfp_mask)) {
1968 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1969 continue;
1970
1971 lru_pages += zone_reclaimable_pages(zone);
1972 }
1973
1824 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1974 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1825 if (reclaim_state) { 1975 if (reclaim_state) {
1826 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 1976 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1846,32 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1846 1996
1847 /* Take a nap, wait for some writeback to complete */ 1997 /* Take a nap, wait for some writeback to complete */
1848 if (!sc->hibernation_mode && sc->nr_scanned && 1998 if (!sc->hibernation_mode && sc->nr_scanned &&
1849 priority < DEF_PRIORITY - 2) 1999 priority < DEF_PRIORITY - 2) {
1850 congestion_wait(BLK_RW_ASYNC, HZ/10); 2000 struct zone *preferred_zone;
1851 }
1852
1853out:
1854 /*
1855 * Now that we've scanned all the zones at this priority level, note
1856 * that level within the zone so that the next thread which performs
1857 * scanning of this zone will immediately start out at this priority
1858 * level. This affects only the decision whether or not to bring
1859 * mapped pages onto the inactive list.
1860 */
1861 if (priority < 0)
1862 priority = 0;
1863
1864 if (scanning_global_lru(sc)) {
1865 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1866 2001
1867 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2002 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
1868 continue; 2003 NULL, &preferred_zone);
1869 2004 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
1870 zone->prev_priority = priority;
1871 } 2005 }
1872 } else 2006 }
1873 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1874 2007
2008out:
1875 delayacct_freepages_end(); 2009 delayacct_freepages_end();
1876 put_mems_allowed(); 2010 put_mems_allowed();
1877 2011
@@ -1879,7 +2013,7 @@ out:
1879 return sc->nr_reclaimed; 2013 return sc->nr_reclaimed;
1880 2014
1881 /* top priority shrink_zones still had more to do? don't OOM, then */ 2015 /* top priority shrink_zones still had more to do? don't OOM, then */
1882 if (scanning_global_lru(sc) && !all_unreclaimable) 2016 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
1883 return 1; 2017 return 1;
1884 2018
1885 return 0; 2019 return 0;
@@ -1888,6 +2022,7 @@ out:
1888unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2022unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1889 gfp_t gfp_mask, nodemask_t *nodemask) 2023 gfp_t gfp_mask, nodemask_t *nodemask)
1890{ 2024{
2025 unsigned long nr_reclaimed;
1891 struct scan_control sc = { 2026 struct scan_control sc = {
1892 .gfp_mask = gfp_mask, 2027 .gfp_mask = gfp_mask,
1893 .may_writepage = !laptop_mode, 2028 .may_writepage = !laptop_mode,
@@ -1900,7 +2035,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1900 .nodemask = nodemask, 2035 .nodemask = nodemask,
1901 }; 2036 };
1902 2037
1903 return do_try_to_free_pages(zonelist, &sc); 2038 trace_mm_vmscan_direct_reclaim_begin(order,
2039 sc.may_writepage,
2040 gfp_mask);
2041
2042 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2043
2044 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2045
2046 return nr_reclaimed;
1904} 2047}
1905 2048
1906#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2049#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1908,9 +2051,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1908unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2051unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1909 gfp_t gfp_mask, bool noswap, 2052 gfp_t gfp_mask, bool noswap,
1910 unsigned int swappiness, 2053 unsigned int swappiness,
1911 struct zone *zone, int nid) 2054 struct zone *zone)
1912{ 2055{
1913 struct scan_control sc = { 2056 struct scan_control sc = {
2057 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1914 .may_writepage = !laptop_mode, 2058 .may_writepage = !laptop_mode,
1915 .may_unmap = 1, 2059 .may_unmap = 1,
1916 .may_swap = !noswap, 2060 .may_swap = !noswap,
@@ -1918,13 +2062,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1918 .order = 0, 2062 .order = 0,
1919 .mem_cgroup = mem, 2063 .mem_cgroup = mem,
1920 }; 2064 };
1921 nodemask_t nm = nodemask_of_node(nid);
1922
1923 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2065 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1924 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2066 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1925 sc.nodemask = &nm; 2067
1926 sc.nr_reclaimed = 0; 2068 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
1927 sc.nr_scanned = 0; 2069 sc.may_writepage,
2070 sc.gfp_mask);
2071
1928 /* 2072 /*
1929 * NOTE: Although we can get the priority field, using it 2073 * NOTE: Although we can get the priority field, using it
1930 * here is not a good idea, since it limits the pages we can scan. 2074 * here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +2077,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1933 * the priority and make it zero. 2077 * the priority and make it zero.
1934 */ 2078 */
1935 shrink_zone(0, zone, &sc); 2079 shrink_zone(0, zone, &sc);
2080
2081 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2082
1936 return sc.nr_reclaimed; 2083 return sc.nr_reclaimed;
1937} 2084}
1938 2085
@@ -1942,6 +2089,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1942 unsigned int swappiness) 2089 unsigned int swappiness)
1943{ 2090{
1944 struct zonelist *zonelist; 2091 struct zonelist *zonelist;
2092 unsigned long nr_reclaimed;
1945 struct scan_control sc = { 2093 struct scan_control sc = {
1946 .may_writepage = !laptop_mode, 2094 .may_writepage = !laptop_mode,
1947 .may_unmap = 1, 2095 .may_unmap = 1,
@@ -1956,7 +2104,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1956 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2104 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1957 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2105 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1958 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2106 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1959 return do_try_to_free_pages(zonelist, &sc); 2107
2108 trace_mm_vmscan_memcg_reclaim_begin(0,
2109 sc.may_writepage,
2110 sc.gfp_mask);
2111
2112 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2113
2114 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2115
2116 return nr_reclaimed;
1960} 2117}
1961#endif 2118#endif
1962 2119
@@ -2028,22 +2185,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2028 .order = order, 2185 .order = order,
2029 .mem_cgroup = NULL, 2186 .mem_cgroup = NULL,
2030 }; 2187 };
2031 /*
2032 * temp_priority is used to remember the scanning priority at which
2033 * this zone was successfully refilled to
2034 * free_pages == high_wmark_pages(zone).
2035 */
2036 int temp_priority[MAX_NR_ZONES];
2037
2038loop_again: 2188loop_again:
2039 total_scanned = 0; 2189 total_scanned = 0;
2040 sc.nr_reclaimed = 0; 2190 sc.nr_reclaimed = 0;
2041 sc.may_writepage = !laptop_mode; 2191 sc.may_writepage = !laptop_mode;
2042 count_vm_event(PAGEOUTRUN); 2192 count_vm_event(PAGEOUTRUN);
2043 2193
2044 for (i = 0; i < pgdat->nr_zones; i++)
2045 temp_priority[i] = DEF_PRIORITY;
2046
2047 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2194 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2048 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2195 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2049 unsigned long lru_pages = 0; 2196 unsigned long lru_pages = 0;
@@ -2103,7 +2250,6 @@ loop_again:
2103 for (i = 0; i <= end_zone; i++) { 2250 for (i = 0; i <= end_zone; i++) {
2104 struct zone *zone = pgdat->node_zones + i; 2251 struct zone *zone = pgdat->node_zones + i;
2105 int nr_slab; 2252 int nr_slab;
2106 int nid, zid;
2107 2253
2108 if (!populated_zone(zone)) 2254 if (!populated_zone(zone))
2109 continue; 2255 continue;
@@ -2111,18 +2257,14 @@ loop_again:
2111 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2257 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2112 continue; 2258 continue;
2113 2259
2114 temp_priority[i] = priority;
2115 sc.nr_scanned = 0; 2260 sc.nr_scanned = 0;
2116 note_zone_scanning_priority(zone, priority);
2117 2261
2118 nid = pgdat->node_id;
2119 zid = zone_idx(zone);
2120 /* 2262 /*
2121 * Call soft limit reclaim before calling shrink_zone. 2263 * Call soft limit reclaim before calling shrink_zone.
2122 * For now we ignore the return value 2264 * For now we ignore the return value
2123 */ 2265 */
2124 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, 2266 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2125 nid, zid); 2267
2126 /* 2268 /*
2127 * We put equal pressure on every zone, unless one 2269 * We put equal pressure on every zone, unless one
2128 * zone has way too many pages free already. 2270 * zone has way too many pages free already.
@@ -2137,8 +2279,7 @@ loop_again:
2137 total_scanned += sc.nr_scanned; 2279 total_scanned += sc.nr_scanned;
2138 if (zone->all_unreclaimable) 2280 if (zone->all_unreclaimable)
2139 continue; 2281 continue;
2140 if (nr_slab == 0 && 2282 if (nr_slab == 0 && !zone_reclaimable(zone))
2141 zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
2142 zone->all_unreclaimable = 1; 2283 zone->all_unreclaimable = 1;
2143 /* 2284 /*
2144 * If we've done a decent amount of scanning and 2285 * If we've done a decent amount of scanning and
@@ -2160,6 +2301,15 @@ loop_again:
2160 if (!zone_watermark_ok(zone, order, 2301 if (!zone_watermark_ok(zone, order,
2161 min_wmark_pages(zone), end_zone, 0)) 2302 min_wmark_pages(zone), end_zone, 0))
2162 has_under_min_watermark_zone = 1; 2303 has_under_min_watermark_zone = 1;
2304 } else {
2305 /*
2306 * If a zone reaches its high watermark,
2307 * consider it to be no longer congested. It's
2308 * possible there are dirty pages backed by
2309 * congested BDIs but as pressure is relieved,
2310 * spectulatively avoid congestion waits
2311 */
2312 zone_clear_flag(zone, ZONE_CONGESTED);
2163 } 2313 }
2164 2314
2165 } 2315 }
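This hunk is the second half of the ZONE_CONGESTED protocol: shrink_page_list() sets the flag when every dirty page it met was backed by a congested BDI, and balance_pgdat() clears it once the zone is back at its high watermark, so wait_iff_congested() only throttles reclaimers while the pressure is real. A toy model of that set/clear lifecycle (a plain bool stands in for the atomic zone flag, and the guard against a pass with no dirty pages is an addition of the model):

/* Toy model of the ZONE_CONGESTED set/clear protocol. */
#include <stdio.h>
#include <stdbool.h>

struct toy_zone {
    bool congested;
    unsigned long free_pages;
    unsigned long high_wmark;
};

/* after one shrink pass: tag the zone if every dirty page was congested */
static void after_shrink(struct toy_zone *z, unsigned long nr_dirty,
                         unsigned long nr_congested)
{
    if (nr_dirty && nr_dirty == nr_congested)
        z->congested = true;
}

/* kswapd: once the high watermark is met again, stop throttling reclaimers */
static void after_balance(struct toy_zone *z)
{
    if (z->free_pages >= z->high_wmark)
        z->congested = false;
}

static void maybe_wait(const struct toy_zone *z)
{
    printf(z->congested ? "reclaimer would wait for congestion\n"
                        : "reclaimer proceeds without waiting\n");
}

int main(void)
{
    struct toy_zone z = { .congested = false, .free_pages = 100, .high_wmark = 500 };

    after_shrink(&z, 32, 32);   /* all dirty pages sat on congested BDIs */
    maybe_wait(&z);             /* -> would wait */

    z.free_pages = 800;         /* kswapd freed memory */
    after_balance(&z);
    maybe_wait(&z);             /* -> proceeds */
    return 0;
}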
@@ -2186,16 +2336,6 @@ loop_again:
2186 break; 2336 break;
2187 } 2337 }
2188out: 2338out:
2189 /*
2190 * Note within each zone the priority level at which this zone was
2191 * brought into a happy state. So that the next thread which scans this
2192 * zone will start out at that priority level.
2193 */
2194 for (i = 0; i < pgdat->nr_zones; i++) {
2195 struct zone *zone = pgdat->node_zones + i;
2196
2197 zone->prev_priority = temp_priority[i];
2198 }
2199 if (!all_zones_ok) { 2339 if (!all_zones_ok) {
2200 cond_resched(); 2340 cond_resched();
2201 2341
@@ -2299,9 +2439,10 @@ static int kswapd(void *p)
2299 * premature sleep. If not, then go fully 2439 * premature sleep. If not, then go fully
2300 * to sleep until explicitly woken up 2440 * to sleep until explicitly woken up
2301 */ 2441 */
2302 if (!sleeping_prematurely(pgdat, order, remaining)) 2442 if (!sleeping_prematurely(pgdat, order, remaining)) {
2443 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2303 schedule(); 2444 schedule();
2304 else { 2445 } else {
2305 if (remaining) 2446 if (remaining)
2306 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 2447 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2307 else 2448 else
@@ -2321,8 +2462,10 @@ static int kswapd(void *p)
2321 * We can speed up thawing tasks if we don't call balance_pgdat 2462 * We can speed up thawing tasks if we don't call balance_pgdat
2322 * after returning from the refrigerator 2463 * after returning from the refrigerator
2323 */ 2464 */
2324 if (!ret) 2465 if (!ret) {
2466 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2325 balance_pgdat(pgdat, order); 2467 balance_pgdat(pgdat, order);
2468 }
2326 } 2469 }
2327 return 0; 2470 return 0;
2328} 2471}
@@ -2342,6 +2485,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2342 return; 2485 return;
2343 if (pgdat->kswapd_max_order < order) 2486 if (pgdat->kswapd_max_order < order)
2344 pgdat->kswapd_max_order = order; 2487 pgdat->kswapd_max_order = order;
2488 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2345 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2489 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2346 return; 2490 return;
2347 if (!waitqueue_active(&pgdat->kswapd_wait)) 2491 if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2734,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2590 .swappiness = vm_swappiness, 2734 .swappiness = vm_swappiness,
2591 .order = order, 2735 .order = order,
2592 }; 2736 };
2593 unsigned long slab_reclaimable; 2737 unsigned long nr_slab_pages0, nr_slab_pages1;
2594 2738
2595 disable_swap_token();
2596 cond_resched(); 2739 cond_resched();
2597 /* 2740 /*
2598 * We need to be able to allocate from the reserves for RECLAIM_SWAP 2741 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2754,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2611 */ 2754 */
2612 priority = ZONE_RECLAIM_PRIORITY; 2755 priority = ZONE_RECLAIM_PRIORITY;
2613 do { 2756 do {
2614 note_zone_scanning_priority(zone, priority);
2615 shrink_zone(priority, zone, &sc); 2757 shrink_zone(priority, zone, &sc);
2616 priority--; 2758 priority--;
2617 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 2759 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2618 } 2760 }
2619 2761
2620 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2762 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2621 if (slab_reclaimable > zone->min_slab_pages) { 2763 if (nr_slab_pages0 > zone->min_slab_pages) {
2622 /* 2764 /*
2623 * shrink_slab() does not currently allow us to determine how 2765 * shrink_slab() does not currently allow us to determine how
2624 * many pages were freed in this zone. So we take the current 2766 * many pages were freed in this zone. So we take the current
@@ -2629,17 +2771,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2629 * Note that shrink_slab will free memory on all zones and may 2771 * Note that shrink_slab will free memory on all zones and may
2630 * take a long time. 2772 * take a long time.
2631 */ 2773 */
2632 while (shrink_slab(sc.nr_scanned, gfp_mask, order) && 2774 for (;;) {
2633 zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 2775 unsigned long lru_pages = zone_reclaimable_pages(zone);
2634 slab_reclaimable - nr_pages) 2776
2635 ; 2777 /* No reclaimable slab or very low memory pressure */
2778 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
2779 break;
2780
2781 /* Freed enough memory */
2782 nr_slab_pages1 = zone_page_state(zone,
2783 NR_SLAB_RECLAIMABLE);
2784 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
2785 break;
2786 }
2636 2787
2637 /* 2788 /*
2638 * Update nr_reclaimed by the number of slab pages we 2789 * Update nr_reclaimed by the number of slab pages we
2639 * reclaimed from this zone. 2790 * reclaimed from this zone.
2640 */ 2791 */
2641 sc.nr_reclaimed += slab_reclaimable - 2792 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2642 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2793 if (nr_slab_pages1 < nr_slab_pages0)
2794 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
2643 } 2795 }
2644 2796
2645 p->reclaim_state = NULL; 2797 p->reclaim_state = NULL;
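The rewritten slab-shrinking loop in __zone_reclaim() above replaces the old open-coded while with an explicit retry: keep calling the shrinker until it either reports nothing left to reclaim or enough slab pages have been released, measured by comparing NR_SLAB_RECLAIMABLE before and after. The control flow reduced to a standalone loop (the shrinker below is an invented stand-in):

/* Sketch of the retry-until-progress loop: stop when the (fake) shrinker
 * runs dry or when enough slab pages have been released. */
#include <stdio.h>

static unsigned long slab_pages = 1000;    /* stands in for NR_SLAB_RECLAIMABLE */

/* invented shrinker: frees up to 64 pages, returns 0 when nothing is left */
static unsigned long fake_shrink_slab(void)
{
    unsigned long freed = slab_pages < 64 ? slab_pages : 64;

    slab_pages -= freed;
    return freed;
}

int main(void)
{
    unsigned long nr_pages = 256;    /* reclaim target */
    unsigned long before = slab_pages;
    unsigned long now;

    for (;;) {
        if (!fake_shrink_slab())
            break;                   /* no reclaimable slab left */
        now = slab_pages;
        if (now + nr_pages <= before)
            break;                   /* freed enough for this request */
    }

    printf("reclaimed %lu slab pages\n", before - slab_pages);
    return 0;
}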
@@ -2898,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2898 return 0; 3050 return 0;
2899} 3051}
2900 3052
3053#ifdef CONFIG_NUMA
2901/* 3054/*
2902 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3055 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2903 * a specified node's per zone unevictable lists for evictable pages. 3056 * a specified node's per zone unevictable lists for evictable pages.
@@ -2944,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node)
2944{ 3097{
2945 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3098 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2946} 3099}
2947 3100#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941d4e7..cd2e42be7b6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,19 +17,21 @@
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h>
21#include <linux/compaction.h>
20 22
21#ifdef CONFIG_VM_EVENT_COUNTERS 23#ifdef CONFIG_VM_EVENT_COUNTERS
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
23EXPORT_PER_CPU_SYMBOL(vm_event_states); 25EXPORT_PER_CPU_SYMBOL(vm_event_states);
24 26
25static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) 27static void sum_vm_events(unsigned long *ret)
26{ 28{
27 int cpu; 29 int cpu;
28 int i; 30 int i;
29 31
30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 32 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
31 33
32 for_each_cpu(cpu, cpumask) { 34 for_each_online_cpu(cpu) {
33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 35 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
34 36
35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 37 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -45,7 +47,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
45void all_vm_events(unsigned long *ret) 47void all_vm_events(unsigned long *ret)
46{ 48{
47 get_online_cpus(); 49 get_online_cpus();
48 sum_vm_events(ret, cpu_online_mask); 50 sum_vm_events(ret);
49 put_online_cpus(); 51 put_online_cpus();
50} 52}
51EXPORT_SYMBOL_GPL(all_vm_events); 53EXPORT_SYMBOL_GPL(all_vm_events);
@@ -138,11 +140,24 @@ static void refresh_zone_stat_thresholds(void)
138 int threshold; 140 int threshold;
139 141
140 for_each_populated_zone(zone) { 142 for_each_populated_zone(zone) {
143 unsigned long max_drift, tolerate_drift;
144
141 threshold = calculate_threshold(zone); 145 threshold = calculate_threshold(zone);
142 146
143 for_each_online_cpu(cpu) 147 for_each_online_cpu(cpu)
144 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 148 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
145 = threshold; 149 = threshold;
150
151 /*
152 * Only set percpu_drift_mark if there is a danger that
153 * NR_FREE_PAGES reports the low watermark is ok when in fact
154 * the min watermark could be breached by an allocation
155 */
156 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
157 max_drift = num_online_cpus() * threshold;
158 if (max_drift > tolerate_drift)
159 zone->percpu_drift_mark = high_wmark_pages(zone) +
160 max_drift;
146 } 161 }
147} 162}
148 163
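The new percpu_drift_mark records the point below which the cheap NR_FREE_PAGES reading can no longer be trusted: each online CPU may hold up to `threshold` pages that have not been folded into the global counter, so the total error can reach num_online_cpus() * threshold. If that possible drift exceeds the gap between the low and min watermarks, an apparently safe reading could hide a breached min watermark. A worked example of the arithmetic, using invented watermark and CPU numbers:

#include <stdio.h>

int main(void)
{
	unsigned long min_wmark = 1000, low_wmark = 1250, high_wmark = 1500;
	unsigned long threshold = 70;        /* per-CPU counter threshold */
	unsigned long online_cpus = 8;
	unsigned long percpu_drift_mark = 0;

	unsigned long tolerate_drift = low_wmark - min_wmark;   /* 250 */
	unsigned long max_drift = online_cpus * threshold;      /* 560 */

	/* 560 > 250: a "low watermark is fine" reading could really be
	 * below min, so mark the level at which an exact count is needed */
	if (max_drift > tolerate_drift)
		percpu_drift_mark = high_wmark + max_drift;      /* 2060 */

	printf("percpu_drift_mark = %lu\n", percpu_drift_mark);
	return 0;
}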
@@ -381,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
381#endif 396#endif
382 397
383#ifdef CONFIG_COMPACTION 398#ifdef CONFIG_COMPACTION
399
384struct contig_page_info { 400struct contig_page_info {
385 unsigned long free_pages; 401 unsigned long free_pages;
386 unsigned long free_blocks_total; 402 unsigned long free_blocks_total;
@@ -732,6 +748,11 @@ static const char * const vmstat_text[] = {
732 "nr_isolated_anon", 748 "nr_isolated_anon",
733 "nr_isolated_file", 749 "nr_isolated_file",
734 "nr_shmem", 750 "nr_shmem",
751 "nr_dirtied",
752 "nr_written",
753 "nr_dirty_threshold",
754 "nr_dirty_background_threshold",
755
735#ifdef CONFIG_NUMA 756#ifdef CONFIG_NUMA
736 "numa_hit", 757 "numa_hit",
737 "numa_miss", 758 "numa_miss",
@@ -813,7 +834,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
813 "\n scanned %lu" 834 "\n scanned %lu"
814 "\n spanned %lu" 835 "\n spanned %lu"
815 "\n present %lu", 836 "\n present %lu",
816 zone_page_state(zone, NR_FREE_PAGES), 837 zone_nr_free_pages(zone),
817 min_wmark_pages(zone), 838 min_wmark_pages(zone),
818 low_wmark_pages(zone), 839 low_wmark_pages(zone),
819 high_wmark_pages(zone), 840 high_wmark_pages(zone),
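/proc/zoneinfo now reports zone_nr_free_pages(zone) instead of the raw counter. The helper lives elsewhere in this diff (mm/mmzone.c) and its exact body may differ, but the idea it serves is: when the cheap reading has fallen under percpu_drift_mark, fold the per-CPU deltas back in to get a trustworthy figure. A hypothetical userspace illustration of that idea only:

#include <stdio.h>

#define NR_CPUS 4

static long global_free_pages = 2000;               /* the shared counter  */
static long pcp_diff[NR_CPUS] = { 30, -10, 25, 5 }; /* not yet folded back */
static unsigned long percpu_drift_mark = 2060;

static unsigned long estimated_free_pages(void)
{
	long nr = global_free_pages;
	int cpu;

	if ((unsigned long)nr >= percpu_drift_mark)
		return nr;                       /* cheap reading is fine */

	for (cpu = 0; cpu < NR_CPUS; cpu++)      /* expensive exact sum   */
		nr += pcp_diff[cpu];
	return nr > 0 ? nr : 0;
}

int main(void)
{
	printf("free pages estimate: %lu\n", estimated_free_pages());
	return 0;
}

Here the raw value 2000 sits under the drift mark, so the deltas (+50) are added and 2050 is reported instead.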
@@ -853,11 +874,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
853 } 874 }
854 seq_printf(m, 875 seq_printf(m,
855 "\n all_unreclaimable: %u" 876 "\n all_unreclaimable: %u"
856 "\n prev_priority: %i"
857 "\n start_pfn: %lu" 877 "\n start_pfn: %lu"
858 "\n inactive_ratio: %u", 878 "\n inactive_ratio: %u",
859 zone->all_unreclaimable, 879 zone->all_unreclaimable,
860 zone->prev_priority,
861 zone->zone_start_pfn, 880 zone->zone_start_pfn,
862 zone->inactive_ratio); 881 zone->inactive_ratio);
863 seq_putc(m, '\n'); 882 seq_putc(m, '\n');
@@ -893,36 +912,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
893 .release = seq_release, 912 .release = seq_release,
894}; 913};
895 914
915enum writeback_stat_item {
916 NR_DIRTY_THRESHOLD,
917 NR_DIRTY_BG_THRESHOLD,
918 NR_VM_WRITEBACK_STAT_ITEMS,
919};
920
896static void *vmstat_start(struct seq_file *m, loff_t *pos) 921static void *vmstat_start(struct seq_file *m, loff_t *pos)
897{ 922{
898 unsigned long *v; 923 unsigned long *v;
899#ifdef CONFIG_VM_EVENT_COUNTERS 924 int i, stat_items_size;
900 unsigned long *e;
901#endif
902 int i;
903 925
904 if (*pos >= ARRAY_SIZE(vmstat_text)) 926 if (*pos >= ARRAY_SIZE(vmstat_text))
905 return NULL; 927 return NULL;
928 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
929 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
906 930
907#ifdef CONFIG_VM_EVENT_COUNTERS 931#ifdef CONFIG_VM_EVENT_COUNTERS
908 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) 932 stat_items_size += sizeof(struct vm_event_state);
909 + sizeof(struct vm_event_state), GFP_KERNEL);
910#else
911 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
912 GFP_KERNEL);
913#endif 933#endif
934
935 v = kmalloc(stat_items_size, GFP_KERNEL);
914 m->private = v; 936 m->private = v;
915 if (!v) 937 if (!v)
916 return ERR_PTR(-ENOMEM); 938 return ERR_PTR(-ENOMEM);
917 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 939 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
918 v[i] = global_page_state(i); 940 v[i] = global_page_state(i);
941 v += NR_VM_ZONE_STAT_ITEMS;
942
943 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
944 v + NR_DIRTY_THRESHOLD);
945 v += NR_VM_WRITEBACK_STAT_ITEMS;
946
919#ifdef CONFIG_VM_EVENT_COUNTERS 947#ifdef CONFIG_VM_EVENT_COUNTERS
920 e = v + NR_VM_ZONE_STAT_ITEMS; 948 all_vm_events(v);
921 all_vm_events(e); 949 v[PGPGIN] /= 2; /* sectors -> kbytes */
922 e[PGPGIN] /= 2; /* sectors -> kbytes */ 950 v[PGPGOUT] /= 2;
923 e[PGPGOUT] /= 2;
924#endif 951#endif
925 return v + *pos; 952 return m->private + *pos;
926} 953}
927 954
928static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 955static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
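vmstat_start() now sizes a single buffer for all three groups of numbers and fills it in the same order that vmstat_text[] names them: the zone counters, then the two dirty thresholds obtained from global_dirty_limits() (the new nr_dirty_threshold and nr_dirty_background_threshold entries), then the event counters. Because the fill cursor v is advanced as it goes, the function must return m->private + *pos rather than v + *pos. A userspace sketch of that layout and pointer handling, with invented sizes and values:

#include <stdio.h>
#include <stdlib.h>

#define NR_ZONE_ITEMS      3
#define NR_WRITEBACK_ITEMS 2   /* dirty threshold, background threshold */
#define NR_EVENT_ITEMS     2

static unsigned long *stat_buffer_start(long pos)
{
	int stat_items_size = (NR_ZONE_ITEMS + NR_WRITEBACK_ITEMS +
			       NR_EVENT_ITEMS) * sizeof(unsigned long);
	unsigned long *base = malloc(stat_items_size);
	unsigned long *v = base;
	int i;

	if (!base)
		return NULL;

	for (i = 0; i < NR_ZONE_ITEMS; i++)      /* zone counters first   */
		v[i] = 100 + i;
	v += NR_ZONE_ITEMS;

	v[0] = 5000;                             /* dirty threshold       */
	v[1] = 2500;                             /* background threshold  */
	v += NR_WRITEBACK_ITEMS;

	for (i = 0; i < NR_EVENT_ITEMS; i++)     /* event counters last   */
		v[i] = 7 * (i + 1);

	return base + pos;                       /* not the moved cursor! */
}

int main(void)
{
	unsigned long *p = stat_buffer_start(0);

	if (p)
		printf("first item %lu, dirty threshold %lu\n", p[0], p[3]);
	free(p);
	return 0;
}

Returning the advanced cursor instead of base + pos would hand the seq_file an offset into the event counters, which is exactly the kind of mismatch the rewrite avoids.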
@@ -1000,6 +1027,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1000 switch (action) { 1027 switch (action) {
1001 case CPU_ONLINE: 1028 case CPU_ONLINE:
1002 case CPU_ONLINE_FROZEN: 1029 case CPU_ONLINE_FROZEN:
1030 refresh_zone_stat_thresholds();
1003 start_cpu_timer(cpu); 1031 start_cpu_timer(cpu);
1004 node_set_state(cpu_to_node(cpu), N_CPU); 1032 node_set_state(cpu_to_node(cpu), N_CPU);
1005 break; 1033 break;
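Calling refresh_zone_stat_thresholds() from the CPU_ONLINE path matters because both the per-CPU stat threshold and the new percpu_drift_mark scale with the number of online CPUs, so they go stale the moment a CPU is brought up. A hypothetical userspace model of that ordering (recompute before the per-CPU work starts), not the notifier itself:

#include <stdio.h>

static unsigned long threshold = 70;       /* per-CPU counter threshold */
static unsigned long online_cpus = 4;
static unsigned long max_drift;

static void refresh_thresholds(void)
{
	max_drift = online_cpus * threshold;   /* mirrors the hunk above */
}

static void cpu_online_event(void)
{
	online_cpus++;
	refresh_thresholds();                  /* recompute before use   */
}

int main(void)
{
	refresh_thresholds();
	printf("max drift with %lu CPUs: %lu\n", online_cpus, max_drift);
	cpu_online_event();
	printf("max drift with %lu CPUs: %lu\n", online_cpus, max_drift);
	return 0;
}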