Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c      448
-rw-r--r--  mm/filemap.c            2
-rw-r--r--  mm/hugetlb.c          110
-rw-r--r--  mm/hwpoison-inject.c   15
-rw-r--r--  mm/init-mm.c            6
-rw-r--r--  mm/kmemleak.c         100
-rw-r--r--  mm/ksm.c               71
-rw-r--r--  mm/memblock.c           2
-rw-r--r--  mm/memcontrol.c       462
-rw-r--r--  mm/memory-failure.c   120
-rw-r--r--  mm/memory.c            51
-rw-r--r--  mm/mempolicy.c         82
-rw-r--r--  mm/migrate.c           10
-rw-r--r--  mm/mlock.c             19
-rw-r--r--  mm/mmap.c              65
-rw-r--r--  mm/nommu.c             12
-rw-r--r--  mm/oom_kill.c         687
-rw-r--r--  mm/page-writeback.c   255
-rw-r--r--  mm/page_alloc.c        33
-rw-r--r--  mm/page_io.c            2
-rw-r--r--  mm/rmap.c             186
-rw-r--r--  mm/shmem.c            139
-rw-r--r--  mm/slab.c               2
-rw-r--r--  mm/swapfile.c         100
-rw-r--r--  mm/truncate.c          38
-rw-r--r--  mm/util.c              11
-rw-r--r--  mm/vmalloc.c           11
-rw-r--r--  mm/vmscan.c           548
-rw-r--r--  mm/vmstat.c             8
29 files changed, 2185 insertions, 1410 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f9fd3dd3916b..eaa4a5bbe063 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/device.h> 12#include <linux/device.h>
13#include <trace/events/writeback.h>
13 14
14static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
15 16
@@ -49,8 +50,6 @@ static struct timer_list sync_supers_timer;
49static int bdi_sync_supers(void *); 50static int bdi_sync_supers(void *);
50static void sync_supers_timer_fn(unsigned long); 51static void sync_supers_timer_fn(unsigned long);
51 52
52static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
53
54#ifdef CONFIG_DEBUG_FS 53#ifdef CONFIG_DEBUG_FS
55#include <linux/debugfs.h> 54#include <linux/debugfs.h>
56#include <linux/seq_file.h> 55#include <linux/seq_file.h>
@@ -65,31 +64,25 @@ static void bdi_debug_init(void)
65static int bdi_debug_stats_show(struct seq_file *m, void *v) 64static int bdi_debug_stats_show(struct seq_file *m, void *v)
66{ 65{
67 struct backing_dev_info *bdi = m->private; 66 struct backing_dev_info *bdi = m->private;
68 struct bdi_writeback *wb; 67 struct bdi_writeback *wb = &bdi->wb;
69 unsigned long background_thresh; 68 unsigned long background_thresh;
70 unsigned long dirty_thresh; 69 unsigned long dirty_thresh;
71 unsigned long bdi_thresh; 70 unsigned long bdi_thresh;
72 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 71 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
73 struct inode *inode; 72 struct inode *inode;
74 73
75 /*
76 * inode lock is enough here, the bdi->wb_list is protected by
77 * RCU on the reader side
78 */
79 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 74 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
80 spin_lock(&inode_lock); 75 spin_lock(&inode_lock);
81 list_for_each_entry(wb, &bdi->wb_list, list) { 76 list_for_each_entry(inode, &wb->b_dirty, i_list)
82 nr_wb++; 77 nr_dirty++;
83 list_for_each_entry(inode, &wb->b_dirty, i_list) 78 list_for_each_entry(inode, &wb->b_io, i_list)
84 nr_dirty++; 79 nr_io++;
85 list_for_each_entry(inode, &wb->b_io, i_list) 80 list_for_each_entry(inode, &wb->b_more_io, i_list)
86 nr_io++; 81 nr_more_io++;
87 list_for_each_entry(inode, &wb->b_more_io, i_list)
88 nr_more_io++;
89 }
90 spin_unlock(&inode_lock); 82 spin_unlock(&inode_lock);
91 83
92 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 84 global_dirty_limits(&background_thresh, &dirty_thresh);
85 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
93 86
94#define K(x) ((x) << (PAGE_SHIFT - 10)) 87#define K(x) ((x) << (PAGE_SHIFT - 10))
95 seq_printf(m, 88 seq_printf(m,
@@ -98,19 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
98 "BdiDirtyThresh: %8lu kB\n" 91 "BdiDirtyThresh: %8lu kB\n"
99 "DirtyThresh: %8lu kB\n" 92 "DirtyThresh: %8lu kB\n"
100 "BackgroundThresh: %8lu kB\n" 93 "BackgroundThresh: %8lu kB\n"
101 "WritebackThreads: %8lu\n"
102 "b_dirty: %8lu\n" 94 "b_dirty: %8lu\n"
103 "b_io: %8lu\n" 95 "b_io: %8lu\n"
104 "b_more_io: %8lu\n" 96 "b_more_io: %8lu\n"
105 "bdi_list: %8u\n" 97 "bdi_list: %8u\n"
106 "state: %8lx\n" 98 "state: %8lx\n",
107 "wb_list: %8u\n",
108 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 99 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
109 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 100 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
110 K(bdi_thresh), K(dirty_thresh), 101 K(bdi_thresh), K(dirty_thresh),
111 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, 102 K(background_thresh), nr_dirty, nr_io, nr_more_io,
112 !list_empty(&bdi->bdi_list), bdi->state, 103 !list_empty(&bdi->bdi_list), bdi->state);
113 !list_empty(&bdi->wb_list));
114#undef K 104#undef K
115 105
116 return 0; 106 return 0;
@@ -247,7 +237,6 @@ static int __init default_bdi_init(void)
247 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); 237 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
248 BUG_ON(IS_ERR(sync_supers_tsk)); 238 BUG_ON(IS_ERR(sync_supers_tsk));
249 239
250 init_timer(&sync_supers_timer);
251 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 240 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
252 bdi_arm_supers_timer(); 241 bdi_arm_supers_timer();
253 242
@@ -259,77 +248,6 @@ static int __init default_bdi_init(void)
259} 248}
260subsys_initcall(default_bdi_init); 249subsys_initcall(default_bdi_init);
261 250
262static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
263{
264 memset(wb, 0, sizeof(*wb));
265
266 wb->bdi = bdi;
267 wb->last_old_flush = jiffies;
268 INIT_LIST_HEAD(&wb->b_dirty);
269 INIT_LIST_HEAD(&wb->b_io);
270 INIT_LIST_HEAD(&wb->b_more_io);
271}
272
273static void bdi_task_init(struct backing_dev_info *bdi,
274 struct bdi_writeback *wb)
275{
276 struct task_struct *tsk = current;
277
278 spin_lock(&bdi->wb_lock);
279 list_add_tail_rcu(&wb->list, &bdi->wb_list);
280 spin_unlock(&bdi->wb_lock);
281
282 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
283 set_freezable();
284
285 /*
286 * Our parent may run at a different priority, just set us to normal
287 */
288 set_user_nice(tsk, 0);
289}
290
291static int bdi_start_fn(void *ptr)
292{
293 struct bdi_writeback *wb = ptr;
294 struct backing_dev_info *bdi = wb->bdi;
295 int ret;
296
297 /*
298 * Add us to the active bdi_list
299 */
300 spin_lock_bh(&bdi_lock);
301 list_add_rcu(&bdi->bdi_list, &bdi_list);
302 spin_unlock_bh(&bdi_lock);
303
304 bdi_task_init(bdi, wb);
305
306 /*
307 * Clear pending bit and wakeup anybody waiting to tear us down
308 */
309 clear_bit(BDI_pending, &bdi->state);
310 smp_mb__after_clear_bit();
311 wake_up_bit(&bdi->state, BDI_pending);
312
313 ret = bdi_writeback_task(wb);
314
315 /*
316 * Remove us from the list
317 */
318 spin_lock(&bdi->wb_lock);
319 list_del_rcu(&wb->list);
320 spin_unlock(&bdi->wb_lock);
321
322 /*
323 * Flush any work that raced with us exiting. No new work
324 * will be added, since this bdi isn't discoverable anymore.
325 */
326 if (!list_empty(&bdi->work_list))
327 wb_do_writeback(wb, 1);
328
329 wb->task = NULL;
330 return ret;
331}
332
333int bdi_has_dirty_io(struct backing_dev_info *bdi) 251int bdi_has_dirty_io(struct backing_dev_info *bdi)
334{ 252{
335 return wb_has_dirty_io(&bdi->wb); 253 return wb_has_dirty_io(&bdi->wb);
@@ -348,10 +266,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
348} 266}
349 267
350/* 268/*
351 * kupdated() used to do this. We cannot do it from the bdi_forker_task() 269 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
352 * or we risk deadlocking on ->s_umount. The longer term solution would be 270 * or we risk deadlocking on ->s_umount. The longer term solution would be
353 * to implement sync_supers_bdi() or similar and simply do it from the 271 * to implement sync_supers_bdi() or similar and simply do it from the
354 * bdi writeback tasks individually. 272 * bdi writeback thread individually.
355 */ 273 */
356static int bdi_sync_supers(void *unused) 274static int bdi_sync_supers(void *unused)
357{ 275{
@@ -387,144 +305,198 @@ static void sync_supers_timer_fn(unsigned long unused)
387 bdi_arm_supers_timer(); 305 bdi_arm_supers_timer();
388} 306}
389 307
390static int bdi_forker_task(void *ptr) 308static void wakeup_timer_fn(unsigned long data)
309{
310 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
311
312 spin_lock_bh(&bdi->wb_lock);
313 if (bdi->wb.task) {
314 trace_writeback_wake_thread(bdi);
315 wake_up_process(bdi->wb.task);
316 } else {
317 /*
318 * When bdi tasks are inactive for long time, they are killed.
319 * In this case we have to wake-up the forker thread which
320 * should create and run the bdi thread.
321 */
322 trace_writeback_wake_forker_thread(bdi);
323 wake_up_process(default_backing_dev_info.wb.task);
324 }
325 spin_unlock_bh(&bdi->wb_lock);
326}
327
328/*
329 * This function is used when the first inode for this bdi is marked dirty. It
330 * wakes-up the corresponding bdi thread which should then take care of the
331 * periodic background write-out of dirty inodes. Since the write-out would
332 * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
333 * set up a timer which wakes the bdi thread up later.
334 *
335 * Note, we wouldn't bother setting up the timer, but this function is on the
336 * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
337 * by delaying the wake-up.
338 */
339void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
340{
341 unsigned long timeout;
342
343 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
344 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
345}
346
347/*
348 * Calculate the longest interval (jiffies) bdi threads are allowed to be
349 * inactive.
350 */
351static unsigned long bdi_longest_inactive(void)
352{
353 unsigned long interval;
354
355 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
356 return max(5UL * 60 * HZ, interval);
357}
358
359static int bdi_forker_thread(void *ptr)
391{ 360{
392 struct bdi_writeback *me = ptr; 361 struct bdi_writeback *me = ptr;
393 362
394 bdi_task_init(me->bdi, me); 363 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
364 set_freezable();
365
366 /*
367 * Our parent may run at a different priority, just set us to normal
368 */
369 set_user_nice(current, 0);
395 370
396 for (;;) { 371 for (;;) {
397 struct backing_dev_info *bdi, *tmp; 372 struct task_struct *task = NULL;
398 struct bdi_writeback *wb; 373 struct backing_dev_info *bdi;
374 enum {
375 NO_ACTION, /* Nothing to do */
376 FORK_THREAD, /* Fork bdi thread */
377 KILL_THREAD, /* Kill inactive bdi thread */
378 } action = NO_ACTION;
399 379
400 /* 380 /*
401 * Temporary measure, we want to make sure we don't see 381 * Temporary measure, we want to make sure we don't see
402 * dirty data on the default backing_dev_info 382 * dirty data on the default backing_dev_info
403 */ 383 */
404 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) 384 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
385 del_timer(&me->wakeup_timer);
405 wb_do_writeback(me, 0); 386 wb_do_writeback(me, 0);
387 }
406 388
407 spin_lock_bh(&bdi_lock); 389 spin_lock_bh(&bdi_lock);
390 set_current_state(TASK_INTERRUPTIBLE);
408 391
409 /* 392 list_for_each_entry(bdi, &bdi_list, bdi_list) {
410 * Check if any existing bdi's have dirty data without 393 bool have_dirty_io;
411 * a thread registered. If so, set that up. 394
412 */ 395 if (!bdi_cap_writeback_dirty(bdi) ||
413 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { 396 bdi_cap_flush_forker(bdi))
414 if (bdi->wb.task)
415 continue;
416 if (list_empty(&bdi->work_list) &&
417 !bdi_has_dirty_io(bdi))
418 continue; 397 continue;
419 398
420 bdi_add_default_flusher_task(bdi); 399 WARN(!test_bit(BDI_registered, &bdi->state),
421 } 400 "bdi %p/%s is not registered!\n", bdi, bdi->name);
422 401
423 set_current_state(TASK_INTERRUPTIBLE); 402 have_dirty_io = !list_empty(&bdi->work_list) ||
403 wb_has_dirty_io(&bdi->wb);
424 404
425 if (list_empty(&bdi_pending_list)) { 405 /*
426 unsigned long wait; 406 * If the bdi has work to do, but the thread does not
407 * exist - create it.
408 */
409 if (!bdi->wb.task && have_dirty_io) {
410 /*
411 * Set the pending bit - if someone will try to
412 * unregister this bdi - it'll wait on this bit.
413 */
414 set_bit(BDI_pending, &bdi->state);
415 action = FORK_THREAD;
416 break;
417 }
418
419 spin_lock(&bdi->wb_lock);
420
421 /*
422 * If there is no work to do and the bdi thread was
423 * inactive long enough - kill it. The wb_lock is taken
424 * to make sure no-one adds more work to this bdi and
425 * wakes the bdi thread up.
426 */
427 if (bdi->wb.task && !have_dirty_io &&
428 time_after(jiffies, bdi->wb.last_active +
429 bdi_longest_inactive())) {
430 task = bdi->wb.task;
431 bdi->wb.task = NULL;
432 spin_unlock(&bdi->wb_lock);
433 set_bit(BDI_pending, &bdi->state);
434 action = KILL_THREAD;
435 break;
436 }
437 spin_unlock(&bdi->wb_lock);
438 }
439 spin_unlock_bh(&bdi_lock);
427 440
428 spin_unlock_bh(&bdi_lock); 441 /* Keep working if default bdi still has things to do */
429 wait = msecs_to_jiffies(dirty_writeback_interval * 10); 442 if (!list_empty(&me->bdi->work_list))
430 if (wait) 443 __set_current_state(TASK_RUNNING);
431 schedule_timeout(wait); 444
445 switch (action) {
446 case FORK_THREAD:
447 __set_current_state(TASK_RUNNING);
448 task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
449 dev_name(bdi->dev));
450 if (IS_ERR(task)) {
451 /*
452 * If thread creation fails, force writeout of
453 * the bdi from the thread.
454 */
455 bdi_flush_io(bdi);
456 } else {
457 /*
458 * The spinlock makes sure we do not lose
459 * wake-ups when racing with 'bdi_queue_work()'.
460 */
461 spin_lock_bh(&bdi->wb_lock);
462 bdi->wb.task = task;
463 spin_unlock_bh(&bdi->wb_lock);
464 }
465 break;
466
467 case KILL_THREAD:
468 __set_current_state(TASK_RUNNING);
469 kthread_stop(task);
470 break;
471
472 case NO_ACTION:
473 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
474 /*
475 * There are no dirty data. The only thing we
476 * should now care about is checking for
477 * inactive bdi threads and killing them. Thus,
478 * let's sleep for longer time, save energy and
479 * be friendly for battery-driven devices.
480 */
481 schedule_timeout(bdi_longest_inactive());
432 else 482 else
433 schedule(); 483 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
434 try_to_freeze(); 484 try_to_freeze();
485 /* Back to the main loop */
435 continue; 486 continue;
436 } 487 }
437 488
438 __set_current_state(TASK_RUNNING);
439
440 /*
441 * This is our real job - check for pending entries in
442 * bdi_pending_list, and create the tasks that got added
443 */
444 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
445 bdi_list);
446 list_del_init(&bdi->bdi_list);
447 spin_unlock_bh(&bdi_lock);
448
449 wb = &bdi->wb;
450 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
451 dev_name(bdi->dev));
452 /* 489 /*
453 * If task creation fails, then readd the bdi to 490 * Clear pending bit and wakeup anybody waiting to tear us down.
454 * the pending list and force writeout of the bdi
455 * from this forker thread. That will free some memory
456 * and we can try again.
457 */ 491 */
458 if (IS_ERR(wb->task)) { 492 clear_bit(BDI_pending, &bdi->state);
459 wb->task = NULL; 493 smp_mb__after_clear_bit();
460 494 wake_up_bit(&bdi->state, BDI_pending);
461 /*
462 * Add this 'bdi' to the back, so we get
463 * a chance to flush other bdi's to free
464 * memory.
465 */
466 spin_lock_bh(&bdi_lock);
467 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
468 spin_unlock_bh(&bdi_lock);
469
470 bdi_flush_io(bdi);
471 }
472 } 495 }
473 496
474 return 0; 497 return 0;
475} 498}
476 499
477static void bdi_add_to_pending(struct rcu_head *head)
478{
479 struct backing_dev_info *bdi;
480
481 bdi = container_of(head, struct backing_dev_info, rcu_head);
482 INIT_LIST_HEAD(&bdi->bdi_list);
483
484 spin_lock(&bdi_lock);
485 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
486 spin_unlock(&bdi_lock);
487
488 /*
489 * We are now on the pending list, wake up bdi_forker_task()
490 * to finish the job and add us back to the active bdi_list
491 */
492 wake_up_process(default_backing_dev_info.wb.task);
493}
494
495/*
496 * Add the default flusher task that gets created for any bdi
497 * that has dirty data pending writeout
498 */
499void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
500{
501 if (!bdi_cap_writeback_dirty(bdi))
502 return;
503
504 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
505 printk(KERN_ERR "bdi %p/%s is not registered!\n",
506 bdi, bdi->name);
507 return;
508 }
509
510 /*
511 * Check with the helper whether to proceed adding a task. Will only
512 * abort if we two or more simultanous calls to
513 * bdi_add_default_flusher_task() occured, further additions will block
514 * waiting for previous additions to finish.
515 */
516 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
517 list_del_rcu(&bdi->bdi_list);
518
519 /*
520 * We must wait for the current RCU period to end before
521 * moving to the pending list. So schedule that operation
522 * from an RCU callback.
523 */
524 call_rcu(&bdi->rcu_head, bdi_add_to_pending);
525 }
526}
527
528/* 500/*
529 * Remove bdi from bdi_list, and ensure that it is no longer visible 501 * Remove bdi from bdi_list, and ensure that it is no longer visible
530 */ 502 */
@@ -541,23 +513,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
541 const char *fmt, ...) 513 const char *fmt, ...)
542{ 514{
543 va_list args; 515 va_list args;
544 int ret = 0;
545 struct device *dev; 516 struct device *dev;
546 517
547 if (bdi->dev) /* The driver needs to use separate queues per device */ 518 if (bdi->dev) /* The driver needs to use separate queues per device */
548 goto exit; 519 return 0;
549 520
550 va_start(args, fmt); 521 va_start(args, fmt);
551 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); 522 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
552 va_end(args); 523 va_end(args);
553 if (IS_ERR(dev)) { 524 if (IS_ERR(dev))
554 ret = PTR_ERR(dev); 525 return PTR_ERR(dev);
555 goto exit;
556 }
557
558 spin_lock_bh(&bdi_lock);
559 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
560 spin_unlock_bh(&bdi_lock);
561 526
562 bdi->dev = dev; 527 bdi->dev = dev;
563 528
@@ -569,21 +534,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
569 if (bdi_cap_flush_forker(bdi)) { 534 if (bdi_cap_flush_forker(bdi)) {
570 struct bdi_writeback *wb = &bdi->wb; 535 struct bdi_writeback *wb = &bdi->wb;
571 536
572 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", 537 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
573 dev_name(dev)); 538 dev_name(dev));
574 if (IS_ERR(wb->task)) { 539 if (IS_ERR(wb->task))
575 wb->task = NULL; 540 return PTR_ERR(wb->task);
576 ret = -ENOMEM;
577
578 bdi_remove_from_list(bdi);
579 goto exit;
580 }
581 } 541 }
582 542
583 bdi_debug_register(bdi, dev_name(dev)); 543 bdi_debug_register(bdi, dev_name(dev));
584 set_bit(BDI_registered, &bdi->state); 544 set_bit(BDI_registered, &bdi->state);
585exit: 545
586 return ret; 546 spin_lock_bh(&bdi_lock);
547 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
548 spin_unlock_bh(&bdi_lock);
549
550 trace_writeback_bdi_register(bdi);
551 return 0;
587} 552}
588EXPORT_SYMBOL(bdi_register); 553EXPORT_SYMBOL(bdi_register);
589 554
@@ -598,31 +563,29 @@ EXPORT_SYMBOL(bdi_register_dev);
598 */ 563 */
599static void bdi_wb_shutdown(struct backing_dev_info *bdi) 564static void bdi_wb_shutdown(struct backing_dev_info *bdi)
600{ 565{
601 struct bdi_writeback *wb;
602
603 if (!bdi_cap_writeback_dirty(bdi)) 566 if (!bdi_cap_writeback_dirty(bdi))
604 return; 567 return;
605 568
606 /* 569 /*
607 * If setup is pending, wait for that to complete first 570 * Make sure nobody finds us on the bdi_list anymore
608 */ 571 */
609 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 572 bdi_remove_from_list(bdi);
610 TASK_UNINTERRUPTIBLE);
611 573
612 /* 574 /*
613 * Make sure nobody finds us on the bdi_list anymore 575 * If setup is pending, wait for that to complete first
614 */ 576 */
615 bdi_remove_from_list(bdi); 577 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
578 TASK_UNINTERRUPTIBLE);
616 579
617 /* 580 /*
618 * Finally, kill the kernel threads. We don't need to be RCU 581 * Finally, kill the kernel thread. We don't need to be RCU
619 * safe anymore, since the bdi is gone from visibility. Force 582 * safe anymore, since the bdi is gone from visibility. Force
620 * unfreeze of the thread before calling kthread_stop(), otherwise 583 * unfreeze of the thread before calling kthread_stop(), otherwise
621 * it would never exet if it is currently stuck in the refrigerator. 584 * it would never exet if it is currently stuck in the refrigerator.
622 */ 585 */
623 list_for_each_entry(wb, &bdi->wb_list, list) { 586 if (bdi->wb.task) {
624 thaw_process(wb->task); 587 thaw_process(bdi->wb.task);
625 kthread_stop(wb->task); 588 kthread_stop(bdi->wb.task);
626 } 589 }
627} 590}
628 591
@@ -644,7 +607,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
644void bdi_unregister(struct backing_dev_info *bdi) 607void bdi_unregister(struct backing_dev_info *bdi)
645{ 608{
646 if (bdi->dev) { 609 if (bdi->dev) {
610 trace_writeback_bdi_unregister(bdi);
647 bdi_prune_sb(bdi); 611 bdi_prune_sb(bdi);
612 del_timer_sync(&bdi->wb.wakeup_timer);
648 613
649 if (!bdi_cap_flush_forker(bdi)) 614 if (!bdi_cap_flush_forker(bdi))
650 bdi_wb_shutdown(bdi); 615 bdi_wb_shutdown(bdi);
@@ -655,6 +620,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
655} 620}
656EXPORT_SYMBOL(bdi_unregister); 621EXPORT_SYMBOL(bdi_unregister);
657 622
623static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
624{
625 memset(wb, 0, sizeof(*wb));
626
627 wb->bdi = bdi;
628 wb->last_old_flush = jiffies;
629 INIT_LIST_HEAD(&wb->b_dirty);
630 INIT_LIST_HEAD(&wb->b_io);
631 INIT_LIST_HEAD(&wb->b_more_io);
632 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
633}
634
658int bdi_init(struct backing_dev_info *bdi) 635int bdi_init(struct backing_dev_info *bdi)
659{ 636{
660 int i, err; 637 int i, err;
@@ -666,7 +643,6 @@ int bdi_init(struct backing_dev_info *bdi)
666 bdi->max_prop_frac = PROP_FRAC_BASE; 643 bdi->max_prop_frac = PROP_FRAC_BASE;
667 spin_lock_init(&bdi->wb_lock); 644 spin_lock_init(&bdi->wb_lock);
668 INIT_LIST_HEAD(&bdi->bdi_list); 645 INIT_LIST_HEAD(&bdi->bdi_list);
669 INIT_LIST_HEAD(&bdi->wb_list);
670 INIT_LIST_HEAD(&bdi->work_list); 646 INIT_LIST_HEAD(&bdi->work_list);
671 647
672 bdi_wb_init(&bdi->wb, bdi); 648 bdi_wb_init(&bdi->wb, bdi);
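[Editor's note] The mm/backing-dev.c changes above replace the per-bdi flusher tasks and the bdi_pending_list machinery with a single bdi_forker_thread() that walks bdi_list and, on each pass, either forks a flusher thread for a bdi that has dirty IO but no thread, kills a thread that has been idle longer than bdi_longest_inactive(), or simply sleeps. A minimal userspace sketch of that three-way decision follows; the struct, fields and function names are invented for illustration and are not kernel code.

/* Illustrative sketch only -- not part of the patch. It models the
 * three-way decision bdi_forker_thread() makes for each bdi. */
#include <stdbool.h>
#include <stdio.h>

enum forker_action { NO_ACTION, FORK_THREAD, KILL_THREAD };

struct bdi_state {
	bool has_thread;	/* bdi->wb.task != NULL */
	bool has_dirty_io;	/* work queued or dirty inodes */
	unsigned long idle;	/* time since last activity */
	unsigned long max_idle;	/* bdi_longest_inactive() */
};

static enum forker_action pick_action(const struct bdi_state *b)
{
	if (!b->has_thread && b->has_dirty_io)
		return FORK_THREAD;	/* work but no thread: create one */
	if (b->has_thread && !b->has_dirty_io && b->idle > b->max_idle)
		return KILL_THREAD;	/* idle too long: reclaim the thread */
	return NO_ACTION;		/* nothing to do, just sleep */
}

int main(void)
{
	struct bdi_state busy = { false, true, 0, 300 };
	struct bdi_state idle = { true, false, 400, 300 };

	printf("%d %d\n", pick_action(&busy), pick_action(&idle));
	return 0;
}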
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9f..3d4df44e4221 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
 
 	do {
 		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
 		unsigned long offset;	/* Offset into pagecache page */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
 		void *fsdata;
 
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_count(i));
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..cc5be788a39f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/sysfs.h> 19#include <linux/sysfs.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rmap.h>
22#include <linux/swap.h>
23#include <linux/swapops.h>
21 24
22#include <asm/page.h> 25#include <asm/page.h>
23#include <asm/pgtable.h> 26#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
220 (vma->vm_pgoff >> huge_page_order(h)); 223 (vma->vm_pgoff >> huge_page_order(h));
221} 224}
222 225
226pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
227 unsigned long address)
228{
229 return vma_hugecache_offset(hstate_vma(vma), vma, address);
230}
231
223/* 232/*
224 * Return the size of the pages allocated when backing a VMA. In the majority 233 * Return the size of the pages allocated when backing a VMA. In the majority
225 * cases this will be same size as used by the page table entries. 234 * cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
552 set_page_private(page, 0); 561 set_page_private(page, 0);
553 page->mapping = NULL; 562 page->mapping = NULL;
554 BUG_ON(page_count(page)); 563 BUG_ON(page_count(page));
564 BUG_ON(page_mapcount(page));
555 INIT_LIST_HEAD(&page->lru); 565 INIT_LIST_HEAD(&page->lru);
556 566
557 spin_lock(&hugetlb_lock); 567 spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
605 return dtor == free_huge_page; 615 return dtor == free_huge_page;
606} 616}
607 617
618EXPORT_SYMBOL_GPL(PageHuge);
619
608static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 620static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
609{ 621{
610 struct page *page; 622 struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2129 entry = huge_ptep_get(src_pte); 2141 entry = huge_ptep_get(src_pte);
2130 ptepage = pte_page(entry); 2142 ptepage = pte_page(entry);
2131 get_page(ptepage); 2143 get_page(ptepage);
2144 page_dup_rmap(ptepage);
2132 set_huge_pte_at(dst, addr, dst_pte, entry); 2145 set_huge_pte_at(dst, addr, dst_pte, entry);
2133 } 2146 }
2134 spin_unlock(&src->page_table_lock); 2147 spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
2140 return -ENOMEM; 2153 return -ENOMEM;
2141} 2154}
2142 2155
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{
2158 swp_entry_t swp;
2159
2160 if (huge_pte_none(pte) || pte_present(pte))
2161 return 0;
2162 swp = pte_to_swp_entry(pte);
2163 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
2164 return 1;
2165 } else
2166 return 0;
2167}
2168
2143void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2169void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2144 unsigned long end, struct page *ref_page) 2170 unsigned long end, struct page *ref_page)
2145{ 2171{
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2198 if (huge_pte_none(pte)) 2224 if (huge_pte_none(pte))
2199 continue; 2225 continue;
2200 2226
2227 /*
2228 * HWPoisoned hugepage is already unmapped and dropped reference
2229 */
2230 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2231 continue;
2232
2201 page = pte_page(pte); 2233 page = pte_page(pte);
2202 if (pte_dirty(pte)) 2234 if (pte_dirty(pte))
2203 set_page_dirty(page); 2235 set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2207 flush_tlb_range(vma, start, end); 2239 flush_tlb_range(vma, start, end);
2208 mmu_notifier_invalidate_range_end(mm, start, end); 2240 mmu_notifier_invalidate_range_end(mm, start, end);
2209 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2241 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2242 page_remove_rmap(page);
2210 list_del(&page->lru); 2243 list_del(&page->lru);
2211 put_page(page); 2244 put_page(page);
2212 } 2245 }
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2272 return 1; 2305 return 1;
2273} 2306}
2274 2307
2308/*
2309 * Hugetlb_cow() should be called with page lock of the original hugepage held.
2310 */
2275static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2311static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2276 unsigned long address, pte_t *ptep, pte_t pte, 2312 unsigned long address, pte_t *ptep, pte_t pte,
2277 struct page *pagecache_page) 2313 struct page *pagecache_page)
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2286retry_avoidcopy: 2322retry_avoidcopy:
2287 /* If no-one else is actually using this page, avoid the copy 2323 /* If no-one else is actually using this page, avoid the copy
2288 * and just make the page writable */ 2324 * and just make the page writable */
2289 avoidcopy = (page_count(old_page) == 1); 2325 avoidcopy = (page_mapcount(old_page) == 1);
2290 if (avoidcopy) { 2326 if (avoidcopy) {
2327 if (!trylock_page(old_page)) {
2328 if (PageAnon(old_page))
2329 page_move_anon_rmap(old_page, vma, address);
2330 } else
2331 unlock_page(old_page);
2291 set_huge_ptep_writable(vma, address, ptep); 2332 set_huge_ptep_writable(vma, address, ptep);
2292 return 0; 2333 return 0;
2293 } 2334 }
@@ -2338,6 +2379,13 @@ retry_avoidcopy:
2338 return -PTR_ERR(new_page); 2379 return -PTR_ERR(new_page);
2339 } 2380 }
2340 2381
2382 /*
2383 * When the original hugepage is shared one, it does not have
2384 * anon_vma prepared.
2385 */
2386 if (unlikely(anon_vma_prepare(vma)))
2387 return VM_FAULT_OOM;
2388
2341 copy_huge_page(new_page, old_page, address, vma); 2389 copy_huge_page(new_page, old_page, address, vma);
2342 __SetPageUptodate(new_page); 2390 __SetPageUptodate(new_page);
2343 2391
@@ -2349,11 +2397,19 @@ retry_avoidcopy:
2349 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2397 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2350 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2398 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2351 /* Break COW */ 2399 /* Break COW */
2400 mmu_notifier_invalidate_range_start(mm,
2401 address & huge_page_mask(h),
2402 (address & huge_page_mask(h)) + huge_page_size(h));
2352 huge_ptep_clear_flush(vma, address, ptep); 2403 huge_ptep_clear_flush(vma, address, ptep);
2353 set_huge_pte_at(mm, address, ptep, 2404 set_huge_pte_at(mm, address, ptep,
2354 make_huge_pte(vma, new_page, 1)); 2405 make_huge_pte(vma, new_page, 1));
2406 page_remove_rmap(old_page);
2407 hugepage_add_anon_rmap(new_page, vma, address);
2355 /* Make the old page be freed below */ 2408 /* Make the old page be freed below */
2356 new_page = old_page; 2409 new_page = old_page;
2410 mmu_notifier_invalidate_range_end(mm,
2411 address & huge_page_mask(h),
2412 (address & huge_page_mask(h)) + huge_page_size(h));
2357 } 2413 }
2358 page_cache_release(new_page); 2414 page_cache_release(new_page);
2359 page_cache_release(old_page); 2415 page_cache_release(old_page);
@@ -2452,10 +2508,29 @@ retry:
2452 spin_lock(&inode->i_lock); 2508 spin_lock(&inode->i_lock);
2453 inode->i_blocks += blocks_per_huge_page(h); 2509 inode->i_blocks += blocks_per_huge_page(h);
2454 spin_unlock(&inode->i_lock); 2510 spin_unlock(&inode->i_lock);
2511 page_dup_rmap(page);
2455 } else { 2512 } else {
2456 lock_page(page); 2513 lock_page(page);
2457 page->mapping = HUGETLB_POISON; 2514 if (unlikely(anon_vma_prepare(vma))) {
2515 ret = VM_FAULT_OOM;
2516 goto backout_unlocked;
2517 }
2518 hugepage_add_new_anon_rmap(page, vma, address);
2458 } 2519 }
2520 } else {
2521 page_dup_rmap(page);
2522 }
2523
2524 /*
2525 * Since memory error handler replaces pte into hwpoison swap entry
2526 * at the time of error handling, a process which reserved but not have
2527 * the mapping to the error hugepage does not have hwpoison swap entry.
2528 * So we need to block accesses from such a process by checking
2529 * PG_hwpoison bit here.
2530 */
2531 if (unlikely(PageHWPoison(page))) {
2532 ret = VM_FAULT_HWPOISON;
2533 goto backout_unlocked;
2459 } 2534 }
2460 2535
2461 /* 2536 /*
@@ -2507,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2507 pte_t *ptep; 2582 pte_t *ptep;
2508 pte_t entry; 2583 pte_t entry;
2509 int ret; 2584 int ret;
2585 struct page *page = NULL;
2510 struct page *pagecache_page = NULL; 2586 struct page *pagecache_page = NULL;
2511 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2587 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2512 struct hstate *h = hstate_vma(vma); 2588 struct hstate *h = hstate_vma(vma);
2513 2589
2590 ptep = huge_pte_offset(mm, address);
2591 if (ptep) {
2592 entry = huge_ptep_get(ptep);
2593 if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2594 return VM_FAULT_HWPOISON;
2595 }
2596
2514 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2597 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2515 if (!ptep) 2598 if (!ptep)
2516 return VM_FAULT_OOM; 2599 return VM_FAULT_OOM;
@@ -2548,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2548 vma, address); 2631 vma, address);
2549 } 2632 }
2550 2633
2634 if (!pagecache_page) {
2635 page = pte_page(entry);
2636 lock_page(page);
2637 }
2638
2551 spin_lock(&mm->page_table_lock); 2639 spin_lock(&mm->page_table_lock);
2552 /* Check for a racing update before calling hugetlb_cow */ 2640 /* Check for a racing update before calling hugetlb_cow */
2553 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2641 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2573,6 +2661,8 @@ out_page_table_lock:
2573 if (pagecache_page) { 2661 if (pagecache_page) {
2574 unlock_page(pagecache_page); 2662 unlock_page(pagecache_page);
2575 put_page(pagecache_page); 2663 put_page(pagecache_page);
2664 } else {
2665 unlock_page(page);
2576 } 2666 }
2577 2667
2578out_mutex: 2668out_mutex:
@@ -2785,3 +2875,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2785 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2875 hugetlb_put_quota(inode->i_mapping, (chg - freed));
2786 hugetlb_acct_memory(h, -(chg - freed)); 2876 hugetlb_acct_memory(h, -(chg - freed));
2787} 2877}
2878
2879/*
2880 * This function is called from memory failure code.
2881 * Assume the caller holds page lock of the head page.
2882 */
2883void __isolate_hwpoisoned_huge_page(struct page *hpage)
2884{
2885 struct hstate *h = page_hstate(hpage);
2886 int nid = page_to_nid(hpage);
2887
2888 spin_lock(&hugetlb_lock);
2889 list_del(&hpage->lru);
2890 h->free_huge_pages--;
2891 h->free_huge_pages_node[nid]--;
2892 spin_unlock(&hugetlb_lock);
2893}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1f..0948f1072d6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/hugetlb.h>
 #include "internal.h"
 
 static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
 {
 	unsigned long pfn = val;
 	struct page *p;
+	struct page *hpage;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
 		return -ENXIO;
 
 	p = pfn_to_page(pfn);
+	hpage = compound_head(p);
 	/*
 	 * This implies unable to support free buddy pages.
 	 */
-	if (!get_page_unless_zero(p))
+	if (!get_page_unless_zero(hpage))
 		return 0;
 
-	if (!PageLRU(p))
+	if (!PageLRU(p) && !PageHuge(p))
 		shake_page(p, 0);
 	/*
 	 * This implies unable to support non-LRU pages.
 	 */
-	if (!PageLRU(p))
+	if (!PageLRU(p) && !PageHuge(p))
 		return 0;
 
 	/*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
 	 * We temporarily take page lock for try_get_mem_cgroup_from_page().
 	 * __memory_failure() will redo the check reliably inside page lock.
 	 */
-	lock_page(p);
-	err = hwpoison_filter(p);
-	unlock_page(p);
+	lock_page(hpage);
+	err = hwpoison_filter(hpage);
+	unlock_page(hpage);
 	if (err)
 		return 0;
 
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da9668..1d29cdfe8ebb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
 
 #include <asm/atomic.h>
 #include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
 
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
 	.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.cpu_vm_mask	= CPU_MASK_ALL,
+	INIT_MM_CONTEXT(init_mm)
 };
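[Editor's note] The init-mm.c hunk introduces an overridable, default-empty INIT_MM_CONTEXT() macro so that an architecture can append extra initializers to init_mm without touching the generic file. A small standalone sketch of the same pattern follows; every name in it is made up for the example.

/* Illustrative sketch only: the override-able empty-macro pattern used
 * above for INIT_MM_CONTEXT(). An "architecture" that needs extra fields
 * would define INIT_CTX(name) before this point, e.g.
 *   #define INIT_CTX(name) .ctx_id = 42,
 */
#include <stdio.h>

#ifndef INIT_CTX
#define INIT_CTX(name)		/* default: expands to nothing */
#endif

struct mm_like {
	int users;
	int ctx_id;		/* only set when INIT_CTX is overridden */
};

static struct mm_like init_mm_like = {
	.users = 1,
	INIT_CTX(init_mm_like)	/* empty by default, so this is harmless */
};

int main(void)
{
	printf("users=%d ctx_id=%d\n", init_mm_like.users, init_mm_like.ctx_id);
	return 0;
}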
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2c0d032ac898..bd9bc214091b 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -211,6 +211,9 @@ static signed long jiffies_scan_wait;
 static int kmemleak_stack_scan = 1;
 /* protects the memory scanning, parameters and debug/kmemleak file access */
 static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+
 
 /*
  * Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -398,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
 		object = prio_tree_entry(node, struct kmemleak_object,
 					 tree_node);
 		if (!alias && object->pointer != ptr) {
-			kmemleak_warn("Found object by alias");
+			pr_warning("Found object by alias at 0x%08lx\n", ptr);
+			dump_stack();
+			dump_object_info(object);
 			object = NULL;
 		}
 	} else
@@ -695,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color)
 }
 
 /*
- * Make a object permanently as gray-colored so that it can no longer be
+ * Mark an object permanently as gray-colored so that it can no longer be
  * reported as a leak. This is used in general to mark a false positive.
  */
 static void make_gray_object(unsigned long ptr)
@@ -838,10 +843,19 @@ out:
 	rcu_read_unlock();
 }
 
-/*
- * Memory allocation function callback. This function is called from the
- * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
- * vmalloc etc.).
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ *	scanning a number of references less than @min_count is found,
+ *	the object is reported as a memory leak. If @min_count is 0,
+ *	the object is never reported as a leak. If @min_count is -1,
+ *	the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
  */
 void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
 			  gfp_t gfp)
@@ -855,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
 }
 EXPORT_SYMBOL_GPL(kmemleak_alloc);
 
-/*
- * Memory freeing function callback. This function is called from the kernel
- * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
  */
 void __ref kmemleak_free(const void *ptr)
 {
@@ -870,9 +887,14 @@ void __ref kmemleak_free(const void *ptr)
 }
 EXPORT_SYMBOL_GPL(kmemleak_free);
 
-/*
- * Partial memory freeing function callback. This function is usually called
- * from bootmem allocator when (part of) a memory block is freed.
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ *	represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
  */
 void __ref kmemleak_free_part(const void *ptr, size_t size)
 {
@@ -885,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
 }
 EXPORT_SYMBOL_GPL(kmemleak_free_part);
 
-/*
- * Mark an already allocated memory block as a false positive. This will cause
- * the block to no longer be reported as leak and always be scanned.
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
  */
 void __ref kmemleak_not_leak(const void *ptr)
 {
@@ -900,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr)
 }
 EXPORT_SYMBOL(kmemleak_not_leak);
 
-/*
- * Ignore a memory block. This is usually done when it is known that the
- * corresponding block is not a leak and does not contain any references to
- * other allocated memory blocks.
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
  */
 void __ref kmemleak_ignore(const void *ptr)
 {
@@ -916,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr)
 }
 EXPORT_SYMBOL(kmemleak_ignore);
 
-/*
- * Limit the range to be scanned in an allocated memory block.
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ *	represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number false negatives.
  */
 void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
 {
@@ -930,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
 }
 EXPORT_SYMBOL(kmemleak_scan_area);
 
-/*
- * Inform kmemleak not to scan the given memory block.
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
  */
 void __ref kmemleak_no_scan(const void *ptr)
 {
@@ -1602,7 +1645,9 @@ static int kmemleak_boot_config(char *str)
 		return -EINVAL;
 	if (strcmp(str, "off") == 0)
 		kmemleak_disable();
-	else if (strcmp(str, "on") != 0)
+	else if (strcmp(str, "on") == 0)
+		kmemleak_skip_disable = 1;
+	else
 		return -EINVAL;
 	return 0;
 }
@@ -1616,6 +1661,13 @@ void __init kmemleak_init(void)
 	int i;
 	unsigned long flags;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+	if (!kmemleak_skip_disable) {
+		kmemleak_disable();
+		return;
+	}
+#endif
+
 	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
 	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
 
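[Editor's note] Taken together, the kmemleak hunks mean that with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF the checker now disables itself at init unless kmemleak=on was passed on the command line, while kmemleak=off still disables it unconditionally. A rough userspace model of that decision follows; the helper names are invented for the example.

/* Illustrative sketch only: models how the "kmemleak=" boot option and
 * CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF interact after this patch. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool explicitly_off;	/* kmemleak=off */
static bool skip_disable;	/* kmemleak=on (kmemleak_skip_disable) */

static int parse_boot_option(const char *str)
{
	if (!str)
		return -1;
	if (strcmp(str, "off") == 0)
		explicitly_off = true;
	else if (strcmp(str, "on") == 0)
		skip_disable = true;
	else
		return -1;
	return 0;
}

/* Returns true if scanning stays enabled, mimicking kmemleak_init(). */
static bool kmemleak_enabled(bool default_off)
{
	if (explicitly_off)
		return false;
	if (default_off && !skip_disable)
		return false;	/* DEFAULT_OFF kernels need kmemleak=on */
	return true;
}

int main(void)
{
	parse_boot_option("on");
	printf("enabled: %d\n", kmemleak_enabled(true));
	return 0;
}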
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7c..e2ae00458320 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include "internal.h" 39#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
153static struct rb_root root_stable_tree = RB_ROOT; 154static struct rb_root root_stable_tree = RB_ROOT;
154static struct rb_root root_unstable_tree = RB_ROOT; 155static struct rb_root root_unstable_tree = RB_ROOT;
155 156
156#define MM_SLOTS_HASH_HEADS 1024 157#define MM_SLOTS_HASH_SHIFT 10
157static struct hlist_head *mm_slots_hash; 158#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
159static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
158 160
159static struct mm_slot ksm_mm_head = { 161static struct mm_slot ksm_mm_head = {
160 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 162 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
269 kmem_cache_free(mm_slot_cache, mm_slot); 271 kmem_cache_free(mm_slot_cache, mm_slot);
270} 272}
271 273
272static int __init mm_slots_hash_init(void)
273{
274 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
275 GFP_KERNEL);
276 if (!mm_slots_hash)
277 return -ENOMEM;
278 return 0;
279}
280
281static void __init mm_slots_hash_free(void)
282{
283 kfree(mm_slots_hash);
284}
285
286static struct mm_slot *get_mm_slot(struct mm_struct *mm) 274static struct mm_slot *get_mm_slot(struct mm_struct *mm)
287{ 275{
288 struct mm_slot *mm_slot; 276 struct mm_slot *mm_slot;
289 struct hlist_head *bucket; 277 struct hlist_head *bucket;
290 struct hlist_node *node; 278 struct hlist_node *node;
291 279
292 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 280 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
293 % MM_SLOTS_HASH_HEADS];
294 hlist_for_each_entry(mm_slot, node, bucket, link) { 281 hlist_for_each_entry(mm_slot, node, bucket, link) {
295 if (mm == mm_slot->mm) 282 if (mm == mm_slot->mm)
296 return mm_slot; 283 return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
303{ 290{
304 struct hlist_head *bucket; 291 struct hlist_head *bucket;
305 292
306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 293 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
307 % MM_SLOTS_HASH_HEADS];
308 mm_slot->mm = mm; 294 mm_slot->mm = mm;
309 hlist_add_head(&mm_slot->link, bucket); 295 hlist_add_head(&mm_slot->link, bucket);
310} 296}
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma) 304 struct anon_vma *anon_vma)
319{ 305{
320 rmap_item->anon_vma = anon_vma; 306 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->external_refcount); 307 get_anon_vma(anon_vma);
322} 308}
323 309
324static void drop_anon_vma(struct rmap_item *rmap_item) 310static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
325{ 311{
326 struct anon_vma *anon_vma = rmap_item->anon_vma; 312 struct anon_vma *anon_vma = rmap_item->anon_vma;
327 313
328 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 314 drop_anon_vma(anon_vma);
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334} 315}
335 316
336/* 317/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
415 * It is not an accident that whenever we want to break COW 396 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma. 397 * to undo, we also need to drop a reference to the anon_vma.
417 */ 398 */
418 drop_anon_vma(rmap_item); 399 ksm_drop_anon_vma(rmap_item);
419 400
420 down_read(&mm->mmap_sem); 401 down_read(&mm->mmap_sem);
421 if (ksm_test_exit(mm)) 402 if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
470 ksm_pages_sharing--; 451 ksm_pages_sharing--;
471 else 452 else
472 ksm_pages_shared--; 453 ksm_pages_shared--;
473 drop_anon_vma(rmap_item); 454 ksm_drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK; 455 rmap_item->address &= PAGE_MASK;
475 cond_resched(); 456 cond_resched();
476 } 457 }
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
558 else 539 else
559 ksm_pages_shared--; 540 ksm_pages_shared--;
560 541
561 drop_anon_vma(rmap_item); 542 ksm_drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK; 543 rmap_item->address &= PAGE_MASK;
563 544
564 } else if (rmap_item->address & UNSTABLE_FLAG) { 545 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -1566,7 +1547,7 @@ again:
1566 struct anon_vma_chain *vmac; 1547 struct anon_vma_chain *vmac;
1567 struct vm_area_struct *vma; 1548 struct vm_area_struct *vma;
1568 1549
1569 spin_lock(&anon_vma->lock); 1550 anon_vma_lock(anon_vma);
1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1551 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma; 1552 vma = vmac->vma;
1572 if (rmap_item->address < vma->vm_start || 1553 if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1570,7 @@ again:
1589 if (!search_new_forks || !mapcount) 1570 if (!search_new_forks || !mapcount)
1590 break; 1571 break;
1591 } 1572 }
1592 spin_unlock(&anon_vma->lock); 1573 anon_vma_unlock(anon_vma);
1593 if (!mapcount) 1574 if (!mapcount)
1594 goto out; 1575 goto out;
1595 } 1576 }
@@ -1619,7 +1600,7 @@ again:
1619 struct anon_vma_chain *vmac; 1600 struct anon_vma_chain *vmac;
1620 struct vm_area_struct *vma; 1601 struct vm_area_struct *vma;
1621 1602
1622 spin_lock(&anon_vma->lock); 1603 anon_vma_lock(anon_vma);
1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1604 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma; 1605 vma = vmac->vma;
1625 if (rmap_item->address < vma->vm_start || 1606 if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1618,11 @@ again:
1637 ret = try_to_unmap_one(page, vma, 1618 ret = try_to_unmap_one(page, vma,
1638 rmap_item->address, flags); 1619 rmap_item->address, flags);
1639 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1620 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1640 spin_unlock(&anon_vma->lock); 1621 anon_vma_unlock(anon_vma);
1641 goto out; 1622 goto out;
1642 } 1623 }
1643 } 1624 }
1644 spin_unlock(&anon_vma->lock); 1625 anon_vma_unlock(anon_vma);
1645 } 1626 }
1646 if (!search_new_forks++) 1627 if (!search_new_forks++)
1647 goto again; 1628 goto again;
@@ -1671,7 +1652,7 @@ again:
1671 struct anon_vma_chain *vmac; 1652 struct anon_vma_chain *vmac;
1672 struct vm_area_struct *vma; 1653 struct vm_area_struct *vma;
1673 1654
1674 spin_lock(&anon_vma->lock); 1655 anon_vma_lock(anon_vma);
1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1656 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma; 1657 vma = vmac->vma;
1677 if (rmap_item->address < vma->vm_start || 1658 if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1669,11 @@ again:
1688 1669
1689 ret = rmap_one(page, vma, rmap_item->address, arg); 1670 ret = rmap_one(page, vma, rmap_item->address, arg);
1690 if (ret != SWAP_AGAIN) { 1671 if (ret != SWAP_AGAIN) {
1691 spin_unlock(&anon_vma->lock); 1672 anon_vma_unlock(anon_vma);
1692 goto out; 1673 goto out;
1693 } 1674 }
1694 } 1675 }
1695 spin_unlock(&anon_vma->lock); 1676 anon_vma_unlock(anon_vma);
1696 } 1677 }
1697 if (!search_new_forks++) 1678 if (!search_new_forks++)
1698 goto again; 1679 goto again;
@@ -1943,15 +1924,11 @@ static int __init ksm_init(void)
1943 if (err) 1924 if (err)
1944 goto out; 1925 goto out;
1945 1926
1946 err = mm_slots_hash_init();
1947 if (err)
1948 goto out_free1;
1949
1950 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 1927 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1951 if (IS_ERR(ksm_thread)) { 1928 if (IS_ERR(ksm_thread)) {
1952 printk(KERN_ERR "ksm: creating kthread failed\n"); 1929 printk(KERN_ERR "ksm: creating kthread failed\n");
1953 err = PTR_ERR(ksm_thread); 1930 err = PTR_ERR(ksm_thread);
1954 goto out_free2; 1931 goto out_free;
1955 } 1932 }
1956 1933
1957#ifdef CONFIG_SYSFS 1934#ifdef CONFIG_SYSFS
@@ -1959,7 +1936,7 @@ static int __init ksm_init(void)
1959 if (err) { 1936 if (err) {
1960 printk(KERN_ERR "ksm: register sysfs failed\n"); 1937 printk(KERN_ERR "ksm: register sysfs failed\n");
1961 kthread_stop(ksm_thread); 1938 kthread_stop(ksm_thread);
1962 goto out_free2; 1939 goto out_free;
1963 } 1940 }
1964#else 1941#else
1965 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 1942 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1952,7 @@ static int __init ksm_init(void)
1975#endif 1952#endif
1976 return 0; 1953 return 0;
1977 1954
1978out_free2: 1955out_free:
1979 mm_slots_hash_free();
1980out_free1:
1981 ksm_slab_free(); 1956 ksm_slab_free();
1982out: 1957out:
1983 return err; 1958 return err;
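[Editor's note] The ksm.c change above drops the kzalloc'ed mm_slots_hash that was indexed by a pointer-modulo expression and replaces it with a statically sized table indexed by hash_ptr() from <linux/hash.h>. The sketch below shows why a multiplicative pointer hash into a fixed-size bucket array spreads pool-allocated objects better than a plain modulo; the constant and helper are illustrative and are not the kernel's hash_ptr().

/* Illustrative sketch only: fixed-size bucket selection via a
 * multiplicative pointer hash, in the spirit of the mm_slots_hash change. */
#include <stdint.h>
#include <stdio.h>

#define HASH_SHIFT 10
#define HASH_HEADS (1 << HASH_SHIFT)

/* Multiply by a large odd constant and keep the top bits, which mix far
 * better than "pointer % HASH_HEADS" for objects carved from one pool. */
static unsigned int hash_ptr_bits(const void *p, unsigned int bits)
{
	uint64_t v = (uint64_t)(uintptr_t)p * 0x9e3779b97f4a7c15ULL;

	return (unsigned int)(v >> (64 - bits));
}

int main(void)
{
	int objects[4];

	for (int i = 0; i < 4; i++)
		printf("obj %d -> bucket %u\n", i,
		       hash_ptr_bits(&objects[i], HASH_SHIFT));
	return 0;
}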
diff --git a/mm/memblock.c b/mm/memblock.c
index 3024eb30fc27..43840b305ecb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -504,7 +504,7 @@ int __init memblock_is_reserved(u64 addr)
 
 int memblock_is_region_reserved(u64 base, u64 size)
 {
-	return memblock_overlaps_region(&memblock.reserved, base, size);
+	return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
 
 /*
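[Editor's note] The one-line memblock fix matters because memblock_overlaps_region() returns the index of the first overlapping region, with a negative value meaning no overlap; returning it directly treats a hit in region 0 as "not reserved". A self-contained sketch of the same pitfall follows, using an invented helper rather than the real memblock API.

/* Illustrative sketch only: the find-index-vs-boolean pitfall fixed in
 * memblock_is_region_reserved() above. first_overlap() returns the index
 * of the first overlapping range, or -1 if none overlaps. */
#include <stdio.h>

struct range { unsigned long base, size; };

static long first_overlap(const struct range *r, int n,
			  unsigned long base, unsigned long size)
{
	for (int i = 0; i < n; i++)
		if (base < r[i].base + r[i].size && r[i].base < base + size)
			return i;	/* index 0 is a valid hit! */
	return -1;
}

static int is_reserved(const struct range *r, int n,
		       unsigned long base, unsigned long size)
{
	/* "return first_overlap(...)" would wrongly report 0 (false)
	 * when the overlap happens to be with the first range. */
	return first_overlap(r, n, base, size) >= 0;
}

int main(void)
{
	struct range reserved[] = { { 0, 4096 }, { 65536, 4096 } };

	printf("%d\n", is_reserved(reserved, 2, 0, 512));	/* prints 1 */
	return 0;
}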
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,10 +47,13 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
 
+#include <trace/events/vmscan.h>
+
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +214,6 @@ struct mem_cgroup {
 	 */
 	spinlock_t reclaim_param_lock;
 
-	int	prev_priority;	/* for recording reclaim priority */
-
 	/*
 	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
@@ -268,6 +269,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
+	spinlock_t	lock; /* for from, to, moving_task */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
 	struct task_struct *moving_task;	/* a task moving charges */
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
+	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
+	struct task_struct *p;
 
-	task_lock(task);
-	rcu_read_lock();
-	curr = try_get_mem_cgroup_from_mm(task->mm);
-	rcu_read_unlock();
-	task_unlock(task);
+	p = find_lock_task_mm(task);
+	if (!p)
+		return 0;
+	curr = try_get_mem_cgroup_from_mm(p->mm);
+	task_unlock(p);
 	if (!curr)
 		return 0;
 	/*
@@ -858,35 +862,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	return ret;
 }
 
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
-	int prev_priority;
-
-	spin_lock(&mem->reclaim_param_lock);
-	prev_priority = mem->prev_priority;
-	spin_unlock(&mem->reclaim_param_lock);
-
-	return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	if (priority < mem->prev_priority)
-		mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
 {
 	unsigned long active;
@@ -944,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 					struct zone *zone,
 					enum lru_list lru)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
948 int zid = zone_idx(zone); 923 int zid = zone_idx(zone);
949 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 924 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
950 925
@@ -954,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
954struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 929struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
955 struct zone *zone) 930 struct zone *zone)
956{ 931{
957 int nid = zone->zone_pgdat->node_id; 932 int nid = zone_to_nid(zone);
958 int zid = zone_idx(zone); 933 int zid = zone_idx(zone);
959 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 934 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
960 935
@@ -999,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
999 LIST_HEAD(pc_list); 974 LIST_HEAD(pc_list);
1000 struct list_head *src; 975 struct list_head *src;
1001 struct page_cgroup *pc, *tmp; 976 struct page_cgroup *pc, *tmp;
1002 int nid = z->zone_pgdat->node_id; 977 int nid = zone_to_nid(z);
1003 int zid = zone_idx(z); 978 int zid = zone_idx(z);
1004 struct mem_cgroup_per_zone *mz; 979 struct mem_cgroup_per_zone *mz;
1005 int lru = LRU_FILE * file + active; 980 int lru = LRU_FILE * file + active;
@@ -1038,6 +1013,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1038 } 1013 }
1039 1014
1040 *scanned = scan; 1015 *scanned = scan;
1016
1017 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1018 0, 0, 0, mode);
1019
1041 return nr_taken; 1020 return nr_taken;
1042} 1021}
1043 1022
@@ -1072,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1072 return swappiness; 1051 return swappiness;
1073} 1052}
1074 1053
1054/* A routine for testing mem is not under move_account */
1055
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{
1058 struct mem_cgroup *from;
1059 struct mem_cgroup *to;
1060 bool ret = false;
1061 /*
1062 * Unlike task_move routines, we access mc.to, mc.from not under
1063 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1064 */
1065 spin_lock(&mc.lock);
1066 from = mc.from;
1067 to = mc.to;
1068 if (!from)
1069 goto unlock;
1070 if (from == mem || to == mem
1071 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1072 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1073 ret = true;
1074unlock:
1075 spin_unlock(&mc.lock);
1076 return ret;
1077}
1078
1079static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1080{
1081 if (mc.moving_task && current != mc.moving_task) {
1082 if (mem_cgroup_under_move(mem)) {
1083 DEFINE_WAIT(wait);
1084 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1085 /* moving charge context might have finished. */
1086 if (mc.moving_task)
1087 schedule();
1088 finish_wait(&mc.waitq, &wait);
1089 return true;
1090 }
1091 }
1092 return false;
1093}
1094
1075static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1076{ 1096{
1077 int *val = data; 1097 int *val = data;
@@ -1158,6 +1178,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
1158} 1178}
1159 1179
1160/* 1180/*
1181 * Return the memory (and swap, if configured) limit for a memcg.
1182 */
1183u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1184{
1185 u64 limit;
1186 u64 memsw;
1187
1188 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1189 total_swap_pages;
1190 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1191 /*
1192 * If memsw is finite and limits the amount of swap space available
1193 * to this memcg, return that limit.
1194 */
1195 return min(limit, memsw);
1196}
1197
1198/*
1161 * Visit the first child (need not be the first child as per the ordering 1199 * Visit the first child (need not be the first child as per the ordering
1162 * of the cgroup list, since we track last_scanned_child) of @mem and use 1200 * of the cgroup list, since we track last_scanned_child) of @mem and use
1163 * that to reclaim free pages from. 1201 * that to reclaim free pages from.
@@ -1262,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1262 /* we use swappiness of local cgroup */ 1300 /* we use swappiness of local cgroup */
1263 if (check_soft) 1301 if (check_soft)
1264 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1302 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1265 noswap, get_swappiness(victim), zone, 1303 noswap, get_swappiness(victim), zone);
1266 zone->zone_pgdat->node_id);
1267 else 1304 else
1268 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1305 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1269 noswap, get_swappiness(victim)); 1306 noswap, get_swappiness(victim));
@@ -1370,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1370 1407
1371static void memcg_oom_recover(struct mem_cgroup *mem) 1408static void memcg_oom_recover(struct mem_cgroup *mem)
1372{ 1409{
1373 if (atomic_read(&mem->oom_lock)) 1410 if (mem && atomic_read(&mem->oom_lock))
1374 memcg_wakeup_oom(mem); 1411 memcg_wakeup_oom(mem);
1375} 1412}
1376 1413
@@ -1582,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1582 return NOTIFY_OK; 1619 return NOTIFY_OK;
1583} 1620}
1584 1621
1622
1623/* See __mem_cgroup_try_charge() for details */
1624enum {
1625 CHARGE_OK, /* success */
1626 CHARGE_RETRY, /* need to retry but retry is not bad */
1627 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
1628 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
1629 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1630};
1631
1632static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1633 int csize, bool oom_check)
1634{
1635 struct mem_cgroup *mem_over_limit;
1636 struct res_counter *fail_res;
1637 unsigned long flags = 0;
1638 int ret;
1639
1640 ret = res_counter_charge(&mem->res, csize, &fail_res);
1641
1642 if (likely(!ret)) {
1643 if (!do_swap_account)
1644 return CHARGE_OK;
1645 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1646 if (likely(!ret))
1647 return CHARGE_OK;
1648
1649 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1650 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1651 } else
1652 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1653
1654 if (csize > PAGE_SIZE) /* change csize and retry */
1655 return CHARGE_RETRY;
1656
1657 if (!(gfp_mask & __GFP_WAIT))
1658 return CHARGE_WOULDBLOCK;
1659
1660 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1661 gfp_mask, flags);
1662 /*
1663 * try_to_free_mem_cgroup_pages() might not give us a full
1664 * picture of reclaim. Some pages are reclaimed and might be
1665 * moved to swap cache or just unmapped from the cgroup.
1666 * Check the limit again to see if the reclaim reduced the
1667 * current usage of the cgroup before giving up
1668 */
1669 if (ret || mem_cgroup_check_under_limit(mem_over_limit))
1670 return CHARGE_RETRY;
1671
1672 /*
1673 * At task move, charge accounts can be doubly counted. So, it's
1674 * better to wait until the end of task_move if something is going on.
1675 */
1676 if (mem_cgroup_wait_acct_move(mem_over_limit))
1677 return CHARGE_RETRY;
1678
 1679	/* If we don't need to call oom-killer at all, return immediately */
1680 if (!oom_check)
1681 return CHARGE_NOMEM;
1682 /* check OOM */
1683 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1684 return CHARGE_OOM_DIE;
1685
1686 return CHARGE_RETRY;
1687}
1688
1585/* 1689/*
1586 * Unlike exported interface, "oom" parameter is added. if oom==true, 1690 * Unlike exported interface, "oom" parameter is added. if oom==true,
1587 * oom-killer can be invoked. 1691 * oom-killer can be invoked.
1588 */ 1692 */
1589static int __mem_cgroup_try_charge(struct mm_struct *mm, 1693static int __mem_cgroup_try_charge(struct mm_struct *mm,
1590 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1694 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1591{ 1695{
1592 struct mem_cgroup *mem, *mem_over_limit; 1696 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1593 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1697 struct mem_cgroup *mem = NULL;
1594 struct res_counter *fail_res; 1698 int ret;
1595 int csize = CHARGE_SIZE; 1699 int csize = CHARGE_SIZE;
1596 1700
1597 /* 1701 /*
@@ -1609,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1609 * thread group leader migrates. It's possible that mm is not 1713 * thread group leader migrates. It's possible that mm is not
1610 * set, if so charge the init_mm (happens for pagecache usage). 1714 * set, if so charge the init_mm (happens for pagecache usage).
1611 */ 1715 */
1612 mem = *memcg; 1716 if (!*memcg && !mm)
1613 if (likely(!mem)) { 1717 goto bypass;
1614 mem = try_get_mem_cgroup_from_mm(mm); 1718again:
1615 *memcg = mem; 1719 if (*memcg) { /* css should be a valid one */
1616 } else { 1720 mem = *memcg;
1617 css_get(&mem->css); 1721 VM_BUG_ON(css_is_removed(&mem->css));
1618 } 1722 if (mem_cgroup_is_root(mem))
1619 if (unlikely(!mem)) 1723 goto done;
1620 return 0;
1621
1622 VM_BUG_ON(css_is_removed(&mem->css));
1623 if (mem_cgroup_is_root(mem))
1624 goto done;
1625
1626 while (1) {
1627 int ret = 0;
1628 unsigned long flags = 0;
1629
1630 if (consume_stock(mem)) 1724 if (consume_stock(mem))
1631 goto done; 1725 goto done;
1726 css_get(&mem->css);
1727 } else {
1728 struct task_struct *p;
1632 1729
1633 ret = res_counter_charge(&mem->res, csize, &fail_res); 1730 rcu_read_lock();
1634 if (likely(!ret)) { 1731 p = rcu_dereference(mm->owner);
1635 if (!do_swap_account) 1732 VM_BUG_ON(!p);
1636 break;
1637 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1638 if (likely(!ret))
1639 break;
1640 /* mem+swap counter fails */
1641 res_counter_uncharge(&mem->res, csize);
1642 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1643 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1644 memsw);
1645 } else
1646 /* mem counter fails */
1647 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1648 res);
1649
1650 /* reduce request size and retry */
1651 if (csize > PAGE_SIZE) {
1652 csize = PAGE_SIZE;
1653 continue;
1654 }
1655 if (!(gfp_mask & __GFP_WAIT))
1656 goto nomem;
1657
1658 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1659 gfp_mask, flags);
1660 if (ret)
1661 continue;
1662
1663 /* 1733 /*
1664 * try_to_free_mem_cgroup_pages() might not give us a full 1734 * because we don't have task_lock(), "p" can exit while
1665 * picture of reclaim. Some pages are reclaimed and might be 1735 * we're here. In that case, "mem" can point to root
1666 * moved to swap cache or just unmapped from the cgroup. 1736 * cgroup but never be NULL. (and task_struct itself is freed
1667 * Check the limit again to see if the reclaim reduced the 1737 * by RCU, cgroup itself is RCU safe.) Then, we have small
1668 * current usage of the cgroup before giving up 1738 * risk here to get wrong cgroup. But such kind of mis-account
1669 * 1739 * by race always happens because we don't have cgroup_mutex().
1740 * It's overkill and we allow that small race, here.
1670 */ 1741 */
1671 if (mem_cgroup_check_under_limit(mem_over_limit)) 1742 mem = mem_cgroup_from_task(p);
1672 continue; 1743 VM_BUG_ON(!mem);
1673 1744 if (mem_cgroup_is_root(mem)) {
1674 /* try to avoid oom while someone is moving charge */ 1745 rcu_read_unlock();
1675 if (mc.moving_task && current != mc.moving_task) { 1746 goto done;
1676 struct mem_cgroup *from, *to; 1747 }
1677 bool do_continue = false; 1748 if (consume_stock(mem)) {
1678 /* 1749 /*
1679 * There is a small race that "from" or "to" can be 1750 * It seems dagerous to access memcg without css_get().
1680 * freed by rmdir, so we use css_tryget(). 1751 * But considering how consume_stok works, it's not
 1752	 * necessary. If consume_stock succeeds, some charges
1753 * from this memcg are cached on this cpu. So, we
1754 * don't need to call css_get()/css_tryget() before
1755 * calling consume_stock().
1681 */ 1756 */
1682 from = mc.from; 1757 rcu_read_unlock();
1683 to = mc.to; 1758 goto done;
1684 if (from && css_tryget(&from->css)) { 1759 }
1685 if (mem_over_limit->use_hierarchy) 1760 /* after here, we may be blocked. we need to get refcnt */
1686 do_continue = css_is_ancestor( 1761 if (!css_tryget(&mem->css)) {
1687 &from->css, 1762 rcu_read_unlock();
1688 &mem_over_limit->css); 1763 goto again;
1689 else 1764 }
1690 do_continue = (from == mem_over_limit); 1765 rcu_read_unlock();
1691 css_put(&from->css); 1766 }
1692 } 1767
1693 if (!do_continue && to && css_tryget(&to->css)) { 1768 do {
1694 if (mem_over_limit->use_hierarchy) 1769 bool oom_check;
1695 do_continue = css_is_ancestor( 1770
1696 &to->css, 1771 /* If killed, bypass charge */
1697 &mem_over_limit->css); 1772 if (fatal_signal_pending(current)) {
1698 else 1773 css_put(&mem->css);
1699 do_continue = (to == mem_over_limit); 1774 goto bypass;
1700 css_put(&to->css); 1775 }
1701 } 1776
1702 if (do_continue) { 1777 oom_check = false;
1703 DEFINE_WAIT(wait); 1778 if (oom && !nr_oom_retries) {
1704 prepare_to_wait(&mc.waitq, &wait, 1779 oom_check = true;
1705 TASK_INTERRUPTIBLE); 1780 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1706 /* moving charge context might have finished. */
1707 if (mc.moving_task)
1708 schedule();
1709 finish_wait(&mc.waitq, &wait);
1710 continue;
1711 }
1712 } 1781 }
1713 1782
1714 if (!nr_retries--) { 1783 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
1715 if (!oom) 1784
1785 switch (ret) {
1786 case CHARGE_OK:
1787 break;
1788 case CHARGE_RETRY: /* not in OOM situation but retry */
1789 csize = PAGE_SIZE;
1790 css_put(&mem->css);
1791 mem = NULL;
1792 goto again;
1793 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
1794 css_put(&mem->css);
1795 goto nomem;
1796 case CHARGE_NOMEM: /* OOM routine works */
1797 if (!oom) {
1798 css_put(&mem->css);
1716 goto nomem; 1799 goto nomem;
1717 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1718 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1719 continue;
1720 } 1800 }
1721 /* When we reach here, current task is dying .*/ 1801 /* If oom, we never return -ENOMEM */
1802 nr_oom_retries--;
1803 break;
1804 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
1722 css_put(&mem->css); 1805 css_put(&mem->css);
1723 goto bypass; 1806 goto bypass;
1724 } 1807 }
1725 } 1808 } while (ret != CHARGE_OK);
1809
1726 if (csize > PAGE_SIZE) 1810 if (csize > PAGE_SIZE)
1727 refill_stock(mem, csize - PAGE_SIZE); 1811 refill_stock(mem, csize - PAGE_SIZE);
1812 css_put(&mem->css);
1728done: 1813done:
1814 *memcg = mem;
1729 return 0; 1815 return 0;
1730nomem: 1816nomem:
1731 css_put(&mem->css); 1817 *memcg = NULL;
1732 return -ENOMEM; 1818 return -ENOMEM;
1733bypass: 1819bypass:
1734 *memcg = NULL; 1820 *memcg = NULL;
@@ -1747,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1747 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 1833 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1748 if (do_swap_account) 1834 if (do_swap_account)
1749 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 1835 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1750 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1751 WARN_ON_ONCE(count > INT_MAX);
1752 __css_put(&mem->css, (int)count);
1753 } 1836 }
1754 /* we don't need css_put for root */
1755} 1837}
1756 1838
1757static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 1839static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1979,10 +2061,9 @@ out:
1979 * < 0 if the cgroup is over its limit 2061 * < 0 if the cgroup is over its limit
1980 */ 2062 */
1981static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2063static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1982 gfp_t gfp_mask, enum charge_type ctype, 2064 gfp_t gfp_mask, enum charge_type ctype)
1983 struct mem_cgroup *memcg)
1984{ 2065{
1985 struct mem_cgroup *mem; 2066 struct mem_cgroup *mem = NULL;
1986 struct page_cgroup *pc; 2067 struct page_cgroup *pc;
1987 int ret; 2068 int ret;
1988 2069
@@ -1992,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1992 return 0; 2073 return 0;
1993 prefetchw(pc); 2074 prefetchw(pc);
1994 2075
1995 mem = memcg;
1996 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2076 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1997 if (ret || !mem) 2077 if (ret || !mem)
1998 return ret; 2078 return ret;
@@ -2020,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2020 if (unlikely(!mm)) 2100 if (unlikely(!mm))
2021 mm = &init_mm; 2101 mm = &init_mm;
2022 return mem_cgroup_charge_common(page, mm, gfp_mask, 2102 return mem_cgroup_charge_common(page, mm, gfp_mask,
2023 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 2103 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2024} 2104}
2025 2105
2026static void 2106static void
@@ -2030,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2030int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2110int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2031 gfp_t gfp_mask) 2111 gfp_t gfp_mask)
2032{ 2112{
2033 struct mem_cgroup *mem = NULL;
2034 int ret; 2113 int ret;
2035 2114
2036 if (mem_cgroup_disabled()) 2115 if (mem_cgroup_disabled())
@@ -2051,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2051 if (!(gfp_mask & __GFP_WAIT)) { 2130 if (!(gfp_mask & __GFP_WAIT)) {
2052 struct page_cgroup *pc; 2131 struct page_cgroup *pc;
2053 2132
2054
2055 pc = lookup_page_cgroup(page); 2133 pc = lookup_page_cgroup(page);
2056 if (!pc) 2134 if (!pc)
2057 return 0; 2135 return 0;
@@ -2063,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2063 unlock_page_cgroup(pc); 2141 unlock_page_cgroup(pc);
2064 } 2142 }
2065 2143
2066 if (unlikely(!mm && !mem)) 2144 if (unlikely(!mm))
2067 mm = &init_mm; 2145 mm = &init_mm;
2068 2146
2069 if (page_is_file_cache(page)) 2147 if (page_is_file_cache(page))
2070 return mem_cgroup_charge_common(page, mm, gfp_mask, 2148 return mem_cgroup_charge_common(page, mm, gfp_mask,
2071 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 2149 MEM_CGROUP_CHARGE_TYPE_CACHE);
2072 2150
2073 /* shmem */ 2151 /* shmem */
2074 if (PageSwapCache(page)) { 2152 if (PageSwapCache(page)) {
2153 struct mem_cgroup *mem = NULL;
2154
2075 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2155 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2076 if (!ret) 2156 if (!ret)
2077 __mem_cgroup_commit_charge_swapin(page, mem, 2157 __mem_cgroup_commit_charge_swapin(page, mem,
2078 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2158 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2079 } else 2159 } else
2080 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2160 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2081 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 2161 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2082 2162
2083 return ret; 2163 return ret;
2084} 2164}
@@ -2114,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2114 goto charge_cur_mm; 2194 goto charge_cur_mm;
2115 *ptr = mem; 2195 *ptr = mem;
2116 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2196 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2117 /* drop extra refcnt from tryget */
2118 css_put(&mem->css); 2197 css_put(&mem->css);
2119 return ret; 2198 return ret;
2120charge_cur_mm: 2199charge_cur_mm:
@@ -2245,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2245{ 2324{
2246 struct page_cgroup *pc; 2325 struct page_cgroup *pc;
2247 struct mem_cgroup *mem = NULL; 2326 struct mem_cgroup *mem = NULL;
2248 struct mem_cgroup_per_zone *mz;
2249 2327
2250 if (mem_cgroup_disabled()) 2328 if (mem_cgroup_disabled())
2251 return NULL; 2329 return NULL;
@@ -2285,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2285 break; 2363 break;
2286 } 2364 }
2287 2365
2288 if (!mem_cgroup_is_root(mem))
2289 __do_uncharge(mem, ctype);
2290 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2291 mem_cgroup_swap_statistics(mem, true);
2292 mem_cgroup_charge_statistics(mem, pc, false); 2366 mem_cgroup_charge_statistics(mem, pc, false);
2293 2367
2294 ClearPageCgroupUsed(pc); 2368 ClearPageCgroupUsed(pc);
@@ -2299,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2299 * special functions. 2373 * special functions.
2300 */ 2374 */
2301 2375
2302 mz = page_cgroup_zoneinfo(pc);
2303 unlock_page_cgroup(pc); 2376 unlock_page_cgroup(pc);
2304 2377 /*
2378 * even after unlock, we have mem->res.usage here and this memcg
2379 * will never be freed.
2380 */
2305 memcg_check_events(mem, page); 2381 memcg_check_events(mem, page);
2306 /* at swapout, this memcg will be accessed to record to swap */ 2382 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2307 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2383 mem_cgroup_swap_statistics(mem, true);
2308 css_put(&mem->css); 2384 mem_cgroup_get(mem);
2385 }
2386 if (!mem_cgroup_is_root(mem))
2387 __do_uncharge(mem, ctype);
2309 2388
2310 return mem; 2389 return mem;
2311 2390
@@ -2392,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2392 2471
2393 memcg = __mem_cgroup_uncharge_common(page, ctype); 2472 memcg = __mem_cgroup_uncharge_common(page, ctype);
2394 2473
2395 /* record memcg information */ 2474 /*
2396 if (do_swap_account && swapout && memcg) { 2475 * record memcg information, if swapout && memcg != NULL,
2476 * mem_cgroup_get() was called in uncharge().
2477 */
2478 if (do_swap_account && swapout && memcg)
2397 swap_cgroup_record(ent, css_id(&memcg->css)); 2479 swap_cgroup_record(ent, css_id(&memcg->css));
2398 mem_cgroup_get(memcg);
2399 }
2400 if (swapout && memcg)
2401 css_put(&memcg->css);
2402} 2480}
2403#endif 2481#endif
2404 2482
@@ -2476,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
2476 */ 2554 */
2477 if (!mem_cgroup_is_root(to)) 2555 if (!mem_cgroup_is_root(to))
2478 res_counter_uncharge(&to->res, PAGE_SIZE); 2556 res_counter_uncharge(&to->res, PAGE_SIZE);
2479 css_put(&to->css);
2480 } 2557 }
2481 return 0; 2558 return 0;
2482 } 2559 }
@@ -2611,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2611 ClearPageCgroupMigration(pc); 2688 ClearPageCgroupMigration(pc);
2612 unlock_page_cgroup(pc); 2689 unlock_page_cgroup(pc);
2613 2690
2614 if (unused != oldpage)
2615 pc = lookup_page_cgroup(unused);
2616 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 2691 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2617 2692
2618 pc = lookup_page_cgroup(used);
2619 /* 2693 /*
2620 * If a page is a file cache, radix-tree replacement is very atomic 2694 * If a page is a file cache, radix-tree replacement is very atomic
2621 * and we can skip this check. When it was an Anon page, its mapcount 2695 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2791,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2791} 2865}
2792 2866
2793unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2867unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2794 gfp_t gfp_mask, int nid, 2868 gfp_t gfp_mask)
2795 int zid)
2796{ 2869{
2797 unsigned long nr_reclaimed = 0; 2870 unsigned long nr_reclaimed = 0;
2798 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2871 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2804,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2804 if (order > 0) 2877 if (order > 0)
2805 return 0; 2878 return 0;
2806 2879
2807 mctz = soft_limit_tree_node_zone(nid, zid); 2880 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2808 /* 2881 /*
2809 * This loop can run a while, specially if mem_cgroup's continuously 2882 * This loop can run a while, specially if mem_cgroup's continuously
2810 * keep exceeding their soft limit and putting the system under 2883 * keep exceeding their soft limit and putting the system under
@@ -3759,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3759 return 0; 3832 return 0;
3760} 3833}
3761 3834
3762/*
3763 */
3764static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 3835static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3765 struct cftype *cft, u64 val) 3836 struct cftype *cft, u64 val)
3766{ 3837{
@@ -4180,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
4180 goto one_by_one; 4251 goto one_by_one;
4181 } 4252 }
4182 mc.precharge += count; 4253 mc.precharge += count;
4183 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4184 WARN_ON_ONCE(count > INT_MAX);
4185 __css_get(&mem->css, (int)count);
4186 return ret; 4254 return ret;
4187 } 4255 }
4188one_by_one: 4256one_by_one:
@@ -4400,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4400 4468
4401static void mem_cgroup_clear_mc(void) 4469static void mem_cgroup_clear_mc(void)
4402{ 4470{
4471 struct mem_cgroup *from = mc.from;
4472 struct mem_cgroup *to = mc.to;
4473
4403 /* we must uncharge all the leftover precharges from mc.to */ 4474 /* we must uncharge all the leftover precharges from mc.to */
4404 if (mc.precharge) { 4475 if (mc.precharge) {
4405 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4476 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4406 mc.precharge = 0; 4477 mc.precharge = 0;
4407 memcg_oom_recover(mc.to);
4408 } 4478 }
4409 /* 4479 /*
4410 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4480 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4413,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
4413 if (mc.moved_charge) { 4483 if (mc.moved_charge) {
4414 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4484 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4415 mc.moved_charge = 0; 4485 mc.moved_charge = 0;
4416 memcg_oom_recover(mc.from);
4417 } 4486 }
4418 /* we must fixup refcnts and charges */ 4487 /* we must fixup refcnts and charges */
4419 if (mc.moved_swap) { 4488 if (mc.moved_swap) {
4420 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4421 /* uncharge swap account from the old cgroup */ 4489 /* uncharge swap account from the old cgroup */
4422 if (!mem_cgroup_is_root(mc.from)) 4490 if (!mem_cgroup_is_root(mc.from))
4423 res_counter_uncharge(&mc.from->memsw, 4491 res_counter_uncharge(&mc.from->memsw,
@@ -4431,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
4431 */ 4499 */
4432 res_counter_uncharge(&mc.to->res, 4500 res_counter_uncharge(&mc.to->res,
4433 PAGE_SIZE * mc.moved_swap); 4501 PAGE_SIZE * mc.moved_swap);
4434 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4435 __css_put(&mc.to->css, mc.moved_swap);
4436 } 4502 }
4437 /* we've already done mem_cgroup_get(mc.to) */ 4503 /* we've already done mem_cgroup_get(mc.to) */
4438 4504
4439 mc.moved_swap = 0; 4505 mc.moved_swap = 0;
4440 } 4506 }
4507 spin_lock(&mc.lock);
4441 mc.from = NULL; 4508 mc.from = NULL;
4442 mc.to = NULL; 4509 mc.to = NULL;
4443 mc.moving_task = NULL; 4510 mc.moving_task = NULL;
4511 spin_unlock(&mc.lock);
4512 memcg_oom_recover(from);
4513 memcg_oom_recover(to);
4444 wake_up_all(&mc.waitq); 4514 wake_up_all(&mc.waitq);
4445} 4515}
4446 4516
@@ -4469,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4469 VM_BUG_ON(mc.moved_charge); 4539 VM_BUG_ON(mc.moved_charge);
4470 VM_BUG_ON(mc.moved_swap); 4540 VM_BUG_ON(mc.moved_swap);
4471 VM_BUG_ON(mc.moving_task); 4541 VM_BUG_ON(mc.moving_task);
4542 spin_lock(&mc.lock);
4472 mc.from = from; 4543 mc.from = from;
4473 mc.to = mem; 4544 mc.to = mem;
4474 mc.precharge = 0; 4545 mc.precharge = 0;
4475 mc.moved_charge = 0; 4546 mc.moved_charge = 0;
4476 mc.moved_swap = 0; 4547 mc.moved_swap = 0;
4477 mc.moving_task = current; 4548 mc.moving_task = current;
4549 spin_unlock(&mc.lock);
4478 4550
4479 ret = mem_cgroup_precharge_mc(mm); 4551 ret = mem_cgroup_precharge_mc(mm);
4480 if (ret) 4552 if (ret)
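
The biggest memcontrol.c change above factors the old open-coded retry loop into __mem_cgroup_do_charge(), which reports one of the CHARGE_* results while the caller decides whether to retry, fail, or fall back to the OOM path. A compressed userspace sketch of that result-code plus driver-loop shape, with hypothetical names and none of the real memcg machinery:

#include <stdio.h>

enum charge_result {
	CHARGE_OK,          /* success */
	CHARGE_RETRY,       /* retry is reasonable */
	CHARGE_NOMEM,       /* give up with -ENOMEM */
	CHARGE_WOULDBLOCK,  /* caller may not sleep */
};

/* Hypothetical single charge attempt: succeed once some budget is freed. */
static enum charge_result do_charge(int *budget, int size, int can_sleep)
{
	if (*budget >= size) {
		*budget -= size;
		return CHARGE_OK;
	}
	if (!can_sleep)
		return CHARGE_WOULDBLOCK;
	*budget += 1;       /* pretend reclaim freed a little */
	return CHARGE_RETRY;
}

/* Driver loop: retries bounded, each failure mode handled in one place. */
static int try_charge(int *budget, int size, int can_sleep)
{
	int retries = 5;

	do {
		switch (do_charge(budget, size, can_sleep)) {
		case CHARGE_OK:
			return 0;
		case CHARGE_RETRY:
			if (retries-- > 0)
				continue;
			return -12; /* treated like -ENOMEM */
		case CHARGE_WOULDBLOCK:
		case CHARGE_NOMEM:
			return -12;
		}
	} while (1);
}

int main(void)
{
	int budget = 2;

	printf("charge: %d (budget left %d)\n",
	       try_charge(&budget, 4, 1), budget);
	return 0;
}

Keeping the per-attempt outcome as an enum and the retry policy in the caller is what lets the diff drop the deeply nested while(1) body without changing behaviour.
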
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52cacaa..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/swapops.h> 48#include <linux/swapops.h>
49#include <linux/hugetlb.h>
49#include "internal.h" 50#include "internal.h"
50 51
51int sysctl_memory_failure_early_kill __read_mostly = 0; 52int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
690/* 691/*
691 * Huge pages. Needs work. 692 * Huge pages. Needs work.
692 * Issues: 693 * Issues:
693 * No rmap support so we cannot find the original mapper. In theory could walk 694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
694 * all MMs and look for the mappings, but that would be non atomic and racy. 695 * To narrow down kill region to one page, we need to break up pmd.
695 * Need rmap for hugepages for this. Alternatively we could employ a heuristic, 696 * - To support soft-offlining for hugepage, we need to support hugepage
696 * like just walking the current process and hoping it has it mapped (that 697 * migration.
697 * should be usually true for the common "shared database cache" case)
698 * Should handle free huge pages and dequeue them too, but this needs to
699 * handle huge page accounting correctly.
700 */ 698 */
701static int me_huge_page(struct page *p, unsigned long pfn) 699static int me_huge_page(struct page *p, unsigned long pfn)
702{ 700{
703 return FAILED; 701 struct page *hpage = compound_head(p);
702 /*
703 * We can safely recover from error on free or reserved (i.e.
704 * not in-use) hugepage by dequeuing it from freelist.
705 * To check whether a hugepage is in-use or not, we can't use
706 * page->lru because it can be used in other hugepage operations,
707 * such as __unmap_hugepage_range() and gather_surplus_pages().
708 * So instead we use page_mapping() and PageAnon().
709 * We assume that this function is called with page lock held,
710 * so there is no race between isolation and mapping/unmapping.
711 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage);
714 return RECOVERED;
715 }
716 return DELAYED;
704} 717}
705 718
706/* 719/*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
838 int ret; 851 int ret;
839 int i; 852 int i;
840 int kill = 1; 853 int kill = 1;
854 struct page *hpage = compound_head(p);
841 855
842 if (PageReserved(p) || PageSlab(p)) 856 if (PageReserved(p) || PageSlab(p))
843 return SWAP_SUCCESS; 857 return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
846 * This check implies we don't kill processes if their pages 860 * This check implies we don't kill processes if their pages
847 * are in the swap cache early. Those are always late kills. 861 * are in the swap cache early. Those are always late kills.
848 */ 862 */
849 if (!page_mapped(p)) 863 if (!page_mapped(hpage))
850 return SWAP_SUCCESS; 864 return SWAP_SUCCESS;
851 865
852 if (PageCompound(p) || PageKsm(p)) 866 if (PageKsm(p))
853 return SWAP_FAIL; 867 return SWAP_FAIL;
854 868
855 if (PageSwapCache(p)) { 869 if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
864 * XXX: the dirty test could be racy: set_page_dirty() may not always 878 * XXX: the dirty test could be racy: set_page_dirty() may not always
865 * be called inside page lock (it's recommended but not enforced). 879 * be called inside page lock (it's recommended but not enforced).
866 */ 880 */
867 mapping = page_mapping(p); 881 mapping = page_mapping(hpage);
868 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 882 if (!PageDirty(hpage) && mapping &&
869 if (page_mkclean(p)) { 883 mapping_cap_writeback_dirty(mapping)) {
870 SetPageDirty(p); 884 if (page_mkclean(hpage)) {
885 SetPageDirty(hpage);
871 } else { 886 } else {
872 kill = 0; 887 kill = 0;
873 ttu |= TTU_IGNORE_HWPOISON; 888 ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
886 * there's nothing that can be done. 901 * there's nothing that can be done.
887 */ 902 */
888 if (kill) 903 if (kill)
889 collect_procs(p, &tokill); 904 collect_procs(hpage, &tokill);
890 905
891 /* 906 /*
892 * try_to_unmap can fail temporarily due to races. 907 * try_to_unmap can fail temporarily due to races.
893 * Try a few times (RED-PEN better strategy?) 908 * Try a few times (RED-PEN better strategy?)
894 */ 909 */
895 for (i = 0; i < N_UNMAP_TRIES; i++) { 910 for (i = 0; i < N_UNMAP_TRIES; i++) {
896 ret = try_to_unmap(p, ttu); 911 ret = try_to_unmap(hpage, ttu);
897 if (ret == SWAP_SUCCESS) 912 if (ret == SWAP_SUCCESS)
898 break; 913 break;
899 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); 914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
901 916
902 if (ret != SWAP_SUCCESS) 917 if (ret != SWAP_SUCCESS)
903 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
904 pfn, page_mapcount(p)); 919 pfn, page_mapcount(hpage));
905 920
906 /* 921 /*
907 * Now that the dirty bit has been propagated to the 922 * Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
912 * use a more force-full uncatchable kill to prevent 927 * use a more force-full uncatchable kill to prevent
913 * any accesses to the poisoned memory. 928 * any accesses to the poisoned memory.
914 */ 929 */
915 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 930 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
916 ret != SWAP_SUCCESS, pfn); 931 ret != SWAP_SUCCESS, pfn);
917 932
918 return ret; 933 return ret;
919} 934}
920 935
936static void set_page_hwpoison_huge_page(struct page *hpage)
937{
938 int i;
939 int nr_pages = 1 << compound_order(hpage);
940 for (i = 0; i < nr_pages; i++)
941 SetPageHWPoison(hpage + i);
942}
943
944static void clear_page_hwpoison_huge_page(struct page *hpage)
945{
946 int i;
947 int nr_pages = 1 << compound_order(hpage);
948 for (i = 0; i < nr_pages; i++)
949 ClearPageHWPoison(hpage + i);
950}
951
921int __memory_failure(unsigned long pfn, int trapno, int flags) 952int __memory_failure(unsigned long pfn, int trapno, int flags)
922{ 953{
923 struct page_state *ps; 954 struct page_state *ps;
924 struct page *p; 955 struct page *p;
956 struct page *hpage;
925 int res; 957 int res;
958 unsigned int nr_pages;
926 959
927 if (!sysctl_memory_failure_recovery) 960 if (!sysctl_memory_failure_recovery)
928 panic("Memory failure from trap %d on page %lx", trapno, pfn); 961 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
935 } 968 }
936 969
937 p = pfn_to_page(pfn); 970 p = pfn_to_page(pfn);
971 hpage = compound_head(p);
938 if (TestSetPageHWPoison(p)) { 972 if (TestSetPageHWPoison(p)) {
939 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 973 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
940 return 0; 974 return 0;
941 } 975 }
942 976
943 atomic_long_add(1, &mce_bad_pages); 977 nr_pages = 1 << compound_order(hpage);
978 atomic_long_add(nr_pages, &mce_bad_pages);
944 979
945 /* 980 /*
946 * We need/can do nothing about count=0 pages. 981 * We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
954 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 989 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
955 */ 990 */
956 if (!(flags & MF_COUNT_INCREASED) && 991 if (!(flags & MF_COUNT_INCREASED) &&
957 !get_page_unless_zero(compound_head(p))) { 992 !get_page_unless_zero(hpage)) {
958 if (is_free_buddy_page(p)) { 993 if (is_free_buddy_page(p)) {
959 action_result(pfn, "free buddy", DELAYED); 994 action_result(pfn, "free buddy", DELAYED);
960 return 0; 995 return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
972 * The check (unnecessarily) ignores LRU pages being isolated and 1007 * The check (unnecessarily) ignores LRU pages being isolated and
973 * walked by the page reclaim code, however that's not a big loss. 1008 * walked by the page reclaim code, however that's not a big loss.
974 */ 1009 */
975 if (!PageLRU(p)) 1010 if (!PageLRU(p) && !PageHuge(p))
976 shake_page(p, 0); 1011 shake_page(p, 0);
977 if (!PageLRU(p)) { 1012 if (!PageLRU(p) && !PageHuge(p)) {
978 /* 1013 /*
979 * shake_page could have turned it free. 1014 * shake_page could have turned it free.
980 */ 1015 */
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
992 * It's very difficult to mess with pages currently under IO 1027 * It's very difficult to mess with pages currently under IO
993 * and in many cases impossible, so we just avoid it here. 1028 * and in many cases impossible, so we just avoid it here.
994 */ 1029 */
995 lock_page_nosync(p); 1030 lock_page_nosync(hpage);
996 1031
997 /* 1032 /*
998 * unpoison always clear PG_hwpoison inside page lock 1033 * unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1004 } 1039 }
1005 if (hwpoison_filter(p)) { 1040 if (hwpoison_filter(p)) {
1006 if (TestClearPageHWPoison(p)) 1041 if (TestClearPageHWPoison(p))
1007 atomic_long_dec(&mce_bad_pages); 1042 atomic_long_sub(nr_pages, &mce_bad_pages);
1008 unlock_page(p); 1043 unlock_page(hpage);
1009 put_page(p); 1044 put_page(hpage);
1045 return 0;
1046 }
1047
1048 /*
1049 * For error on the tail page, we should set PG_hwpoison
1050 * on the head page to show that the hugepage is hwpoisoned
1051 */
1052 if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1053 action_result(pfn, "hugepage already hardware poisoned",
1054 IGNORED);
1055 unlock_page(hpage);
1056 put_page(hpage);
1010 return 0; 1057 return 0;
1011 } 1058 }
1059 /*
1060 * Set PG_hwpoison on all pages in an error hugepage,
1061 * because containment is done in hugepage unit for now.
1062 * Since we have done TestSetPageHWPoison() for the head page with
1063 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1064 */
1065 if (PageHuge(p))
1066 set_page_hwpoison_huge_page(hpage);
1012 1067
1013 wait_on_page_writeback(p); 1068 wait_on_page_writeback(p);
1014 1069
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1039 } 1094 }
1040 } 1095 }
1041out: 1096out:
1042 unlock_page(p); 1097 unlock_page(hpage);
1043 return res; 1098 return res;
1044} 1099}
1045EXPORT_SYMBOL_GPL(__memory_failure); 1100EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
1083 struct page *page; 1138 struct page *page;
1084 struct page *p; 1139 struct page *p;
1085 int freeit = 0; 1140 int freeit = 0;
1141 unsigned int nr_pages;
1086 1142
1087 if (!pfn_valid(pfn)) 1143 if (!pfn_valid(pfn))
1088 return -ENXIO; 1144 return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
1095 return 0; 1151 return 0;
1096 } 1152 }
1097 1153
1154 nr_pages = 1 << compound_order(page);
1155
1098 if (!get_page_unless_zero(page)) { 1156 if (!get_page_unless_zero(page)) {
1099 if (TestClearPageHWPoison(p)) 1157 if (TestClearPageHWPoison(p))
1100 atomic_long_dec(&mce_bad_pages); 1158 atomic_long_sub(nr_pages, &mce_bad_pages);
1101 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1102 return 0; 1160 return 0;
1103 } 1161 }
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
1109 * the PG_hwpoison page will be caught and isolated on the entrance to 1167 * the PG_hwpoison page will be caught and isolated on the entrance to
1110 * the free buddy page pool. 1168 * the free buddy page pool.
1111 */ 1169 */
1112 if (TestClearPageHWPoison(p)) { 1170 if (TestClearPageHWPoison(page)) {
1113 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1114 atomic_long_dec(&mce_bad_pages); 1172 atomic_long_sub(nr_pages, &mce_bad_pages);
1115 freeit = 1; 1173 freeit = 1;
1116 } 1174 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1117 unlock_page(page); 1177 unlock_page(page);
1118 1178
1119 put_page(page); 1179 put_page(page);
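
Several memory-failure.c hunks switch from single-page to hugepage-wide bookkeeping: the number of affected subpages is derived from compound_order() and every tail page is marked alongside the head. A toy sketch of that per-subpage accounting over a plain array, with a hypothetical flag in place of PG_hwpoison and a plain counter in place of mce_bad_pages:

#include <stdio.h>

#define MAX_PAGES 16

/* Hypothetical per-page poison flag standing in for PG_hwpoison. */
static int hwpoison[MAX_PAGES];
static long bad_pages;          /* analogue of mce_bad_pages */

/* Mark every subpage of a compound page of the given order. */
static void poison_huge_page(int head, unsigned int order)
{
	int nr_pages = 1 << order;
	int i;

	for (i = 0; i < nr_pages; i++)
		hwpoison[head + i] = 1;
	bad_pages += nr_pages;  /* account all subpages, not just one */
}

static void unpoison_huge_page(int head, unsigned int order)
{
	int nr_pages = 1 << order;
	int i;

	for (i = 0; i < nr_pages; i++)
		hwpoison[head + i] = 0;
	bad_pages -= nr_pages;
}

int main(void)
{
	poison_huge_page(0, 2);   /* a 4-subpage "hugepage" at index 0 */
	printf("bad_pages after poison: %ld\n", bad_pages);
	unpoison_huge_page(0, 2);
	printf("bad_pages after unpoison: %ld\n", bad_pages);
	return 0;
}
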
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d3633..2ed2267439df 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
307{ 307{
308 pgd_t *pgd; 308 pgd_t *pgd;
309 unsigned long next; 309 unsigned long next;
310 unsigned long start;
311 310
312 /* 311 /*
313 * The next few lines have given us lots of grief... 312 * The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
351 if (addr > end - 1) 350 if (addr > end - 1)
352 return; 351 return;
353 352
354 start = addr;
355 pgd = pgd_offset(tlb->mm, addr); 353 pgd = pgd_offset(tlb->mm, addr);
356 do { 354 do {
357 next = pgd_addr_end(addr, end); 355 next = pgd_addr_end(addr, end);
@@ -2008,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2008{ 2006{
2009 pgd_t *pgd; 2007 pgd_t *pgd;
2010 unsigned long next; 2008 unsigned long next;
2011 unsigned long start = addr, end = addr + size; 2009 unsigned long end = addr + size;
2012 int err; 2010 int err;
2013 2011
2014 BUG_ON(addr >= end); 2012 BUG_ON(addr >= end);
2015 mmu_notifier_invalidate_range_start(mm, start, end);
2016 pgd = pgd_offset(mm, addr); 2013 pgd = pgd_offset(mm, addr);
2017 do { 2014 do {
2018 next = pgd_addr_end(addr, end); 2015 next = pgd_addr_end(addr, end);
@@ -2020,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2020 if (err) 2017 if (err)
2021 break; 2018 break;
2022 } while (pgd++, addr = next, addr != end); 2019 } while (pgd++, addr = next, addr != end);
2023 mmu_notifier_invalidate_range_end(mm, start, end); 2020
2024 return err; 2021 return err;
2025} 2022}
2026EXPORT_SYMBOL_GPL(apply_to_page_range); 2023EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2630,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2630 swp_entry_t entry; 2627 swp_entry_t entry;
2631 pte_t pte; 2628 pte_t pte;
2632 struct mem_cgroup *ptr = NULL; 2629 struct mem_cgroup *ptr = NULL;
2630 int exclusive = 0;
2633 int ret = 0; 2631 int ret = 0;
2634 2632
2635 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2633 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2724,10 +2722,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2724 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2722 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2725 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2723 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2726 flags &= ~FAULT_FLAG_WRITE; 2724 flags &= ~FAULT_FLAG_WRITE;
2725 ret |= VM_FAULT_WRITE;
2726 exclusive = 1;
2727 } 2727 }
2728 flush_icache_page(vma, page); 2728 flush_icache_page(vma, page);
2729 set_pte_at(mm, address, page_table, pte); 2729 set_pte_at(mm, address, page_table, pte);
2730 page_add_anon_rmap(page, vma, address); 2730 do_page_add_anon_rmap(page, vma, address, exclusive);
2731 /* It's better to call commit-charge after rmap is established */ 2731 /* It's better to call commit-charge after rmap is established */
2732 mem_cgroup_commit_charge_swapin(page, ptr); 2732 mem_cgroup_commit_charge_swapin(page, ptr);
2733 2733
@@ -2760,6 +2760,33 @@ out_release:
2760} 2760}
2761 2761
2762/* 2762/*
2763 * This is like a special single-page "expand_downwards()",
2764 * except we must first make sure that 'address-PAGE_SIZE'
2765 * doesn't hit another vma.
2766 *
2767 * The "find_vma()" will do the right thing even if we wrap
2768 */
2769static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2770{
2771 address &= PAGE_MASK;
2772 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2773 struct vm_area_struct *prev = vma->vm_prev;
2774
2775 /*
2776 * Is there a mapping abutting this one below?
2777 *
2778 * That's only ok if it's the same stack mapping
2779 * that has gotten split..
2780 */
2781 if (prev && prev->vm_end == address)
2782 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2783
2784 expand_stack(vma, address - PAGE_SIZE);
2785 }
2786 return 0;
2787}
2788
2789/*
2763 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2790 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2764 * but allow concurrent faults), and pte mapped but not yet locked. 2791 * but allow concurrent faults), and pte mapped but not yet locked.
2765 * We return with mmap_sem still held, but pte unmapped and unlocked. 2792 * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2772,19 +2799,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2772 spinlock_t *ptl; 2799 spinlock_t *ptl;
2773 pte_t entry; 2800 pte_t entry;
2774 2801
2802 pte_unmap(page_table);
2803
2804 /* Check if we need to add a guard page to the stack */
2805 if (check_stack_guard_page(vma, address) < 0)
2806 return VM_FAULT_SIGBUS;
2807
2808 /* Use the zero-page for reads */
2775 if (!(flags & FAULT_FLAG_WRITE)) { 2809 if (!(flags & FAULT_FLAG_WRITE)) {
2776 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2810 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2777 vma->vm_page_prot)); 2811 vma->vm_page_prot));
2778 ptl = pte_lockptr(mm, pmd); 2812 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2779 spin_lock(ptl);
2780 if (!pte_none(*page_table)) 2813 if (!pte_none(*page_table))
2781 goto unlock; 2814 goto unlock;
2782 goto setpte; 2815 goto setpte;
2783 } 2816 }
2784 2817
2785 /* Allocate our own private page. */ 2818 /* Allocate our own private page. */
2786 pte_unmap(page_table);
2787
2788 if (unlikely(anon_vma_prepare(vma))) 2819 if (unlikely(anon_vma_prepare(vma)))
2789 goto oom; 2820 goto oom;
2790 page = alloc_zeroed_user_highpage_movable(vma, address); 2821 page = alloc_zeroed_user_highpage_movable(vma, address);
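
The new check_stack_guard_page() above keeps the bottom page of a VM_GROWSDOWN vma as a guard page unless the mapping directly below it is the same stack that was split. A simplified userspace model of that address test, using a hypothetical mini_vma struct instead of vm_area_struct:

#include <stdio.h>

#define PAGE_SIZE    4096UL
#define PAGE_MASK    (~(PAGE_SIZE - 1))
#define VM_GROWSDOWN 0x1

/* Hypothetical miniature vma: just the fields the check needs. */
struct mini_vma {
	unsigned long start, end, flags;
	struct mini_vma *prev;
};

/* Returns 1 if faulting at addr would land on the stack guard page and
 * the mapping below is unrelated, so the fault should be refused. */
static int hits_guard_page(struct mini_vma *vma, unsigned long addr)
{
	addr &= PAGE_MASK;
	if (!(vma->flags & VM_GROWSDOWN) || addr != vma->start)
		return 0;
	/* A directly abutting mapping is only ok if it is stack too. */
	if (vma->prev && vma->prev->end == addr)
		return !(vma->prev->flags & VM_GROWSDOWN);
	return 0;   /* no neighbour: the stack could simply expand */
}

int main(void)
{
	struct mini_vma below = { 0x10000, 0x20000, 0, NULL };
	struct mini_vma stack = { 0x20000, 0x30000, VM_GROWSDOWN, &below };

	printf("fault at stack bottom hits guard page: %d\n",
	       hits_guard_page(&stack, 0x20000));
	return 0;
}

In the kernel version the non-refused case also expands the stack by one page, which is why do_anonymous_page() can call it before touching the pte.
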
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..f969da5dd8a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1275,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1275 const unsigned long __user *, new_nodes) 1275 const unsigned long __user *, new_nodes)
1276{ 1276{
1277 const struct cred *cred = current_cred(), *tcred; 1277 const struct cred *cred = current_cred(), *tcred;
1278 struct mm_struct *mm; 1278 struct mm_struct *mm = NULL;
1279 struct task_struct *task; 1279 struct task_struct *task;
1280 nodemask_t old;
1281 nodemask_t new;
1282 nodemask_t task_nodes; 1280 nodemask_t task_nodes;
1283 int err; 1281 int err;
1282 nodemask_t *old;
1283 nodemask_t *new;
1284 NODEMASK_SCRATCH(scratch);
1285
1286 if (!scratch)
1287 return -ENOMEM;
1288
1289 old = &scratch->mask1;
1290 new = &scratch->mask2;
1284 1291
1285 err = get_nodes(&old, old_nodes, maxnode); 1292 err = get_nodes(old, old_nodes, maxnode);
1286 if (err) 1293 if (err)
1287 return err; 1294 goto out;
1288 1295
1289 err = get_nodes(&new, new_nodes, maxnode); 1296 err = get_nodes(new, new_nodes, maxnode);
1290 if (err) 1297 if (err)
1291 return err; 1298 goto out;
1292 1299
1293 /* Find the mm_struct */ 1300 /* Find the mm_struct */
1294 read_lock(&tasklist_lock); 1301 read_lock(&tasklist_lock);
1295 task = pid ? find_task_by_vpid(pid) : current; 1302 task = pid ? find_task_by_vpid(pid) : current;
1296 if (!task) { 1303 if (!task) {
1297 read_unlock(&tasklist_lock); 1304 read_unlock(&tasklist_lock);
1298 return -ESRCH; 1305 err = -ESRCH;
1306 goto out;
1299 } 1307 }
1300 mm = get_task_mm(task); 1308 mm = get_task_mm(task);
1301 read_unlock(&tasklist_lock); 1309 read_unlock(&tasklist_lock);
1302 1310
1311 err = -EINVAL;
1303 if (!mm) 1312 if (!mm)
1304 return -EINVAL; 1313 goto out;
1305 1314
1306 /* 1315 /*
1307 * Check if this process has the right to modify the specified 1316 * Check if this process has the right to modify the specified
@@ -1322,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 1331
1323 task_nodes = cpuset_mems_allowed(task); 1332 task_nodes = cpuset_mems_allowed(task);
1324 /* Is the user allowed to access the target nodes? */ 1333 /* Is the user allowed to access the target nodes? */
1325 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { 1334 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1326 err = -EPERM; 1335 err = -EPERM;
1327 goto out; 1336 goto out;
1328 } 1337 }
1329 1338
1330 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { 1339 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1331 err = -EINVAL; 1340 err = -EINVAL;
1332 goto out; 1341 goto out;
1333 } 1342 }
@@ -1336,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1336 if (err) 1345 if (err)
1337 goto out; 1346 goto out;
1338 1347
1339 err = do_migrate_pages(mm, &old, &new, 1348 err = do_migrate_pages(mm, old, new,
1340 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1349 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1341out: 1350out:
1342 mmput(mm); 1351 if (mm)
1352 mmput(mm);
1353 NODEMASK_SCRATCH_FREE(scratch);
1354
1343 return err; 1355 return err;
1344} 1356}
1345 1357
@@ -1712,6 +1724,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1712} 1724}
1713#endif 1725#endif
1714 1726
1727/*
1728 * mempolicy_nodemask_intersects
1729 *
1730 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1731 * policy. Otherwise, check for intersection between mask and the policy
 1732 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1733 * policy, always return true since it may allocate elsewhere on fallback.
1734 *
1735 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1736 */
1737bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1738 const nodemask_t *mask)
1739{
1740 struct mempolicy *mempolicy;
1741 bool ret = true;
1742
1743 if (!mask)
1744 return ret;
1745 task_lock(tsk);
1746 mempolicy = tsk->mempolicy;
1747 if (!mempolicy)
1748 goto out;
1749
1750 switch (mempolicy->mode) {
1751 case MPOL_PREFERRED:
1752 /*
1753 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1754 * allocate from, they may fallback to other nodes when oom.
1755 * Thus, it's possible for tsk to have allocated memory from
1756 * nodes in mask.
1757 */
1758 break;
1759 case MPOL_BIND:
1760 case MPOL_INTERLEAVE:
1761 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1762 break;
1763 default:
1764 BUG();
1765 }
1766out:
1767 task_unlock(tsk);
1768 return ret;
1769}
1770
1715/* Allocate a page in interleaved policy. 1771/* Allocate a page in interleaved policy.
1716 Own path because it needs to do special accounting. */ 1772 Own path because it needs to do special accounting. */
1717static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1773static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
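
The sys_migrate_pages() rework above stops keeping two nodemask_t values on the kernel stack and works through a heap-allocated NODEMASK_SCRATCH instead, released on a single exit path. The same shape in plain C, with a hypothetical scratch struct and a stub in place of get_nodes():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins: a "nodemask" too big to keep on the stack. */
typedef struct { unsigned long bits[64]; } nodemask_t;

struct nodemask_scratch {
	nodemask_t mask1;
	nodemask_t mask2;
};

static int copy_mask_from_user(nodemask_t *dst)  /* pretend get_nodes() */
{
	memset(dst, 0, sizeof(*dst));
	return 0;
}

static int migrate_pages_example(void)
{
	struct nodemask_scratch *scratch = malloc(sizeof(*scratch));
	nodemask_t *old, *new;
	int err;

	if (!scratch)
		return -12;     /* treated like -ENOMEM */
	old = &scratch->mask1;
	new = &scratch->mask2;

	err = copy_mask_from_user(old);
	if (err)
		goto out;
	err = copy_mask_from_user(new);
	if (err)
		goto out;

	/* ... this is where do_migrate_pages(mm, old, new, ...) would run ... */
	err = 0;
out:
	free(scratch);          /* one exit path releases the scratch */
	return err;
}

int main(void)
{
	printf("migrate_pages_example: %d\n", migrate_pages_example());
	return 0;
}
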
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049e..38e7cad782f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 * exist when the page is remapped later 639 * exist when the page is remapped later
640 */ 640 */
641 anon_vma = page_anon_vma(page); 641 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount); 642 get_anon_vma(anon_vma);
643 } 643 }
644 } 644 }
645 645
@@ -682,12 +682,8 @@ skip_unmap:
682rcu_unlock: 682rcu_unlock:
683 683
684 /* Drop an anon_vma reference if we took one */ 684 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 685 if (anon_vma)
686 int empty = list_empty(&anon_vma->head); 686 drop_anon_vma(anon_vma);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691 687
692 if (rcu_locked) 688 if (rcu_locked)
693 rcu_read_unlock(); 689 rcu_read_unlock();
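
The migrate.c hunk replaces the open-coded atomic_dec_and_lock()/list_empty()/free sequence with get_anon_vma()/drop_anon_vma(). A minimal pthread sketch of the underlying idea, where the holder of the last reference frees the object and decides so under the lock, using a hypothetical refcounted struct rather than anon_vma:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

/* Hypothetical refcounted object standing in for anon_vma. */
struct ref_obj {
	pthread_mutex_t lock;
	int refcount;
	int empty;      /* analogue of list_empty(&anon_vma->head) */
};

static void get_obj(struct ref_obj *obj)
{
	pthread_mutex_lock(&obj->lock);
	obj->refcount++;
	pthread_mutex_unlock(&obj->lock);
}

/* Drop a reference; the dropper of the last reference frees the object,
 * and the "is it really unused" check happens while the lock is held. */
static void drop_obj(struct ref_obj *obj)
{
	int free_it;

	pthread_mutex_lock(&obj->lock);
	free_it = (--obj->refcount == 0) && obj->empty;
	pthread_mutex_unlock(&obj->lock);
	if (free_it)
		free(obj);
}

int main(void)
{
	struct ref_obj *obj = calloc(1, sizeof(*obj));

	pthread_mutex_init(&obj->lock, NULL);
	obj->empty = 1;
	get_obj(obj);   /* migration path takes a reference ... */
	drop_obj(obj);  /* ... and drops it when the copy is done */
	printf("done\n");
	return 0;
}
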
diff --git a/mm/mlock.c b/mm/mlock.c
index 3f82720e0515..cbae7c5b9568 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,6 +135,19 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138/* Is the vma a continuation of the stack vma above it? */
139static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr)
140{
141 return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
142}
143
144static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
145{
146 return (vma->vm_flags & VM_GROWSDOWN) &&
147 (vma->vm_start == addr) &&
148 !vma_stack_continue(vma->vm_prev, addr);
149}
150
138/** 151/**
139 * __mlock_vma_pages_range() - mlock a range of pages in the vma. 152 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
140 * @vma: target vma 153 * @vma: target vma
@@ -167,6 +180,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
167 if (vma->vm_flags & VM_WRITE) 180 if (vma->vm_flags & VM_WRITE)
168 gup_flags |= FOLL_WRITE; 181 gup_flags |= FOLL_WRITE;
169 182
183 /* We don't try to access the guard page of a stack vma */
184 if (stack_guard_page(vma, start)) {
185 addr += PAGE_SIZE;
186 nr_pages--;
187 }
188
170 while (nr_pages > 0) { 189 while (nr_pages > 0) {
171 int i; 190 int i;
172 191
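The two mlock.c hunks above cooperate: the guard page at the low end of a VM_GROWSDOWN stack vma must never be faulted in by mlock(). A compressed restatement of that check as one helper, assuming start is page-aligned and nr_pages counts whole pages (the helper name is illustrative only):

/* Illustrative only: skip the guard page of a grows-down stack vma, unless
 * this "vma" is really a continuation of the stack vma directly above it
 * (its vm_prev ends exactly where this one starts). */
static void mlock_skip_stack_guard(struct vm_area_struct *vma,
				   unsigned long *addr, long *nr_pages)
{
	struct vm_area_struct *prev = vma->vm_prev;

	if ((vma->vm_flags & VM_GROWSDOWN) && vma->vm_start == *addr &&
	    !(prev && prev->vm_end == *addr && (prev->vm_flags & VM_GROWSDOWN))) {
		*addr += PAGE_SIZE;	/* step over the guard page */
		*nr_pages -= 1;		/* and request one page fewer */
	}
}

Note that vma_stack_continue() relies on the vm_prev pointer introduced by the mm/mmap.c changes below.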
diff --git a/mm/mmap.c b/mm/mmap.c
index e38e910cb756..331e51af38c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -388,17 +388,23 @@ static inline void
388__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 388__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
389 struct vm_area_struct *prev, struct rb_node *rb_parent) 389 struct vm_area_struct *prev, struct rb_node *rb_parent)
390{ 390{
391 struct vm_area_struct *next;
392
393 vma->vm_prev = prev;
391 if (prev) { 394 if (prev) {
392 vma->vm_next = prev->vm_next; 395 next = prev->vm_next;
393 prev->vm_next = vma; 396 prev->vm_next = vma;
394 } else { 397 } else {
395 mm->mmap = vma; 398 mm->mmap = vma;
396 if (rb_parent) 399 if (rb_parent)
397 vma->vm_next = rb_entry(rb_parent, 400 next = rb_entry(rb_parent,
398 struct vm_area_struct, vm_rb); 401 struct vm_area_struct, vm_rb);
399 else 402 else
400 vma->vm_next = NULL; 403 next = NULL;
401 } 404 }
405 vma->vm_next = next;
406 if (next)
407 next->vm_prev = vma;
402} 408}
403 409
404void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 410void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
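The point of threading vm_prev through __vma_link_list() here (and through __vma_unlink(), detach_vmas_to_be_unmapped() and nommu's add_vma_to_mm() below) is that the per-mm vma list becomes doubly linked, so a vma's predecessor is reachable in O(1). A minimal sketch of code that can rely on the new invariant; the helper name is hypothetical:

/* Hypothetical helper: size of the unmapped gap directly below 'vma'.
 * Only valid because vm_prev is now kept consistent by __vma_link_list(),
 * __vma_unlink() and detach_vmas_to_be_unmapped(). */
static unsigned long gap_below(struct vm_area_struct *vma)
{
	struct vm_area_struct *prev = vma->vm_prev;

	return prev ? vma->vm_start - prev->vm_end : vma->vm_start;
}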
@@ -452,12 +458,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
452 spin_lock(&mapping->i_mmap_lock); 458 spin_lock(&mapping->i_mmap_lock);
453 vma->vm_truncate_count = mapping->truncate_count; 459 vma->vm_truncate_count = mapping->truncate_count;
454 } 460 }
455 anon_vma_lock(vma);
456 461
457 __vma_link(mm, vma, prev, rb_link, rb_parent); 462 __vma_link(mm, vma, prev, rb_link, rb_parent);
458 __vma_link_file(vma); 463 __vma_link_file(vma);
459 464
460 anon_vma_unlock(vma);
461 if (mapping) 465 if (mapping)
462 spin_unlock(&mapping->i_mmap_lock); 466 spin_unlock(&mapping->i_mmap_lock);
463 467
@@ -485,7 +489,11 @@ static inline void
485__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 489__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
486 struct vm_area_struct *prev) 490 struct vm_area_struct *prev)
487{ 491{
488 prev->vm_next = vma->vm_next; 492 struct vm_area_struct *next = vma->vm_next;
493
494 prev->vm_next = next;
495 if (next)
496 next->vm_prev = prev;
489 rb_erase(&vma->vm_rb, &mm->mm_rb); 497 rb_erase(&vma->vm_rb, &mm->mm_rb);
490 if (mm->mmap_cache == vma) 498 if (mm->mmap_cache == vma)
491 mm->mmap_cache = prev; 499 mm->mmap_cache = prev;
@@ -506,6 +514,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
506 struct vm_area_struct *importer = NULL; 514 struct vm_area_struct *importer = NULL;
507 struct address_space *mapping = NULL; 515 struct address_space *mapping = NULL;
508 struct prio_tree_root *root = NULL; 516 struct prio_tree_root *root = NULL;
517 struct anon_vma *anon_vma = NULL;
509 struct file *file = vma->vm_file; 518 struct file *file = vma->vm_file;
510 long adjust_next = 0; 519 long adjust_next = 0;
511 int remove_next = 0; 520 int remove_next = 0;
@@ -578,6 +587,17 @@ again: remove_next = 1 + (end > next->vm_end);
578 } 587 }
579 } 588 }
580 589
590 /*
591 * When changing only vma->vm_end, we don't really need anon_vma
592 * lock. This is a fairly rare case by itself, but the anon_vma
593 * lock may be shared between many sibling processes. Skipping
594 * the lock for brk adjustments makes a difference sometimes.
595 */
596 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
597 anon_vma = vma->anon_vma;
598 anon_vma_lock(anon_vma);
599 }
600
581 if (root) { 601 if (root) {
582 flush_dcache_mmap_lock(mapping); 602 flush_dcache_mmap_lock(mapping);
583 vma_prio_tree_remove(vma, root); 603 vma_prio_tree_remove(vma, root);
@@ -617,6 +637,8 @@ again: remove_next = 1 + (end > next->vm_end);
617 __insert_vm_struct(mm, insert); 637 __insert_vm_struct(mm, insert);
618 } 638 }
619 639
640 if (anon_vma)
641 anon_vma_unlock(anon_vma);
620 if (mapping) 642 if (mapping)
621 spin_unlock(&mapping->i_mmap_lock); 643 spin_unlock(&mapping->i_mmap_lock);
622 644
@@ -1710,7 +1732,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1710 */ 1732 */
1711 if (unlikely(anon_vma_prepare(vma))) 1733 if (unlikely(anon_vma_prepare(vma)))
1712 return -ENOMEM; 1734 return -ENOMEM;
1713 anon_vma_lock(vma); 1735 vma_lock_anon_vma(vma);
1714 1736
1715 /* 1737 /*
1716 * vma->vm_start/vm_end cannot change under us because the caller 1738 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1743,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1721 if (address < PAGE_ALIGN(address+4)) 1743 if (address < PAGE_ALIGN(address+4))
1722 address = PAGE_ALIGN(address+4); 1744 address = PAGE_ALIGN(address+4);
1723 else { 1745 else {
1724 anon_vma_unlock(vma); 1746 vma_unlock_anon_vma(vma);
1725 return -ENOMEM; 1747 return -ENOMEM;
1726 } 1748 }
1727 error = 0; 1749 error = 0;
@@ -1739,7 +1761,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1739 perf_event_mmap(vma); 1761 perf_event_mmap(vma);
1740 } 1762 }
1741 } 1763 }
1742 anon_vma_unlock(vma); 1764 vma_unlock_anon_vma(vma);
1743 return error; 1765 return error;
1744} 1766}
1745#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1767#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1764,7 +1786,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1764 if (error) 1786 if (error)
1765 return error; 1787 return error;
1766 1788
1767 anon_vma_lock(vma); 1789 vma_lock_anon_vma(vma);
1768 1790
1769 /* 1791 /*
1770 * vma->vm_start/vm_end cannot change under us because the caller 1792 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1786,7 +1808,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1786 perf_event_mmap(vma); 1808 perf_event_mmap(vma);
1787 } 1809 }
1788 } 1810 }
1789 anon_vma_unlock(vma); 1811 vma_unlock_anon_vma(vma);
1790 return error; 1812 return error;
1791} 1813}
1792 1814
@@ -1903,6 +1925,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1903 unsigned long addr; 1925 unsigned long addr;
1904 1926
1905 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 1927 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1928 vma->vm_prev = NULL;
1906 do { 1929 do {
1907 rb_erase(&vma->vm_rb, &mm->mm_rb); 1930 rb_erase(&vma->vm_rb, &mm->mm_rb);
1908 mm->map_count--; 1931 mm->map_count--;
@@ -1910,6 +1933,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1910 vma = vma->vm_next; 1933 vma = vma->vm_next;
1911 } while (vma && vma->vm_start < end); 1934 } while (vma && vma->vm_start < end);
1912 *insertion_point = vma; 1935 *insertion_point = vma;
1936 if (vma)
1937 vma->vm_prev = prev;
1913 tail_vma->vm_next = NULL; 1938 tail_vma->vm_next = NULL;
1914 if (mm->unmap_area == arch_unmap_area) 1939 if (mm->unmap_area == arch_unmap_area)
1915 addr = prev ? prev->vm_end : mm->mmap_base; 1940 addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2470,23 +2495,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2470 2495
2471static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2496static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2472{ 2497{
2473 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2498 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2474 /* 2499 /*
2475 * The LSB of head.next can't change from under us 2500 * The LSB of head.next can't change from under us
2476 * because we hold the mm_all_locks_mutex. 2501 * because we hold the mm_all_locks_mutex.
2477 */ 2502 */
2478 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); 2503 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
2479 /* 2504 /*
2480 * We can safely modify head.next after taking the 2505 * We can safely modify head.next after taking the
2481 * anon_vma->lock. If some other vma in this mm shares 2506 * anon_vma->root->lock. If some other vma in this mm shares
2482 * the same anon_vma we won't take it again. 2507 * the same anon_vma we won't take it again.
2483 * 2508 *
2484 * No need of atomic instructions here, head.next 2509 * No need of atomic instructions here, head.next
2485 * can't change from under us thanks to the 2510 * can't change from under us thanks to the
2486 * anon_vma->lock. 2511 * anon_vma->root->lock.
2487 */ 2512 */
2488 if (__test_and_set_bit(0, (unsigned long *) 2513 if (__test_and_set_bit(0, (unsigned long *)
2489 &anon_vma->head.next)) 2514 &anon_vma->root->head.next))
2490 BUG(); 2515 BUG();
2491 } 2516 }
2492} 2517}
@@ -2577,7 +2602,7 @@ out_unlock:
2577 2602
2578static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2603static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2579{ 2604{
2580 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2605 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2581 /* 2606 /*
2582 * The LSB of head.next can't change to 0 from under 2607 * The LSB of head.next can't change to 0 from under
2583 * us because we hold the mm_all_locks_mutex. 2608 * us because we hold the mm_all_locks_mutex.
@@ -2588,12 +2613,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2588 * 2613 *
2589 * No need of atomic instructions here, head.next 2614 * No need of atomic instructions here, head.next
2590 * can't change from under us until we release the 2615 * can't change from under us until we release the
2591 * anon_vma->lock. 2616 * anon_vma->root->lock.
2592 */ 2617 */
2593 if (!__test_and_clear_bit(0, (unsigned long *) 2618 if (!__test_and_clear_bit(0, (unsigned long *)
2594 &anon_vma->head.next)) 2619 &anon_vma->root->head.next))
2595 BUG(); 2620 BUG();
2596 spin_unlock(&anon_vma->lock); 2621 anon_vma_unlock(anon_vma);
2597 } 2622 }
2598} 2623}
2599 2624
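mm_take_all_locks() keeps using the low bit of head.next as an "anon_vma already locked" marker; the only change in the hunks above is that the bit and the spinlock now live in the shared anon_vma->root. For readers unfamiliar with the idiom, a self-contained sketch with invented names:

#include <stdbool.h>
#include <stdint.h>

/* Bit 0 of a pointer to any naturally aligned structure is always zero,
 * so it can double as a one-bit flag while the lock protecting the
 * structure is held; the real pointer is recovered by masking it off. */
static bool mark_once(uintptr_t *nextp)
{
	if (*nextp & 1)
		return false;		/* already marked by an earlier vma */
	*nextp |= 1;			/* set the marker */
	return true;
}

static void unmark(uintptr_t *nextp)
{
	*nextp &= ~(uintptr_t)1;	/* clear the marker on unlock */
}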
diff --git a/mm/nommu.c b/mm/nommu.c
index b76f3ee0abe0..88ff091eb07a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -36,11 +36,6 @@
36#include <asm/mmu_context.h> 36#include <asm/mmu_context.h>
37#include "internal.h" 37#include "internal.h"
38 38
39static inline __attribute__((format(printf, 1, 2)))
40void no_printk(const char *fmt, ...)
41{
42}
43
44#if 0 39#if 0
45#define kenter(FMT, ...) \ 40#define kenter(FMT, ...) \
46 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) 41 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -609,7 +604,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
609 */ 604 */
610static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 605static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
611{ 606{
612 struct vm_area_struct *pvma, **pp; 607 struct vm_area_struct *pvma, **pp, *next;
613 struct address_space *mapping; 608 struct address_space *mapping;
614 struct rb_node **p, *parent; 609 struct rb_node **p, *parent;
615 610
@@ -669,8 +664,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
669 break; 664 break;
670 } 665 }
671 666
672 vma->vm_next = *pp; 667 next = *pp;
673 *pp = vma; 668 *pp = vma;
669 vma->vm_next = next;
670 if (next)
671 next->vm_prev = vma;
674} 672}
675 673
676/* 674/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 709aedfaa014..fc81cb22869e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
4 * Copyright (C) 1998,2000 Rik van Riel 4 * Copyright (C) 1998,2000 Rik van Riel
5 * Thanks go out to Claus Fischer for some serious inspiration and 5 * Thanks go out to Claus Fischer for some serious inspiration and
6 * for goading me into coding this file... 6 * for goading me into coding this file...
7 * Copyright (C) 2010 Google, Inc.
8 * Rewritten by David Rientjes
7 * 9 *
8 * The routines in this file are used to kill a process when 10 * The routines in this file are used to kill a process when
9 * we're seriously out of memory. This gets called from __alloc_pages() 11 * we're seriously out of memory. This gets called from __alloc_pages()
@@ -27,171 +29,188 @@
27#include <linux/module.h> 29#include <linux/module.h>
28#include <linux/notifier.h> 30#include <linux/notifier.h>
29#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h>
30#include <linux/security.h> 33#include <linux/security.h>
31 34
32int sysctl_panic_on_oom; 35int sysctl_panic_on_oom;
33int sysctl_oom_kill_allocating_task; 36int sysctl_oom_kill_allocating_task;
34int sysctl_oom_dump_tasks; 37int sysctl_oom_dump_tasks = 1;
35static DEFINE_SPINLOCK(zone_scan_lock); 38static DEFINE_SPINLOCK(zone_scan_lock);
36/* #define DEBUG */ 39
40#ifdef CONFIG_NUMA
41/**
 42 * has_intersects_mems_allowed() - check task eligibility for kill
43 * @tsk: task struct of which task to consider
44 * @mask: nodemask passed to page allocator for mempolicy ooms
45 *
46 * Task eligibility is determined by whether or not a candidate task, @tsk,
47 * shares the same mempolicy nodes as current if it is bound by such a policy
48 * and whether or not it has the same set of allowed cpuset nodes.
49 */
50static bool has_intersects_mems_allowed(struct task_struct *tsk,
51 const nodemask_t *mask)
52{
53 struct task_struct *start = tsk;
54
55 do {
56 if (mask) {
57 /*
58 * If this is a mempolicy constrained oom, tsk's
59 * cpuset is irrelevant. Only return true if its
60 * mempolicy intersects current, otherwise it may be
61 * needlessly killed.
62 */
63 if (mempolicy_nodemask_intersects(tsk, mask))
64 return true;
65 } else {
66 /*
67 * This is not a mempolicy constrained oom, so only
68 * check the mems of tsk's cpuset.
69 */
70 if (cpuset_mems_allowed_intersects(current, tsk))
71 return true;
72 }
73 } while_each_thread(start, tsk);
74
75 return false;
76}
77#else
78static bool has_intersects_mems_allowed(struct task_struct *tsk,
79 const nodemask_t *mask)
80{
81 return true;
82}
83#endif /* CONFIG_NUMA */
37 84
38/* 85/*
39 * Is all threads of the target process nodes overlap ours? 86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
40 */ 90 */
41static int has_intersects_mems_allowed(struct task_struct *tsk) 91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
42{ 93{
43 struct task_struct *t; 94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with
107 * task_lock() held.
108 */
109struct task_struct *find_lock_task_mm(struct task_struct *p)
110{
111 struct task_struct *t = p;
44 112
45 t = tsk;
46 do { 113 do {
47 if (cpuset_mems_allowed_intersects(current, t)) 114 task_lock(t);
48 return 1; 115 if (likely(t->mm))
49 t = next_thread(t); 116 return t;
50 } while (t != tsk); 117 task_unlock(t);
118 } while_each_thread(p, t);
51 119
52 return 0; 120 return NULL;
121}
122
123/* return true if the task is not adequate as candidate victim task. */
124static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
125 const nodemask_t *nodemask)
126{
127 if (is_global_init(p))
128 return true;
129 if (p->flags & PF_KTHREAD)
130 return true;
131
132 /* When mem_cgroup_out_of_memory() and p is not member of the group */
133 if (mem && !task_in_mem_cgroup(p, mem))
134 return true;
135
136 /* p may not have freeable memory in nodemask */
137 if (!has_intersects_mems_allowed(p, nodemask))
138 return true;
139
140 return false;
53} 141}
54 142
55/** 143/**
56 * badness - calculate a numeric value for how bad this task has been 144 * oom_badness - heuristic function to determine which candidate task to kill
57 * @p: task struct of which task we should calculate 145 * @p: task struct of which task we should calculate
58 * @uptime: current uptime in seconds 146 * @totalpages: total present RAM allowed for page allocation
59 *
60 * The formula used is relatively simple and documented inline in the
61 * function. The main rationale is that we want to select a good task
62 * to kill when we run out of memory.
63 * 147 *
64 * Good in this context means that: 148 * The heuristic for determining which task to kill is made to be as simple and
65 * 1) we lose the minimum amount of work done 149 * predictable as possible. The goal is to return the highest value for the
66 * 2) we recover a large amount of memory 150 * task consuming the most memory to avoid subsequent oom failures.
67 * 3) we don't kill anything innocent of eating tons of memory
68 * 4) we want to kill the minimum amount of processes (one)
69 * 5) we try to kill the process the user expects us to kill, this
70 * algorithm has been meticulously tuned to meet the principle
71 * of least surprise ... (be careful when you change it)
72 */ 151 */
73 152unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
74unsigned long badness(struct task_struct *p, unsigned long uptime) 153 const nodemask_t *nodemask, unsigned long totalpages)
75{ 154{
76 unsigned long points, cpu_time, run_time; 155 int points;
77 struct mm_struct *mm;
78 struct task_struct *child;
79 int oom_adj = p->signal->oom_adj;
80 struct task_cputime task_time;
81 unsigned long utime;
82 unsigned long stime;
83 156
84 if (oom_adj == OOM_DISABLE) 157 if (oom_unkillable_task(p, mem, nodemask))
85 return 0; 158 return 0;
86 159
87 task_lock(p); 160 p = find_lock_task_mm(p);
88 mm = p->mm; 161 if (!p)
89 if (!mm) {
90 task_unlock(p);
91 return 0; 162 return 0;
92 }
93
94 /*
95 * The memory size of the process is the basis for the badness.
96 */
97 points = mm->total_vm;
98 163
99 /* 164 /*
100 * After this unlock we can no longer dereference local variable `mm' 165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
166 * need to be executed for something that cannot be killed.
101 */ 167 */
102 task_unlock(p); 168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
103 169 task_unlock(p);
104 /* 170 return 0;
105 * swapoff can easily use up all memory, so kill those first.
106 */
107 if (p->flags & PF_OOM_ORIGIN)
108 return ULONG_MAX;
109
110 /*
111 * Processes which fork a lot of child processes are likely
112 * a good choice. We add half the vmsize of the children if they
113 * have an own mm. This prevents forking servers to flood the
114 * machine with an endless amount of children. In case a single
115 * child is eating the vast majority of memory, adding only half
116 * to the parents will make the child our kill candidate of choice.
117 */
118 list_for_each_entry(child, &p->children, sibling) {
119 task_lock(child);
120 if (child->mm != mm && child->mm)
121 points += child->mm->total_vm/2 + 1;
122 task_unlock(child);
123 } 171 }
124 172
125 /* 173 /*
126 * CPU time is in tens of seconds and run time is in thousands 174 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
127 * of seconds. There is no particular reason for this other than 175 * priority for oom killing.
128 * that it turned out to work very well in practice.
129 */
130 thread_group_cputime(p, &task_time);
131 utime = cputime_to_jiffies(task_time.utime);
132 stime = cputime_to_jiffies(task_time.stime);
133 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
134
135
136 if (uptime >= p->start_time.tv_sec)
137 run_time = (uptime - p->start_time.tv_sec) >> 10;
138 else
139 run_time = 0;
140
141 if (cpu_time)
142 points /= int_sqrt(cpu_time);
143 if (run_time)
144 points /= int_sqrt(int_sqrt(run_time));
145
146 /*
147 * Niced processes are most likely less important, so double
148 * their badness points.
149 */ 176 */
150 if (task_nice(p) > 0) 177 if (p->flags & PF_OOM_ORIGIN) {
151 points *= 2; 178 task_unlock(p);
179 return 1000;
180 }
152 181
153 /* 182 /*
154 * Superuser processes are usually more important, so we make it 183 * The memory controller may have a limit of 0 bytes, so avoid a divide
155 * less likely that we kill those. 184 * by zero, if necessary.
156 */ 185 */
157 if (has_capability_noaudit(p, CAP_SYS_ADMIN) || 186 if (!totalpages)
158 has_capability_noaudit(p, CAP_SYS_RESOURCE)) 187 totalpages = 1;
159 points /= 4;
160 188
161 /* 189 /*
162 * We don't want to kill a process with direct hardware access. 190 * The baseline for the badness score is the proportion of RAM that each
163 * Not only could that mess up the hardware, but usually users 191 * task's rss and swap space use.
164 * tend to only have this flag set on applications they think
165 * of as important.
166 */ 192 */
167 if (has_capability_noaudit(p, CAP_SYS_RAWIO)) 193 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
168 points /= 4; 194 totalpages;
195 task_unlock(p);
169 196
170 /* 197 /*
171 * If p's nodes don't overlap ours, it may still help to kill p 198 * Root processes get 3% bonus, just like the __vm_enough_memory()
172 * because p may have allocated or otherwise mapped memory on 199 * implementation used by LSMs.
173 * this node before. However it will be less likely.
174 */ 200 */
175 if (!has_intersects_mems_allowed(p)) 201 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
176 points /= 8; 202 points -= 30;
177 203
178 /* 204 /*
179 * Adjust the score by oom_adj. 205 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
206 * either completely disable oom killing or always prefer a certain
207 * task.
180 */ 208 */
181 if (oom_adj) { 209 points += p->signal->oom_score_adj;
182 if (oom_adj > 0) {
183 if (!points)
184 points = 1;
185 points <<= oom_adj;
186 } else
187 points >>= -(oom_adj);
188 }
189 210
190#ifdef DEBUG 211 if (points < 0)
191 printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", 212 return 0;
192 p->pid, p->comm, points); 213 return (points < 1000) ? points : 1000;
193#endif
194 return points;
195} 214}
196 215
197/* 216/*
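To make the rescaled heuristic concrete, a worked example with invented numbers: totalpages = 1,000,000 (RAM plus swap, in pages) and a root-owned task whose rss plus swap entries come to 200,000 pages, with oom_score_adj = 0:

	/* Illustrative arithmetic only -- follows oom_badness() above. */
	points  = 200000 * 1000 / 1000000;	/* 200: task uses 20.0% of memory */
	points -= 30;				/* CAP_SYS_ADMIN bonus, 3% of the scale */
	points += 0;				/* oom_score_adj */
	/* final badness: 170 on the 0..1000 scale */

OOM_SCORE_ADJ_MIN (-1000) short-circuits the whole calculation to 0, and PF_OOM_ORIGIN pins the task at 1000.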
@@ -199,12 +218,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
199 */ 218 */
200#ifdef CONFIG_NUMA 219#ifdef CONFIG_NUMA
201static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 220static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
202 gfp_t gfp_mask, nodemask_t *nodemask) 221 gfp_t gfp_mask, nodemask_t *nodemask,
222 unsigned long *totalpages)
203{ 223{
204 struct zone *zone; 224 struct zone *zone;
205 struct zoneref *z; 225 struct zoneref *z;
206 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 226 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
227 bool cpuset_limited = false;
228 int nid;
229
230 /* Default to all available memory */
231 *totalpages = totalram_pages + total_swap_pages;
207 232
233 if (!zonelist)
234 return CONSTRAINT_NONE;
208 /* 235 /*
209 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 236 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
 210 * killing current. We have to kill a task at random in this case. 237
@@ -214,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
214 return CONSTRAINT_NONE; 241 return CONSTRAINT_NONE;
215 242
216 /* 243 /*
217 * The nodemask here is a nodemask passed to alloc_pages(). Now, 244 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
218 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy 245 * the page allocator means a mempolicy is in effect. Cpuset policy
219 * feature. mempolicy is an only user of nodemask here. 246 * is enforced in get_page_from_freelist().
220 * check mempolicy's nodemask contains all N_HIGH_MEMORY
221 */ 247 */
222 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) 248 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
249 *totalpages = total_swap_pages;
250 for_each_node_mask(nid, *nodemask)
251 *totalpages += node_spanned_pages(nid);
223 return CONSTRAINT_MEMORY_POLICY; 252 return CONSTRAINT_MEMORY_POLICY;
253 }
224 254
225 /* Check this allocation failure is caused by cpuset's wall function */ 255 /* Check this allocation failure is caused by cpuset's wall function */
226 for_each_zone_zonelist_nodemask(zone, z, zonelist, 256 for_each_zone_zonelist_nodemask(zone, z, zonelist,
227 high_zoneidx, nodemask) 257 high_zoneidx, nodemask)
228 if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) 258 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
229 return CONSTRAINT_CPUSET; 259 cpuset_limited = true;
230 260
261 if (cpuset_limited) {
262 *totalpages = total_swap_pages;
263 for_each_node_mask(nid, cpuset_current_mems_allowed)
264 *totalpages += node_spanned_pages(nid);
265 return CONSTRAINT_CPUSET;
266 }
231 return CONSTRAINT_NONE; 267 return CONSTRAINT_NONE;
232} 268}
233#else 269#else
234static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 270static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
235 gfp_t gfp_mask, nodemask_t *nodemask) 271 gfp_t gfp_mask, nodemask_t *nodemask,
272 unsigned long *totalpages)
236{ 273{
274 *totalpages = totalram_pages + total_swap_pages;
237 return CONSTRAINT_NONE; 275 return CONSTRAINT_NONE;
238} 276}
239#endif 277#endif
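The baseline that oom_badness() divides by now shrinks with the constraint. A hedged example with invented numbers: total_swap_pages = 500,000 and an MPOL_BIND policy over two nodes spanning 250,000 pages each gives:

	/* Illustrative only: mempolicy-constrained baseline */
	*totalpages = 500000			/* total_swap_pages */
		    + 250000 + 250000;		/* node_spanned_pages() of each node in *nodemask */
	/* = 1,000,000 pages rather than totalram_pages + total_swap_pages */

The cpuset-constrained branch performs the same sum over cpuset_current_mems_allowed.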
@@ -244,28 +282,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
244 * 282 *
245 * (not docbooked, we don't want this one cluttering up the manual) 283 * (not docbooked, we don't want this one cluttering up the manual)
246 */ 284 */
247static struct task_struct *select_bad_process(unsigned long *ppoints, 285static struct task_struct *select_bad_process(unsigned int *ppoints,
248 struct mem_cgroup *mem) 286 unsigned long totalpages, struct mem_cgroup *mem,
287 const nodemask_t *nodemask)
249{ 288{
250 struct task_struct *p; 289 struct task_struct *p;
251 struct task_struct *chosen = NULL; 290 struct task_struct *chosen = NULL;
252 struct timespec uptime;
253 *ppoints = 0; 291 *ppoints = 0;
254 292
255 do_posix_clock_monotonic_gettime(&uptime);
256 for_each_process(p) { 293 for_each_process(p) {
257 unsigned long points; 294 unsigned int points;
258 295
259 /* 296 if (oom_unkillable_task(p, mem, nodemask))
260 * skip kernel threads and tasks which have already released
261 * their mm.
262 */
263 if (!p->mm)
264 continue;
265 /* skip the init task */
266 if (is_global_init(p))
267 continue;
268 if (mem && !task_in_mem_cgroup(p, mem))
269 continue; 297 continue;
270 298
271 /* 299 /*
@@ -290,19 +318,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
290 * the process of exiting and releasing its resources. 318 * the process of exiting and releasing its resources.
291 * Otherwise we could get an easy OOM deadlock. 319 * Otherwise we could get an easy OOM deadlock.
292 */ 320 */
293 if (p->flags & PF_EXITING) { 321 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
294 if (p != current) 322 if (p != current)
295 return ERR_PTR(-1UL); 323 return ERR_PTR(-1UL);
296 324
297 chosen = p; 325 chosen = p;
298 *ppoints = ULONG_MAX; 326 *ppoints = 1000;
299 } 327 }
300 328
301 if (p->signal->oom_adj == OOM_DISABLE) 329 points = oom_badness(p, mem, nodemask, totalpages);
302 continue; 330 if (points > *ppoints) {
303
304 points = badness(p, uptime.tv_sec);
305 if (points > *ppoints || !chosen) {
306 chosen = p; 331 chosen = p;
307 *ppoints = points; 332 *ppoints = points;
308 } 333 }
@@ -313,11 +338,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
313 338
314/** 339/**
315 * dump_tasks - dump current memory state of all system tasks 340 * dump_tasks - dump current memory state of all system tasks
316 * @mem: target memory controller 341 * @mem: current's memory controller, if constrained
317 * 342 *
318 * Dumps the current memory state of all system tasks, excluding kernel threads. 343 * Dumps the current memory state of all system tasks, excluding kernel threads.
319 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 344 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
320 * score, and name. 345 * value, oom_score_adj value, and name.
321 * 346 *
322 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are 347 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
323 * shown. 348 * shown.
@@ -326,44 +351,43 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
326 */ 351 */
327static void dump_tasks(const struct mem_cgroup *mem) 352static void dump_tasks(const struct mem_cgroup *mem)
328{ 353{
329 struct task_struct *g, *p; 354 struct task_struct *p;
330 355 struct task_struct *task;
331 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
332 "name\n");
333 do_each_thread(g, p) {
334 struct mm_struct *mm;
335 356
336 if (mem && !task_in_mem_cgroup(p, mem)) 357 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
358 for_each_process(p) {
359 if (p->flags & PF_KTHREAD)
337 continue; 360 continue;
338 if (!thread_group_leader(p)) 361 if (mem && !task_in_mem_cgroup(p, mem))
339 continue; 362 continue;
340 363
341 task_lock(p); 364 task = find_lock_task_mm(p);
342 mm = p->mm; 365 if (!task) {
343 if (!mm) {
344 /* 366 /*
345 * total_vm and rss sizes do not exist for tasks with no 367 * This is a kthread or all of p's threads have already
346 * mm so there's no need to report them; they can't be 368 * detached their mm's. There's no need to report
347 * oom killed anyway. 369 * them; they can't be oom killed anyway.
348 */ 370 */
349 task_unlock(p);
350 continue; 371 continue;
351 } 372 }
352 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 373
353 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 374 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
354 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, 375 task->pid, task_uid(task), task->tgid,
355 p->comm); 376 task->mm->total_vm, get_mm_rss(task->mm),
356 task_unlock(p); 377 task_cpu(task), task->signal->oom_adj,
357 } while_each_thread(g, p); 378 task->signal->oom_score_adj, task->comm);
379 task_unlock(task);
380 }
358} 381}
359 382
360static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 383static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
361 struct mem_cgroup *mem) 384 struct mem_cgroup *mem)
362{ 385{
363 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
364 "oom_adj=%d\n",
365 current->comm, gfp_mask, order, current->signal->oom_adj);
366 task_lock(current); 386 task_lock(current);
387 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
388 "oom_adj=%d, oom_score_adj=%d\n",
389 current->comm, gfp_mask, order, current->signal->oom_adj,
390 current->signal->oom_score_adj);
367 cpuset_print_task_mems_allowed(current); 391 cpuset_print_task_mems_allowed(current);
368 task_unlock(current); 392 task_unlock(current);
369 dump_stack(); 393 dump_stack();
@@ -374,72 +398,42 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
374} 398}
375 399
376#define K(x) ((x) << (PAGE_SHIFT-10)) 400#define K(x) ((x) << (PAGE_SHIFT-10))
377 401static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
378/*
379 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
380 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
381 * set.
382 */
383static void __oom_kill_task(struct task_struct *p, int verbose)
384{ 402{
385 if (is_global_init(p)) { 403 p = find_lock_task_mm(p);
386 WARN_ON(1); 404 if (!p)
387 printk(KERN_WARNING "tried to kill init!\n"); 405 return 1;
388 return;
389 }
390
391 task_lock(p);
392 if (!p->mm) {
393 WARN_ON(1);
394 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
395 task_pid_nr(p), p->comm);
396 task_unlock(p);
397 return;
398 }
399 406
400 if (verbose) 407 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
401 printk(KERN_ERR "Killed process %d (%s) " 408 task_pid_nr(p), p->comm, K(p->mm->total_vm),
402 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 409 K(get_mm_counter(p->mm, MM_ANONPAGES)),
403 task_pid_nr(p), p->comm, 410 K(get_mm_counter(p->mm, MM_FILEPAGES)));
404 K(p->mm->total_vm),
405 K(get_mm_counter(p->mm, MM_ANONPAGES)),
406 K(get_mm_counter(p->mm, MM_FILEPAGES)));
407 task_unlock(p); 411 task_unlock(p);
408 412
413
414 set_tsk_thread_flag(p, TIF_MEMDIE);
415 force_sig(SIGKILL, p);
416
409 /* 417 /*
410 * We give our sacrificial lamb high priority and access to 418 * We give our sacrificial lamb high priority and access to
411 * all the memory it needs. That way it should be able to 419 * all the memory it needs. That way it should be able to
412 * exit() and clear out its resources quickly... 420 * exit() and clear out its resources quickly...
413 */ 421 */
414 p->rt.time_slice = HZ; 422 boost_dying_task_prio(p, mem);
415 set_tsk_thread_flag(p, TIF_MEMDIE);
416
417 force_sig(SIGKILL, p);
418}
419
420static int oom_kill_task(struct task_struct *p)
421{
422 /* WARNING: mm may not be dereferenced since we did not obtain its
423 * value from get_task_mm(p). This is OK since all we need to do is
424 * compare mm to q->mm below.
425 *
426 * Furthermore, even if mm contains a non-NULL value, p->mm may
427 * change to NULL at any time since we do not hold task_lock(p).
428 * However, this is of no concern to us.
429 */
430 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
431 return 1;
432
433 __oom_kill_task(p, 1);
434 423
435 return 0; 424 return 0;
436} 425}
426#undef K
437 427
438static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 428static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
439 unsigned long points, struct mem_cgroup *mem, 429 unsigned int points, unsigned long totalpages,
430 struct mem_cgroup *mem, nodemask_t *nodemask,
440 const char *message) 431 const char *message)
441{ 432{
442 struct task_struct *c; 433 struct task_struct *victim = p;
434 struct task_struct *child;
435 struct task_struct *t = p;
436 unsigned int victim_points = 0;
443 437
444 if (printk_ratelimit()) 438 if (printk_ratelimit())
445 dump_header(p, gfp_mask, order, mem); 439 dump_header(p, gfp_mask, order, mem);
@@ -449,40 +443,81 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 * its children or threads, just set TIF_MEMDIE so it can die quickly 443 * its children or threads, just set TIF_MEMDIE so it can die quickly
450 */ 444 */
451 if (p->flags & PF_EXITING) { 445 if (p->flags & PF_EXITING) {
452 __oom_kill_task(p, 0); 446 set_tsk_thread_flag(p, TIF_MEMDIE);
447 boost_dying_task_prio(p, mem);
453 return 0; 448 return 0;
454 } 449 }
455 450
456 printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", 451 task_lock(p);
457 message, task_pid_nr(p), p->comm, points); 452 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
453 message, task_pid_nr(p), p->comm, points);
454 task_unlock(p);
458 455
459 /* Try to kill a child first */ 456 /*
460 list_for_each_entry(c, &p->children, sibling) { 457 * If any of p's children has a different mm and is eligible for kill,
461 if (c->mm == p->mm) 458 * the one with the highest badness() score is sacrificed for its
462 continue; 459 * parent. This attempts to lose the minimal amount of work done while
463 if (mem && !task_in_mem_cgroup(c, mem)) 460 * still freeing memory.
464 continue; 461 */
465 if (!oom_kill_task(c)) 462 do {
466 return 0; 463 list_for_each_entry(child, &t->children, sibling) {
464 unsigned int child_points;
465
466 /*
467 * oom_badness() returns 0 if the thread is unkillable
468 */
469 child_points = oom_badness(child, mem, nodemask,
470 totalpages);
471 if (child_points > victim_points) {
472 victim = child;
473 victim_points = child_points;
474 }
475 }
476 } while_each_thread(p, t);
477
478 return oom_kill_task(victim, mem);
479}
480
481/*
482 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
483 */
484static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
485 int order)
486{
487 if (likely(!sysctl_panic_on_oom))
488 return;
489 if (sysctl_panic_on_oom != 2) {
490 /*
491 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
492 * does not panic for cpuset, mempolicy, or memcg allocation
493 * failures.
494 */
495 if (constraint != CONSTRAINT_NONE)
496 return;
467 } 497 }
468 return oom_kill_task(p); 498 read_lock(&tasklist_lock);
499 dump_header(NULL, gfp_mask, order, NULL);
500 read_unlock(&tasklist_lock);
501 panic("Out of memory: %s panic_on_oom is enabled\n",
502 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
469} 503}
470 504
471#ifdef CONFIG_CGROUP_MEM_RES_CTLR 505#ifdef CONFIG_CGROUP_MEM_RES_CTLR
472void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 506void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473{ 507{
474 unsigned long points = 0; 508 unsigned long limit;
509 unsigned int points = 0;
475 struct task_struct *p; 510 struct task_struct *p;
476 511
477 if (sysctl_panic_on_oom == 2) 512 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
478 panic("out of memory(memcg). panic_on_oom is selected.\n"); 513 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
479 read_lock(&tasklist_lock); 514 read_lock(&tasklist_lock);
480retry: 515retry:
481 p = select_bad_process(&points, mem); 516 p = select_bad_process(&points, limit, mem, NULL);
482 if (!p || PTR_ERR(p) == -1UL) 517 if (!p || PTR_ERR(p) == -1UL)
483 goto out; 518 goto out;
484 519
485 if (oom_kill_process(p, gfp_mask, 0, points, mem, 520 if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
486 "Memory cgroup out of memory")) 521 "Memory cgroup out of memory"))
487 goto retry; 522 goto retry;
488out: 523out:
@@ -509,7 +544,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
509 * if a parallel OOM killing is already taking place that includes a zone in 544 * if a parallel OOM killing is already taking place that includes a zone in
510 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 545 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
511 */ 546 */
512int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) 547int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
513{ 548{
514 struct zoneref *z; 549 struct zoneref *z;
515 struct zone *zone; 550 struct zone *zone;
@@ -526,7 +561,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
526 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 561 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
527 /* 562 /*
528 * Lock each zone in the zonelist under zone_scan_lock so a 563 * Lock each zone in the zonelist under zone_scan_lock so a
529 * parallel invocation of try_set_zone_oom() doesn't succeed 564 * parallel invocation of try_set_zonelist_oom() doesn't succeed
530 * when it shouldn't. 565 * when it shouldn't.
531 */ 566 */
532 zone_set_flag(zone, ZONE_OOM_LOCKED); 567 zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -555,65 +590,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
555} 590}
556 591
557/* 592/*
558 * Must be called with tasklist_lock held for read. 593 * Try to acquire the oom killer lock for all system zones. Returns zero if a
594 * parallel oom killing is taking place, otherwise locks all zones and returns
595 * non-zero.
559 */ 596 */
560static void __out_of_memory(gfp_t gfp_mask, int order) 597static int try_set_system_oom(void)
561{ 598{
562 struct task_struct *p; 599 struct zone *zone;
563 unsigned long points; 600 int ret = 1;
564
565 if (sysctl_oom_kill_allocating_task)
566 if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
567 "Out of memory (oom_kill_allocating_task)"))
568 return;
569retry:
570 /*
571 * Rambo mode: Shoot down a process and hope it solves whatever
572 * issues we may have.
573 */
574 p = select_bad_process(&points, NULL);
575
576 if (PTR_ERR(p) == -1UL)
577 return;
578
579 /* Found nothing?!?! Either we hang forever, or we panic. */
580 if (!p) {
581 read_unlock(&tasklist_lock);
582 dump_header(NULL, gfp_mask, order, NULL);
583 panic("Out of memory and no killable processes...\n");
584 }
585 601
586 if (oom_kill_process(p, gfp_mask, order, points, NULL, 602 spin_lock(&zone_scan_lock);
587 "Out of memory")) 603 for_each_populated_zone(zone)
588 goto retry; 604 if (zone_is_oom_locked(zone)) {
605 ret = 0;
606 goto out;
607 }
608 for_each_populated_zone(zone)
609 zone_set_flag(zone, ZONE_OOM_LOCKED);
610out:
611 spin_unlock(&zone_scan_lock);
612 return ret;
589} 613}
590 614
591/* 615/*
592 * pagefault handler calls into here because it is out of memory but 616 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
593 * doesn't know exactly how or why. 617 * attempts or page faults may now recall the oom killer, if necessary.
594 */ 618 */
595void pagefault_out_of_memory(void) 619static void clear_system_oom(void)
596{ 620{
597 unsigned long freed = 0; 621 struct zone *zone;
598
599 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
600 if (freed > 0)
601 /* Got some memory back in the last second. */
602 return;
603
604 if (sysctl_panic_on_oom)
605 panic("out of memory from page fault. panic_on_oom is selected.\n");
606
607 read_lock(&tasklist_lock);
608 __out_of_memory(0, 0); /* unknown gfp_mask and order */
609 read_unlock(&tasklist_lock);
610 622
611 /* 623 spin_lock(&zone_scan_lock);
612 * Give "p" a good chance of killing itself before we 624 for_each_populated_zone(zone)
613 * retry to allocate memory. 625 zone_clear_flag(zone, ZONE_OOM_LOCKED);
614 */ 626 spin_unlock(&zone_scan_lock);
615 if (!test_thread_flag(TIF_MEMDIE))
616 schedule_timeout_uninterruptible(1);
617} 627}
618 628
619/** 629/**
@@ -621,6 +631,7 @@ void pagefault_out_of_memory(void)
621 * @zonelist: zonelist pointer 631 * @zonelist: zonelist pointer
622 * @gfp_mask: memory allocation flags 632 * @gfp_mask: memory allocation flags
623 * @order: amount of memory being requested as a power of 2 633 * @order: amount of memory being requested as a power of 2
634 * @nodemask: nodemask passed to page allocator
624 * 635 *
625 * If we run out of memory, we have the choice between either 636 * If we run out of memory, we have the choice between either
626 * killing a random task (bad), letting the system crash (worse) 637 * killing a random task (bad), letting the system crash (worse)
@@ -630,49 +641,93 @@ void pagefault_out_of_memory(void)
630void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 641void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
631 int order, nodemask_t *nodemask) 642 int order, nodemask_t *nodemask)
632{ 643{
644 struct task_struct *p;
645 unsigned long totalpages;
633 unsigned long freed = 0; 646 unsigned long freed = 0;
634 enum oom_constraint constraint; 647 unsigned int points;
648 enum oom_constraint constraint = CONSTRAINT_NONE;
649 int killed = 0;
635 650
636 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 651 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
637 if (freed > 0) 652 if (freed > 0)
638 /* Got some memory back in the last second. */ 653 /* Got some memory back in the last second. */
639 return; 654 return;
640 655
641 if (sysctl_panic_on_oom == 2) { 656 /*
642 dump_header(NULL, gfp_mask, order, NULL); 657 * If current has a pending SIGKILL, then automatically select it. The
643 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 658 * goal is to allow it to allocate so that it may quickly exit and free
659 * its memory.
660 */
661 if (fatal_signal_pending(current)) {
662 set_thread_flag(TIF_MEMDIE);
663 boost_dying_task_prio(current, NULL);
664 return;
644 } 665 }
645 666
646 /* 667 /*
647 * Check if there were limitations on the allocation (only relevant for 668 * Check if there were limitations on the allocation (only relevant for
648 * NUMA) that may require different handling. 669 * NUMA) that may require different handling.
649 */ 670 */
650 constraint = constrained_alloc(zonelist, gfp_mask, nodemask); 671 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
672 &totalpages);
673 check_panic_on_oom(constraint, gfp_mask, order);
674
651 read_lock(&tasklist_lock); 675 read_lock(&tasklist_lock);
676 if (sysctl_oom_kill_allocating_task &&
677 !oom_unkillable_task(current, NULL, nodemask) &&
678 (current->signal->oom_adj != OOM_DISABLE)) {
679 /*
680 * oom_kill_process() needs tasklist_lock held. If it returns
681 * non-zero, current could not be killed so we must fallback to
682 * the tasklist scan.
683 */
684 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
685 NULL, nodemask,
686 "Out of memory (oom_kill_allocating_task)"))
687 goto out;
688 }
652 689
653 switch (constraint) { 690retry:
654 case CONSTRAINT_MEMORY_POLICY: 691 p = select_bad_process(&points, totalpages, NULL,
655 oom_kill_process(current, gfp_mask, order, 0, NULL, 692 constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
656 "No available memory (MPOL_BIND)"); 693 NULL);
657 break; 694 if (PTR_ERR(p) == -1UL)
695 goto out;
658 696
659 case CONSTRAINT_NONE: 697 /* Found nothing?!?! Either we hang forever, or we panic. */
660 if (sysctl_panic_on_oom) { 698 if (!p) {
661 dump_header(NULL, gfp_mask, order, NULL); 699 dump_header(NULL, gfp_mask, order, NULL);
662 panic("out of memory. panic_on_oom is selected\n"); 700 read_unlock(&tasklist_lock);
663 } 701 panic("Out of memory and no killable processes...\n");
664 /* Fall-through */
665 case CONSTRAINT_CPUSET:
666 __out_of_memory(gfp_mask, order);
667 break;
668 } 702 }
669 703
704 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
705 nodemask, "Out of memory"))
706 goto retry;
707 killed = 1;
708out:
670 read_unlock(&tasklist_lock); 709 read_unlock(&tasklist_lock);
671 710
672 /* 711 /*
673 * Give "p" a good chance of killing itself before we 712 * Give "p" a good chance of killing itself before we
674 * retry to allocate memory unless "p" is current 713 * retry to allocate memory unless "p" is current
675 */ 714 */
715 if (killed && !test_thread_flag(TIF_MEMDIE))
716 schedule_timeout_uninterruptible(1);
717}
718
719/*
720 * The pagefault handler calls here because it is out of memory, so kill a
721 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
722 * oom killing is already in progress so do nothing. If a task is found with
723 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
724 */
725void pagefault_out_of_memory(void)
726{
727 if (try_set_system_oom()) {
728 out_of_memory(NULL, 0, 0, NULL);
729 clear_system_oom();
730 }
676 if (!test_thread_flag(TIF_MEMDIE)) 731 if (!test_thread_flag(TIF_MEMDIE))
677 schedule_timeout_uninterruptible(1); 732 schedule_timeout_uninterruptible(1);
678} 733}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..c09ef5219cbe 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> 35#include <linux/buffer_head.h>
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <trace/events/writeback.h>
37 38
38/* 39/*
39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
252 } 253 }
253} 254}
254 255
255/*
256 * Clip the earned share of dirty pages to that which is actually available.
257 * This avoids exceeding the total dirty_limit when the floating averages
258 * fluctuate too quickly.
259 */
260static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
261 unsigned long dirty, unsigned long *pbdi_dirty)
262{
263 unsigned long avail_dirty;
264
265 avail_dirty = global_page_state(NR_FILE_DIRTY) +
266 global_page_state(NR_WRITEBACK) +
267 global_page_state(NR_UNSTABLE_NFS) +
268 global_page_state(NR_WRITEBACK_TEMP);
269
270 if (avail_dirty < dirty)
271 avail_dirty = dirty - avail_dirty;
272 else
273 avail_dirty = 0;
274
275 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
276 bdi_stat(bdi, BDI_WRITEBACK);
277
278 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
279}
280
281static inline void task_dirties_fraction(struct task_struct *tsk, 256static inline void task_dirties_fraction(struct task_struct *tsk,
282 long *numerator, long *denominator) 257 long *numerator, long *denominator)
283{ 258{
@@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
286} 261}
287 262
288/* 263/*
289 * scale the dirty limit 264 * task_dirty_limit - scale down dirty throttling threshold for one task
290 * 265 *
291 * task specific dirty limit: 266 * task specific dirty limit:
292 * 267 *
293 * dirty -= (dirty/8) * p_{t} 268 * dirty -= (dirty/8) * p_{t}
269 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled.
294 */ 276 */
295static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) 277static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty)
296{ 279{
297 long numerator, denominator; 280 long numerator, denominator;
298 unsigned long dirty = *pdirty; 281 unsigned long dirty = bdi_dirty;
299 u64 inv = dirty >> 3; 282 u64 inv = dirty >> 3;
300 283
301 task_dirties_fraction(tsk, &numerator, &denominator); 284 task_dirties_fraction(tsk, &numerator, &denominator);
@@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303 do_div(inv, denominator); 286 do_div(inv, denominator);
304 287
305 dirty -= inv; 288 dirty -= inv;
306 if (dirty < *pdirty/2)
307 dirty = *pdirty/2;
308 289
309 *pdirty = dirty; 290 return max(dirty, bdi_dirty/2);
310} 291}
311 292
312/* 293/*
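A numerical illustration of the per-task scaling, with an invented dirtying fraction: bdi_dirty = 800 pages and a task responsible for half of the recent dirtying (numerator/denominator = 1/2):

	/* Illustrative arithmetic only -- follows task_dirty_limit() above. */
	inv    = 800 >> 3;		/* 100: the 12.5% window being redistributed */
	inv    = inv * 1 / 2;		/* 50: this task's share of that window */
	dirty  = 800 - 50;		/* 750 */
	return max(dirty, 800 / 2);	/* 750: never below half the bdi threshold */

So the heaviest dirtier gives up at most one eighth of the bdi threshold, while light dirtiers keep close to the full value.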
@@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
416 return x + 1; /* Ensure that we never return 0 */ 397 return x + 1; /* Ensure that we never return 0 */
417} 398}
418 399
419void 400/*
420get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, 401 * global_dirty_limits - background-writeback and dirty-throttling thresholds
421 unsigned long *pbdi_dirty, struct backing_dev_info *bdi) 402 *
403 * Calculate the dirty thresholds based on sysctl parameters
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
 407 * real-time tasks.
408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
422{ 410{
423 unsigned long background; 411 unsigned long background;
424 unsigned long dirty; 412 unsigned long dirty;
@@ -450,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
450 } 438 }
451 *pbackground = background; 439 *pbackground = background;
452 *pdirty = dirty; 440 *pdirty = dirty;
441}
453 442
454 if (bdi) { 443/*
455 u64 bdi_dirty; 444 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
456 long numerator, denominator; 445 *
446 * Allocate high/low dirty limits to fast/slow devices, in order to prevent
447 * - starving fast devices
448 * - piling up dirty pages (that will take long time to sync) on slow devices
449 *
450 * The bdi's share of dirty limit will be adapting to its throughput and
451 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
452 */
453unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
454{
455 u64 bdi_dirty;
456 long numerator, denominator;
457 457
458 /* 458 /*
459 * Calculate this BDI's share of the dirty ratio. 459 * Calculate this BDI's share of the dirty ratio.
460 */ 460 */
461 bdi_writeout_fraction(bdi, &numerator, &denominator); 461 bdi_writeout_fraction(bdi, &numerator, &denominator);
462 462
463 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; 463 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
464 bdi_dirty *= numerator; 464 bdi_dirty *= numerator;
465 do_div(bdi_dirty, denominator); 465 do_div(bdi_dirty, denominator);
466 bdi_dirty += (dirty * bdi->min_ratio) / 100; 466
467 if (bdi_dirty > (dirty * bdi->max_ratio) / 100) 467 bdi_dirty += (dirty * bdi->min_ratio) / 100;
468 bdi_dirty = dirty * bdi->max_ratio / 100; 468 if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
469 469 bdi_dirty = dirty * bdi->max_ratio / 100;
470 *pbdi_dirty = bdi_dirty; 470
471 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); 471 return bdi_dirty;
472 task_dirty_limit(current, pbdi_dirty);
473 }
474} 472}
475 473
476/* 474/*
@@ -490,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
490 unsigned long bdi_thresh; 488 unsigned long bdi_thresh;
491 unsigned long pages_written = 0; 489 unsigned long pages_written = 0;
492 unsigned long pause = 1; 490 unsigned long pause = 1;
493 491 bool dirty_exceeded = false;
494 struct backing_dev_info *bdi = mapping->backing_dev_info; 492 struct backing_dev_info *bdi = mapping->backing_dev_info;
495 493
496 for (;;) { 494 for (;;) {
@@ -501,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
501 .range_cyclic = 1, 499 .range_cyclic = 1,
502 }; 500 };
503 501
504 get_dirty_limits(&background_thresh, &dirty_thresh,
505 &bdi_thresh, bdi);
506
507 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 502 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
508 global_page_state(NR_UNSTABLE_NFS); 503 global_page_state(NR_UNSTABLE_NFS);
509 nr_writeback = global_page_state(NR_WRITEBACK); 504 nr_writeback = global_page_state(NR_WRITEBACK);
510 505
511 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 506 global_dirty_limits(&background_thresh, &dirty_thresh);
512 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
513
514 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
515 break;
516 507
517 /* 508 /*
518 * Throttle it only when the background writeback cannot 509 * Throttle it only when the background writeback cannot
@@ -523,24 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
523 (background_thresh + dirty_thresh) / 2) 514 (background_thresh + dirty_thresh) / 2)
524 break; 515 break;
525 516
526 if (!bdi->dirty_exceeded) 517 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
527 bdi->dirty_exceeded = 1; 518 bdi_thresh = task_dirty_limit(current, bdi_thresh);
528
529 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
530 * Unstable writes are a feature of certain networked
531 * filesystems (i.e. NFS) in which data may have been
532 * written to the server's write cache, but has not yet
533 * been flushed to permanent storage.
534 * Only move pages to writeback if this bdi is over its
535 * threshold otherwise wait until the disk writes catch
536 * up.
537 */
538 if (bdi_nr_reclaimable > bdi_thresh) {
539 writeback_inodes_wb(&bdi->wb, &wbc);
540 pages_written += write_chunk - wbc.nr_to_write;
541 get_dirty_limits(&background_thresh, &dirty_thresh,
542 &bdi_thresh, bdi);
543 }
544 519
545 /* 520 /*
546 * In order to avoid the stacked BDI deadlock we need 521 * In order to avoid the stacked BDI deadlock we need
@@ -555,16 +530,45 @@ static void balance_dirty_pages(struct address_space *mapping,
555 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 530 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
556 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 531 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
557 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 532 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
558 } else if (bdi_nr_reclaimable) { 533 } else {
559 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 534 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
560 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 535 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
561 } 536 }
562 537
563 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) 538 /*
 539 * The bdi thresh is somewhat of a "soft" limit, derived from the
 540 * global "hard" limit. The former helps to prevent a heavy-IO
 541 * bdi or process from holding back light ones; the latter is
 542 * the last-resort safeguard.
543 */
544 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh);
547
548 if (!dirty_exceeded)
564 break; 549 break;
565 if (pages_written >= write_chunk)
566 break; /* We've done our duty */
567 550
551 if (!bdi->dirty_exceeded)
552 bdi->dirty_exceeded = 1;
553
554 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
555 * Unstable writes are a feature of certain networked
556 * filesystems (i.e. NFS) in which data may have been
557 * written to the server's write cache, but has not yet
558 * been flushed to permanent storage.
559 * Only move pages to writeback if this bdi is over its
560 * threshold otherwise wait until the disk writes catch
561 * up.
562 */
563 trace_wbc_balance_dirty_start(&wbc, bdi);
564 if (bdi_nr_reclaimable > bdi_thresh) {
565 writeback_inodes_wb(&bdi->wb, &wbc);
566 pages_written += write_chunk - wbc.nr_to_write;
567 trace_wbc_balance_dirty_written(&wbc, bdi);
568 if (pages_written >= write_chunk)
569 break; /* We've done our duty */
570 }
571 trace_wbc_balance_dirty_wait(&wbc, bdi);
568 __set_current_state(TASK_INTERRUPTIBLE); 572 __set_current_state(TASK_INTERRUPTIBLE);
569 io_schedule_timeout(pause); 573 io_schedule_timeout(pause);
570 574
@@ -577,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
577 pause = HZ / 10; 581 pause = HZ / 10;
578 } 582 }
579 583
580 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 584 if (!dirty_exceeded && bdi->dirty_exceeded)
581 bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 585 bdi->dirty_exceeded = 0;
583 586
584 if (writeback_in_progress(bdi)) 587 if (writeback_in_progress(bdi))
@@ -593,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
593 * background_thresh, to keep the amount of dirty memory low. 596 * background_thresh, to keep the amount of dirty memory low.
594 */ 597 */
595 if ((laptop_mode && pages_written) || 598 if ((laptop_mode && pages_written) ||
596 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 599 (!laptop_mode && (nr_reclaimable > background_thresh)))
597 + global_page_state(NR_UNSTABLE_NFS))
598 > background_thresh)))
599 bdi_start_background_writeback(bdi); 600 bdi_start_background_writeback(bdi);
600} 601}
601 602
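
The reworked loop above now recomputes the limits with global_dirty_limits()/bdi_dirty_limit() on each pass and keeps throttling while the task is over either the per-bdi "soft" threshold or the global "hard" one. A rough userspace sketch of that decision, assuming a simplified bdi_dirty_limit() in which a fixed fraction stands in for the kernel's measured per-bdi writeout share:

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the new bdi_dirty_limit(): take this bdi's share of
 * the global limit (a plain fraction stands in for the kernel's measured
 * writeout fraction), then apply the min/max ratio floor and cap.
 */
static unsigned long bdi_dirty_limit(unsigned long dirty,
                                     unsigned long share_num, unsigned long share_den,
                                     unsigned int min_ratio, unsigned int max_ratio)
{
        unsigned long bdi_dirty = dirty * share_num / share_den;

        bdi_dirty += dirty * min_ratio / 100;
        if (bdi_dirty > dirty * max_ratio / 100)
                bdi_dirty = dirty * max_ratio / 100;
        return bdi_dirty;
}

/* Keep throttling while over the per-bdi soft limit or the global hard limit. */
static bool dirty_exceeded(unsigned long bdi_dirty, unsigned long bdi_thresh,
                           unsigned long global_dirty, unsigned long dirty_thresh)
{
        return bdi_dirty >= bdi_thresh || global_dirty >= dirty_thresh;
}

int main(void)
{
        unsigned long dirty_thresh = 10000;     /* global "hard" limit, in pages */
        unsigned long bdi_thresh = bdi_dirty_limit(dirty_thresh, 1, 4, 5, 20);

        printf("bdi_thresh = %lu\n", bdi_thresh);
        printf("throttle?  = %d\n",
               dirty_exceeded(2500, bdi_thresh, 6000, dirty_thresh));
        return 0;
}
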
@@ -659,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
659 unsigned long dirty_thresh; 660 unsigned long dirty_thresh;
660 661
661 for ( ; ; ) { 662 for ( ; ; ) {
662 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 663 global_dirty_limits(&background_thresh, &dirty_thresh);
663 664
664 /* 665 /*
665 * Boost the allowable dirty threshold a bit for page 666 * Boost the allowable dirty threshold a bit for page
@@ -805,6 +806,42 @@ void __init page_writeback_init(void)
805} 806}
806 807
807/** 808/**
809 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
810 * @mapping: address space structure to write
811 * @start: starting page index
812 * @end: ending page index (inclusive)
813 *
814 * This function scans the page range from @start to @end (inclusive) and tags
815 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
816 * that write_cache_pages (or whoever calls this function) will then use
817 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
818 * used to avoid livelocking of writeback by a process steadily creating new
819 * dirty pages in the file (thus it is important for this function to be quick
820 * so that it can tag pages faster than a dirtying process can create them).
821 */
822/*
823 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
824 */
825void tag_pages_for_writeback(struct address_space *mapping,
826 pgoff_t start, pgoff_t end)
827{
828#define WRITEBACK_TAG_BATCH 4096
829 unsigned long tagged;
830
831 do {
832 spin_lock_irq(&mapping->tree_lock);
833 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
834 &start, end, WRITEBACK_TAG_BATCH,
835 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
836 spin_unlock_irq(&mapping->tree_lock);
837 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
838 cond_resched();
839 /* We check 'start' to handle wrapping when end == ~0UL */
840 } while (tagged >= WRITEBACK_TAG_BATCH && start);
841}
842EXPORT_SYMBOL(tag_pages_for_writeback);
843
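
As the comment above says, tag_pages_for_writeback() works in WRITEBACK_TAG_BATCH chunks so that mapping->tree_lock is dropped and cond_resched() runs between batches. The same cursor-and-batch pattern, reduced to a hedged userspace sketch over a flat array of per-page flag words (the radix tree, its tags and the locking are only stand-ins here):

#include <stdio.h>

#define NPAGES    20
#define TAG_BATCH 4            /* stand-in for WRITEBACK_TAG_BATCH */
#define DIRTY     0x1
#define TOWRITE   0x2

static unsigned int flags[NPAGES];

/* Tag up to 'batch' DIRTY pages in [*start, end] with TOWRITE; advance *start. */
static unsigned long tag_batch(unsigned long *start, unsigned long end,
                               unsigned long batch)
{
        unsigned long tagged = 0;

        /* the kernel holds mapping->tree_lock around this loop */
        while (*start <= end && tagged < batch) {
                if (flags[*start] & DIRTY) {
                        flags[*start] |= TOWRITE;
                        tagged++;
                }
                (*start)++;
        }
        return tagged;
}

int main(void)
{
        unsigned long start = 0, end = NPAGES - 1, tagged;

        for (int i = 0; i < NPAGES; i += 3)
                flags[i] |= DIRTY;              /* some dirty pages */

        do {
                tagged = tag_batch(&start, end, TAG_BATCH);
                /* lock dropped here; the kernel also calls cond_resched() */
                printf("tagged %lu, cursor now %lu\n", tagged, start);
        } while (tagged >= TAG_BATCH && start); /* 'start' guards the ~0UL wrap */

        return 0;
}
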
844/**
808 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 845 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
809 * @mapping: address space structure to write 846 * @mapping: address space structure to write
810 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 847 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +855,13 @@ void __init page_writeback_init(void)
818 * the call was made get new I/O started against them. If wbc->sync_mode is 855 * the call was made get new I/O started against them. If wbc->sync_mode is
819 * WB_SYNC_ALL then we were called for data integrity and we must wait for 856 * WB_SYNC_ALL then we were called for data integrity and we must wait for
820 * existing IO to complete. 857 * existing IO to complete.
858 *
859 * To avoid livelocks (when other process dirties new pages), we first tag
860 * pages which should be written back with TOWRITE tag and only then start
861 * writing them. For data-integrity sync we have to be careful so that we do
862 * not miss some pages (e.g., because some other process has cleared TOWRITE
863 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
864 * by the process clearing the DIRTY tag (and submitting the page for IO).
821 */ 865 */
822int write_cache_pages(struct address_space *mapping, 866int write_cache_pages(struct address_space *mapping,
823 struct writeback_control *wbc, writepage_t writepage, 867 struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +877,7 @@ int write_cache_pages(struct address_space *mapping,
833 pgoff_t done_index; 877 pgoff_t done_index;
834 int cycled; 878 int cycled;
835 int range_whole = 0; 879 int range_whole = 0;
880 int tag;
836 881
837 pagevec_init(&pvec, 0); 882 pagevec_init(&pvec, 0);
838 if (wbc->range_cyclic) { 883 if (wbc->range_cyclic) {
@@ -849,29 +894,19 @@ int write_cache_pages(struct address_space *mapping,
849 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 894 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
850 range_whole = 1; 895 range_whole = 1;
851 cycled = 1; /* ignore range_cyclic tests */ 896 cycled = 1; /* ignore range_cyclic tests */
852
853 /*
854 * If this is a data integrity sync, cap the writeback to the
855 * current end of file. Any extension to the file that occurs
856 * after this is a new write and we don't need to write those
857 * pages out to fulfil our data integrity requirements. If we
858 * try to write them out, we can get stuck in this scan until
859 * the concurrent writer stops adding dirty pages and extending
860 * EOF.
861 */
862 if (wbc->sync_mode == WB_SYNC_ALL &&
863 wbc->range_end == LLONG_MAX) {
864 end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
865 }
866 } 897 }
867 898 if (wbc->sync_mode == WB_SYNC_ALL)
899 tag = PAGECACHE_TAG_TOWRITE;
900 else
901 tag = PAGECACHE_TAG_DIRTY;
868retry: 902retry:
903 if (wbc->sync_mode == WB_SYNC_ALL)
904 tag_pages_for_writeback(mapping, index, end);
869 done_index = index; 905 done_index = index;
870 while (!done && (index <= end)) { 906 while (!done && (index <= end)) {
871 int i; 907 int i;
872 908
873 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 909 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
874 PAGECACHE_TAG_DIRTY,
875 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 910 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
876 if (nr_pages == 0) 911 if (nr_pages == 0)
877 break; 912 break;
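
The retry path above now tags the whole range first (for WB_SYNC_ALL) and then looks pages up by the TOWRITE tag, so pages dirtied after the sync started are never tagged and cannot livelock the walk. A minimal model of the two phases, again with plain flag words standing in for page-cache tags:

#include <stdio.h>

#define NPAGES  8
#define DIRTY   0x1
#define TOWRITE 0x2

static unsigned int flags[NPAGES];

static void tag_pages_for_writeback(void)
{
        for (int i = 0; i < NPAGES; i++)
                if (flags[i] & DIRTY)
                        flags[i] |= TOWRITE;
}

static void write_cache_pages_sync(void)
{
        for (int i = 0; i < NPAGES; i++) {
                if (!(flags[i] & TOWRITE))
                        continue;
                /*
                 * clear_page_dirty_for_io() + writepage(): only the writer
                 * clears TOWRITE, together with DIRTY.
                 */
                flags[i] &= ~(DIRTY | TOWRITE);
                printf("wrote page %d\n", i);
        }
}

int main(void)
{
        flags[1] = flags[3] = DIRTY;

        tag_pages_for_writeback();   /* phase 1: snapshot the dirty set */
        flags[5] = DIRTY;            /* concurrent dirtier: never tagged, so the
                                        sync below terminates without it */
        write_cache_pages_sync();    /* phase 2: write only TOWRITE pages */
        return 0;
}
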
@@ -929,6 +964,7 @@ continue_unlock:
929 if (!clear_page_dirty_for_io(page)) 964 if (!clear_page_dirty_for_io(page))
930 goto continue_unlock; 965 goto continue_unlock;
931 966
967 trace_wbc_writepage(wbc, mapping->backing_dev_info);
932 ret = (*writepage)(page, wbc, data); 968 ret = (*writepage)(page, wbc, data);
933 if (unlikely(ret)) { 969 if (unlikely(ret)) {
934 if (ret == AOP_WRITEPAGE_ACTIVATE) { 970 if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -1327,6 +1363,9 @@ int test_set_page_writeback(struct page *page)
1327 radix_tree_tag_clear(&mapping->page_tree, 1363 radix_tree_tag_clear(&mapping->page_tree,
1328 page_index(page), 1364 page_index(page),
1329 PAGECACHE_TAG_DIRTY); 1365 PAGECACHE_TAG_DIRTY);
1366 radix_tree_tag_clear(&mapping->page_tree,
1367 page_index(page),
1368 PAGECACHE_TAG_TOWRITE);
1330 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1369 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1331 } else { 1370 } else {
1332 ret = TestSetPageWriteback(page); 1371 ret = TestSetPageWriteback(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339eb04c6..a9649f4b261e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1738 struct page *page; 1738 struct page *page;
1739 1739
1740 /* Acquire the OOM killer lock for the zones in zonelist */ 1740 /* Acquire the OOM killer lock for the zones in zonelist */
1741 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1741 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
1742 schedule_timeout_uninterruptible(1); 1742 schedule_timeout_uninterruptible(1);
1743 return NULL; 1743 return NULL;
1744 } 1744 }
@@ -1759,6 +1759,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1759 /* The OOM killer will not help higher order allocs */ 1759 /* The OOM killer will not help higher order allocs */
1760 if (order > PAGE_ALLOC_COSTLY_ORDER) 1760 if (order > PAGE_ALLOC_COSTLY_ORDER)
1761 goto out; 1761 goto out;
1762 /* The OOM killer does not needlessly kill tasks for lowmem */
1763 if (high_zoneidx < ZONE_NORMAL)
1764 goto out;
1762 /* 1765 /*
1763 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 1766 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1764 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 1767 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2052,15 +2055,23 @@ rebalance:
2052 if (page) 2055 if (page)
2053 goto got_pg; 2056 goto got_pg;
2054 2057
2055 /* 2058 if (!(gfp_mask & __GFP_NOFAIL)) {
2056 * The OOM killer does not trigger for high-order 2059 /*
2057 * ~__GFP_NOFAIL allocations so if no progress is being 2060 * The oom killer is not called for high-order
2058 * made, there are no other options and retrying is 2061 * allocations that may fail, so if no progress
2059 * unlikely to help. 2062 * is being made, there are no other options and
2060 */ 2063 * retrying is unlikely to help.
2061 if (order > PAGE_ALLOC_COSTLY_ORDER && 2064 */
2062 !(gfp_mask & __GFP_NOFAIL)) 2065 if (order > PAGE_ALLOC_COSTLY_ORDER)
2063 goto nopage; 2066 goto nopage;
2067 /*
2068 * The oom killer is not called for lowmem
2069 * allocations to prevent needlessly killing
2070 * innocent tasks.
2071 */
2072 if (high_zoneidx < ZONE_NORMAL)
2073 goto nopage;
2074 }
2064 2075
2065 goto restart; 2076 goto restart;
2066 } 2077 }
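
Both hunks add the same gate in two places: falling back to the OOM killer only makes sense for requests that killing a task could actually satisfy, so costly high-order allocations and lowmem (below ZONE_NORMAL) allocations bail out instead. A compact sketch of that policy, with a simplified zone enum and an assumed PAGE_ALLOC_COSTLY_ORDER of 3:

#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM };

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Should a failed allocation fall back to the OOM killer? */
static bool oom_worth_trying(unsigned int order, enum zone_type high_zoneidx,
                             bool nofail)
{
        if (nofail)
                return true;    /* __GFP_NOFAIL must keep trying something */
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                return false;   /* killing tasks rarely frees large contiguous blocks */
        if (high_zoneidx < ZONE_NORMAL)
                return false;   /* don't kill tasks over lowmem-only pressure */
        return true;
}

int main(void)
{
        printf("order-0 ZONE_NORMAL : %d\n", oom_worth_trying(0, ZONE_NORMAL, false));
        printf("order-4 ZONE_NORMAL : %d\n", oom_worth_trying(4, ZONE_NORMAL, false));
        printf("order-0 ZONE_DMA    : %d\n", oom_worth_trying(0, ZONE_DMA, false));
        return 0;
}
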
@@ -4089,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4089 zone_seqlock_init(zone); 4100 zone_seqlock_init(zone);
4090 zone->zone_pgdat = pgdat; 4101 zone->zone_pgdat = pgdat;
4091 4102
4092 zone->prev_priority = DEF_PRIORITY;
4093
4094 zone_pcp_init(zone); 4103 zone_pcp_init(zone);
4095 for_each_lru(l) { 4104 for_each_lru(l) {
4096 INIT_LIST_HEAD(&zone->lru[l].list); 4105 INIT_LIST_HEAD(&zone->lru[l].list);
diff --git a/mm/page_io.c b/mm/page_io.c
index 31a3b962230a..2dee975bf469 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
106 goto out; 106 goto out;
107 } 107 }
108 if (wbc->sync_mode == WB_SYNC_ALL) 108 if (wbc->sync_mode == WB_SYNC_ALL)
109 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 109 rw |= REQ_SYNC | REQ_UNPLUG;
110 count_vm_event(PSWPOUT); 110 count_vm_event(PSWPOUT);
111 set_page_writeback(page); 111 set_page_writeback(page);
112 unlock_page(page); 112 unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..87b9e8ad4509 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/memcontrol.h> 56#include <linux/memcontrol.h>
57#include <linux/mmu_notifier.h> 57#include <linux/mmu_notifier.h>
58#include <linux/migrate.h> 58#include <linux/migrate.h>
59#include <linux/hugetlb.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -132,9 +133,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
132 if (unlikely(!anon_vma)) 133 if (unlikely(!anon_vma))
133 goto out_enomem_free_avc; 134 goto out_enomem_free_avc;
134 allocated = anon_vma; 135 allocated = anon_vma;
136 /*
137 * This VMA had no anon_vma yet. This anon_vma is
138 * the root of any anon_vma tree that might form.
139 */
140 anon_vma->root = anon_vma;
135 } 141 }
136 142
137 spin_lock(&anon_vma->lock); 143 anon_vma_lock(anon_vma);
138 /* page_table_lock to protect against threads */ 144 /* page_table_lock to protect against threads */
139 spin_lock(&mm->page_table_lock); 145 spin_lock(&mm->page_table_lock);
140 if (likely(!vma->anon_vma)) { 146 if (likely(!vma->anon_vma)) {
@@ -142,12 +148,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
142 avc->anon_vma = anon_vma; 148 avc->anon_vma = anon_vma;
143 avc->vma = vma; 149 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain); 150 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head); 151 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
146 allocated = NULL; 152 allocated = NULL;
147 avc = NULL; 153 avc = NULL;
148 } 154 }
149 spin_unlock(&mm->page_table_lock); 155 spin_unlock(&mm->page_table_lock);
150 spin_unlock(&anon_vma->lock); 156 anon_vma_unlock(anon_vma);
151 157
152 if (unlikely(allocated)) 158 if (unlikely(allocated))
153 anon_vma_free(allocated); 159 anon_vma_free(allocated);
@@ -170,9 +176,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
170 avc->anon_vma = anon_vma; 176 avc->anon_vma = anon_vma;
171 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
172 178
173 spin_lock(&anon_vma->lock); 179 anon_vma_lock(anon_vma);
174 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 180 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
175 spin_unlock(&anon_vma->lock); 181 anon_vma_unlock(anon_vma);
176} 182}
177 183
178/* 184/*
@@ -224,9 +230,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
224 avc = anon_vma_chain_alloc(); 230 avc = anon_vma_chain_alloc();
225 if (!avc) 231 if (!avc)
226 goto out_error_free_anon_vma; 232 goto out_error_free_anon_vma;
227 anon_vma_chain_link(vma, avc, anon_vma); 233
234 /*
235 * The root anon_vma's spinlock is the lock actually used when we
236 * lock any of the anon_vmas in this anon_vma tree.
237 */
238 anon_vma->root = pvma->anon_vma->root;
239 /*
240 * With KSM refcounts, an anon_vma can stay around longer than the
241 * process it belongs to. The root anon_vma needs to be pinned
242 * until this anon_vma is freed, because the lock lives in the root.
243 */
244 get_anon_vma(anon_vma->root);
228 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 245 /* Mark this anon_vma as the one where our new (COWed) pages go. */
229 vma->anon_vma = anon_vma; 246 vma->anon_vma = anon_vma;
247 anon_vma_chain_link(vma, avc, anon_vma);
230 248
231 return 0; 249 return 0;
232 250
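
anon_vma_fork() above points every new anon_vma at the root of its tree: anon_vma_lock() really takes the root's spinlock, and the root is reference-pinned so that lock cannot disappear while descendants still use it. A hedged userspace sketch of this "lock lives in the root" pattern, with a pthread mutex and a plain (non-atomic) refcount as stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct anon_vma {
        struct anon_vma *root;    /* points to self for the root */
        pthread_mutex_t lock;     /* only the root's lock is ever taken */
        int refcount;             /* pins the root while children exist */
};

static struct anon_vma *anon_vma_alloc(struct anon_vma *parent)
{
        struct anon_vma *av = calloc(1, sizeof(*av));

        if (!av)
                return NULL;
        if (!parent) {
                av->root = av;                  /* new tree: we are the root */
                pthread_mutex_init(&av->lock, NULL);
        } else {
                av->root = parent->root;        /* share the root's lock */
                av->root->refcount++;           /* pin the root */
        }
        av->refcount++;
        return av;
}

static void anon_vma_lock(struct anon_vma *av)   { pthread_mutex_lock(&av->root->lock); }
static void anon_vma_unlock(struct anon_vma *av) { pthread_mutex_unlock(&av->root->lock); }

int main(void)
{
        struct anon_vma *root = anon_vma_alloc(NULL);
        struct anon_vma *child = anon_vma_alloc(root);

        anon_vma_lock(child);         /* actually takes root->lock */
        anon_vma_unlock(child);
        printf("root refcount = %d\n", root->refcount);
        free(child);
        free(root);
        return 0;
}
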
@@ -246,22 +264,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
246 if (!anon_vma) 264 if (!anon_vma)
247 return; 265 return;
248 266
249 spin_lock(&anon_vma->lock); 267 anon_vma_lock(anon_vma);
250 list_del(&anon_vma_chain->same_anon_vma); 268 list_del(&anon_vma_chain->same_anon_vma);
251 269
252 /* We must garbage collect the anon_vma if it's empty */ 270 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 271 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 272 anon_vma_unlock(anon_vma);
255 273
256 if (empty) 274 if (empty) {
275 /* We no longer need the root anon_vma */
276 if (anon_vma->root != anon_vma)
277 drop_anon_vma(anon_vma->root);
257 anon_vma_free(anon_vma); 278 anon_vma_free(anon_vma);
279 }
258} 280}
259 281
260void unlink_anon_vmas(struct vm_area_struct *vma) 282void unlink_anon_vmas(struct vm_area_struct *vma)
261{ 283{
262 struct anon_vma_chain *avc, *next; 284 struct anon_vma_chain *avc, *next;
263 285
264 /* Unlink each anon_vma chained to the VMA. */ 286 /*
287 * Unlink each anon_vma chained to the VMA. This list is ordered
288 * from newest to oldest, ensuring the root anon_vma gets freed last.
289 */
265 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 290 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
266 anon_vma_unlink(avc); 291 anon_vma_unlink(avc);
267 list_del(&avc->same_vma); 292 list_del(&avc->same_vma);
@@ -302,7 +327,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
302 goto out; 327 goto out;
303 328
304 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 329 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
305 spin_lock(&anon_vma->lock); 330 anon_vma_lock(anon_vma);
306 return anon_vma; 331 return anon_vma;
307out: 332out:
308 rcu_read_unlock(); 333 rcu_read_unlock();
@@ -311,7 +336,7 @@ out:
311 336
312void page_unlock_anon_vma(struct anon_vma *anon_vma) 337void page_unlock_anon_vma(struct anon_vma *anon_vma)
313{ 338{
314 spin_unlock(&anon_vma->lock); 339 anon_vma_unlock(anon_vma);
315 rcu_read_unlock(); 340 rcu_read_unlock();
316} 341}
317 342
@@ -326,6 +351,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
326 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 351 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
327 unsigned long address; 352 unsigned long address;
328 353
354 if (unlikely(is_vm_hugetlb_page(vma)))
355 pgoff = page->index << huge_page_order(page_hstate(page));
329 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 356 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
330 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 357 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
331 /* page should be within @vma mapping range */ 358 /* page should be within @vma mapping range */
@@ -340,9 +367,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
340 */ 367 */
341unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 368unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
342{ 369{
343 if (PageAnon(page)) 370 if (PageAnon(page)) {
344 ; 371 if (vma->anon_vma->root != page_anon_vma(page)->root)
345 else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 372 return -EFAULT;
373 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
346 if (!vma->vm_file || 374 if (!vma->vm_file ||
347 vma->vm_file->f_mapping != page->mapping) 375 vma->vm_file->f_mapping != page->mapping)
348 return -EFAULT; 376 return -EFAULT;
@@ -369,6 +397,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
369 pte_t *pte; 397 pte_t *pte;
370 spinlock_t *ptl; 398 spinlock_t *ptl;
371 399
400 if (unlikely(PageHuge(page))) {
401 pte = huge_pte_offset(mm, address);
402 ptl = &mm->page_table_lock;
403 goto check;
404 }
405
372 pgd = pgd_offset(mm, address); 406 pgd = pgd_offset(mm, address);
373 if (!pgd_present(*pgd)) 407 if (!pgd_present(*pgd))
374 return NULL; 408 return NULL;
@@ -389,6 +423,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
389 } 423 }
390 424
391 ptl = pte_lockptr(mm, pmd); 425 ptl = pte_lockptr(mm, pmd);
426check:
392 spin_lock(ptl); 427 spin_lock(ptl);
393 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 428 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
394 *ptlp = ptl; 429 *ptlp = ptl;
@@ -743,14 +778,20 @@ static void __page_set_anon_rmap(struct page *page,
743 * If the page isn't exclusively mapped into this vma, 778 * If the page isn't exclusively mapped into this vma,
744 * we must use the _oldest_ possible anon_vma for the 779 * we must use the _oldest_ possible anon_vma for the
745 * page mapping! 780 * page mapping!
746 *
747 * So take the last AVC chain entry in the vma, which is
748 * the deepest ancestor, and use the anon_vma from that.
749 */ 781 */
750 if (!exclusive) { 782 if (!exclusive) {
751 struct anon_vma_chain *avc; 783 if (PageAnon(page))
752 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); 784 return;
753 anon_vma = avc->anon_vma; 785 anon_vma = anon_vma->root;
786 } else {
787 /*
788 * In this case, swapped-out-but-not-discarded swap-cache
789 * is remapped. So, no need to update page->mapping here.
 790 * We can be sure the anon_vma pointed to by page->mapping is not
 791 * obsolete, because vma->anon_vma must belong to the same family.
792 */
793 if (PageAnon(page))
794 return;
754 } 795 }
755 796
756 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 797 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +821,7 @@ static void __page_check_anon_rmap(struct page *page,
780 * are initially only visible via the pagetables, and the pte is locked 821 * are initially only visible via the pagetables, and the pte is locked
781 * over the call to page_add_new_anon_rmap. 822 * over the call to page_add_new_anon_rmap.
782 */ 823 */
824 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
783 BUG_ON(page->index != linear_page_index(vma, address)); 825 BUG_ON(page->index != linear_page_index(vma, address));
784#endif 826#endif
785} 827}
@@ -798,6 +840,17 @@ static void __page_check_anon_rmap(struct page *page,
798void page_add_anon_rmap(struct page *page, 840void page_add_anon_rmap(struct page *page,
799 struct vm_area_struct *vma, unsigned long address) 841 struct vm_area_struct *vma, unsigned long address)
800{ 842{
843 do_page_add_anon_rmap(page, vma, address, 0);
844}
845
846/*
847 * Special version of the above for do_swap_page, which often runs
848 * into pages that are exclusively owned by the current process.
849 * Everybody else should continue to use page_add_anon_rmap above.
850 */
851void do_page_add_anon_rmap(struct page *page,
852 struct vm_area_struct *vma, unsigned long address, int exclusive)
853{
801 int first = atomic_inc_and_test(&page->_mapcount); 854 int first = atomic_inc_and_test(&page->_mapcount);
802 if (first) 855 if (first)
803 __inc_zone_page_state(page, NR_ANON_PAGES); 856 __inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +860,7 @@ void page_add_anon_rmap(struct page *page,
807 VM_BUG_ON(!PageLocked(page)); 860 VM_BUG_ON(!PageLocked(page));
808 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 861 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
809 if (first) 862 if (first)
810 __page_set_anon_rmap(page, vma, address, 0); 863 __page_set_anon_rmap(page, vma, address, exclusive);
811 else 864 else
812 __page_check_anon_rmap(page, vma, address); 865 __page_check_anon_rmap(page, vma, address);
813} 866}
@@ -873,6 +926,12 @@ void page_remove_rmap(struct page *page)
873 page_clear_dirty(page); 926 page_clear_dirty(page);
874 set_page_dirty(page); 927 set_page_dirty(page);
875 } 928 }
929 /*
930 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
931 * and not charged by memcg for now.
932 */
933 if (unlikely(PageHuge(page)))
934 return;
876 if (PageAnon(page)) { 935 if (PageAnon(page)) {
877 mem_cgroup_uncharge_page(page); 936 mem_cgroup_uncharge_page(page);
878 __dec_zone_page_state(page, NR_ANON_PAGES); 937 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1368,6 +1427,42 @@ int try_to_munlock(struct page *page)
1368 return try_to_unmap_file(page, TTU_MUNLOCK); 1427 return try_to_unmap_file(page, TTU_MUNLOCK);
1369} 1428}
1370 1429
1430#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
1431/*
1432 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1433 * if necessary. Be careful to do all the tests under the lock. Once
1434 * we know we are the last user, nobody else can get a reference and we
1435 * can do the freeing without the lock.
1436 */
1437void drop_anon_vma(struct anon_vma *anon_vma)
1438{
1439 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
1440 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1441 struct anon_vma *root = anon_vma->root;
1442 int empty = list_empty(&anon_vma->head);
1443 int last_root_user = 0;
1444 int root_empty = 0;
1445
1446 /*
1447 * The refcount on a non-root anon_vma got dropped. Drop
1448 * the refcount on the root and check if we need to free it.
1449 */
1450 if (empty && anon_vma != root) {
1451 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1452 last_root_user = atomic_dec_and_test(&root->external_refcount);
1453 root_empty = list_empty(&root->head);
1454 }
1455 anon_vma_unlock(anon_vma);
1456
1457 if (empty) {
1458 anon_vma_free(anon_vma);
1459 if (root_empty && last_root_user)
1460 anon_vma_free(root);
1461 }
1462 }
1463}
1464#endif
1465
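
drop_anon_vma() above leans on atomic_dec_and_lock(): references are dropped without the lock, and only the thread that takes the count to zero ends up holding the root's lock for the list_empty() checks and the freeing. A minimal userspace analogue of that primitive, assuming C11 atomics and a pthread mutex (the real helper is a spinlock-based kernel primitive):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Drop one reference; if it was the last one, return true with *lock held.
 * The final 1 -> 0 transition happens under the lock, mirroring the kernel's
 * atomic_dec_and_lock() contract.
 */
static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        int old = atomic_load(cnt);

        /* fast path: drop a reference without the lock while cnt > 1 */
        while (old > 1) {
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return false;
        }
        /* slow path: possibly the last reference */
        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
                return true;            /* caller frees, then unlocks */
        pthread_mutex_unlock(lock);
        return false;
}

int main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        atomic_int refs = 2;

        (void)dec_and_lock(&refs, &lock);       /* 2 -> 1: false, lock not taken */
        if (dec_and_lock(&refs, &lock)) {       /* 1 -> 0: true, lock held */
                printf("last reference dropped; free under the root lock\n");
                pthread_mutex_unlock(&lock);
        }
        return 0;
}
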
1371#ifdef CONFIG_MIGRATION 1466#ifdef CONFIG_MIGRATION
1372/* 1467/*
1373 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): 1468 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1389,7 +1484,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1389 anon_vma = page_anon_vma(page); 1484 anon_vma = page_anon_vma(page);
1390 if (!anon_vma) 1485 if (!anon_vma)
1391 return ret; 1486 return ret;
1392 spin_lock(&anon_vma->lock); 1487 anon_vma_lock(anon_vma);
1393 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1488 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1394 struct vm_area_struct *vma = avc->vma; 1489 struct vm_area_struct *vma = avc->vma;
1395 unsigned long address = vma_address(page, vma); 1490 unsigned long address = vma_address(page, vma);
@@ -1399,7 +1494,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1399 if (ret != SWAP_AGAIN) 1494 if (ret != SWAP_AGAIN)
1400 break; 1495 break;
1401 } 1496 }
1402 spin_unlock(&anon_vma->lock); 1497 anon_vma_unlock(anon_vma);
1403 return ret; 1498 return ret;
1404} 1499}
1405 1500
@@ -1445,3 +1540,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1445 return rmap_walk_file(page, rmap_one, arg); 1540 return rmap_walk_file(page, rmap_one, arg);
1446} 1541}
1447#endif /* CONFIG_MIGRATION */ 1542#endif /* CONFIG_MIGRATION */
1543
1544#ifdef CONFIG_HUGETLB_PAGE
1545/*
1546 * The following three functions are for anonymous (private mapped) hugepages.
1547 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1548 * and no lru code, because we handle hugepages differently from common pages.
1549 */
1550static void __hugepage_set_anon_rmap(struct page *page,
1551 struct vm_area_struct *vma, unsigned long address, int exclusive)
1552{
1553 struct anon_vma *anon_vma = vma->anon_vma;
1554 BUG_ON(!anon_vma);
1555 if (!exclusive) {
1556 struct anon_vma_chain *avc;
1557 avc = list_entry(vma->anon_vma_chain.prev,
1558 struct anon_vma_chain, same_vma);
1559 anon_vma = avc->anon_vma;
1560 }
1561 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1562 page->mapping = (struct address_space *) anon_vma;
1563 page->index = linear_page_index(vma, address);
1564}
1565
1566void hugepage_add_anon_rmap(struct page *page,
1567 struct vm_area_struct *vma, unsigned long address)
1568{
1569 struct anon_vma *anon_vma = vma->anon_vma;
1570 int first;
1571 BUG_ON(!anon_vma);
1572 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1573 first = atomic_inc_and_test(&page->_mapcount);
1574 if (first)
1575 __hugepage_set_anon_rmap(page, vma, address, 0);
1576}
1577
1578void hugepage_add_new_anon_rmap(struct page *page,
1579 struct vm_area_struct *vma, unsigned long address)
1580{
1581 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1582 atomic_set(&page->_mapcount, 0);
1583 __hugepage_set_anon_rmap(page, vma, address, 1);
1584}
1585#endif /* CONFIG_HUGETLB_PAGE */
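
The vma_address() change earlier in this file scales page->index by huge_page_order() before the usual linear-mapping arithmetic, because a hugetlb page's index is kept in huge-page units. The arithmetic itself as a small userspace calculation, assuming 4 KiB base pages and 2 MiB huge pages:

#include <stdio.h>

#define PAGE_SHIFT 12UL                          /* 4 KiB base pages (assumed) */

/* address = vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT) */
static unsigned long vma_address(unsigned long vm_start, unsigned long vm_pgoff,
                                 unsigned long pgoff)
{
        return vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
        unsigned long vm_start = 0x600000000000UL;
        unsigned long vm_pgoff = 0;              /* mapping starts at file offset 0 */
        unsigned long huge_order = 9;            /* 2 MiB huge page = 512 base pages */

        /* ordinary page: index is already in base-page units */
        printf("small page 10 -> %#lx\n", vma_address(vm_start, vm_pgoff, 10));

        /* hugetlb page: page->index counts huge pages, so scale it first */
        unsigned long huge_index = 3;
        printf("huge page 3   -> %#lx\n",
               vma_address(vm_start, vm_pgoff, huge_index << huge_order));
        return 0;
}
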
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db5..080b09a57a8f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
28#include <linux/file.h> 28#include <linux/file.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/percpu_counter.h>
31#include <linux/swap.h> 32#include <linux/swap.h>
32 33
33static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
233{ 234{
234 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 235 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
235 if (sbinfo->max_blocks) { 236 if (sbinfo->max_blocks) {
236 spin_lock(&sbinfo->stat_lock); 237 percpu_counter_add(&sbinfo->used_blocks, -pages);
237 sbinfo->free_blocks += pages; 238 spin_lock(&inode->i_lock);
238 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 239 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
239 spin_unlock(&sbinfo->stat_lock); 240 spin_unlock(&inode->i_lock);
240 } 241 }
241} 242}
242 243
@@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
416 if (sgp == SGP_READ) 417 if (sgp == SGP_READ)
417 return shmem_swp_map(ZERO_PAGE(0)); 418 return shmem_swp_map(ZERO_PAGE(0));
418 /* 419 /*
419 * Test free_blocks against 1 not 0, since we have 1 data 420 * Test used_blocks against 1 less max_blocks, since we have 1 data
420 * page (and perhaps indirect index pages) yet to allocate: 421 * page (and perhaps indirect index pages) yet to allocate:
421 * a waste to allocate index if we cannot allocate data. 422 * a waste to allocate index if we cannot allocate data.
422 */ 423 */
423 if (sbinfo->max_blocks) { 424 if (sbinfo->max_blocks) {
424 spin_lock(&sbinfo->stat_lock); 425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
425 if (sbinfo->free_blocks <= 1) {
426 spin_unlock(&sbinfo->stat_lock);
427 return ERR_PTR(-ENOSPC); 426 return ERR_PTR(-ENOSPC);
428 } 427 percpu_counter_inc(&sbinfo->used_blocks);
429 sbinfo->free_blocks--; 428 spin_lock(&inode->i_lock);
430 inode->i_blocks += BLOCKS_PER_PAGE; 429 inode->i_blocks += BLOCKS_PER_PAGE;
431 spin_unlock(&sbinfo->stat_lock); 430 spin_unlock(&inode->i_lock);
432 } 431 }
433 432
434 spin_unlock(&info->lock); 433 spin_unlock(&info->lock);
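
The shmem hunks replace the stat_lock-protected free_blocks count with a percpu counter of used blocks: callers compare against max_blocks - 1 and then increment, instead of serializing every allocation on one spinlock. A rough model of that check-then-increment pattern, with a single C11 atomic standing in for the percpu counter (which trades exactness for scalability in much the same way):

#include <stdatomic.h>
#include <stdio.h>

static const long max_blocks = 3;       /* tiny limit so the example hits ENOSPC */
static atomic_long used_blocks;         /* stands in for the percpu counter */

/* Reserve one data block, mirroring shmem_swp_alloc()'s ENOSPC check. */
static int shmem_reserve_block(void)
{
        /* leave room for the data page we are about to allocate */
        if (atomic_load(&used_blocks) > max_blocks - 1)
                return -1;              /* -ENOSPC */
        atomic_fetch_add(&used_blocks, 1);
        return 0;
}

static void shmem_free_block(void)
{
        atomic_fetch_sub(&used_blocks, 1);
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                if (shmem_reserve_block())
                        printf("ENOSPC at block %d\n", i);
        printf("used_blocks = %ld\n", atomic_load(&used_blocks));
        shmem_free_block();
        return 0;
}
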
@@ -767,6 +766,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
767 loff_t newsize = attr->ia_size; 766 loff_t newsize = attr->ia_size;
768 int error; 767 int error;
769 768
769 error = inode_change_ok(inode, attr);
770 if (error)
771 return error;
772
770 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) 773 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
771 && newsize != inode->i_size) { 774 && newsize != inode->i_size) {
772 struct page *page = NULL; 775 struct page *page = NULL;
@@ -801,25 +804,22 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
801 } 804 }
802 } 805 }
803 806
804 error = simple_setsize(inode, newsize); 807 /* XXX(truncate): truncate_setsize should be called last */
808 truncate_setsize(inode, newsize);
805 if (page) 809 if (page)
806 page_cache_release(page); 810 page_cache_release(page);
807 if (error)
808 return error;
809 shmem_truncate_range(inode, newsize, (loff_t)-1); 811 shmem_truncate_range(inode, newsize, (loff_t)-1);
810 } 812 }
811 813
812 error = inode_change_ok(inode, attr); 814 setattr_copy(inode, attr);
813 if (!error)
814 generic_setattr(inode, attr);
815#ifdef CONFIG_TMPFS_POSIX_ACL 815#ifdef CONFIG_TMPFS_POSIX_ACL
816 if (!error && (attr->ia_valid & ATTR_MODE)) 816 if (attr->ia_valid & ATTR_MODE)
817 error = generic_acl_chmod(inode); 817 error = generic_acl_chmod(inode);
818#endif 818#endif
819 return error; 819 return error;
820} 820}
821 821
822static void shmem_delete_inode(struct inode *inode) 822static void shmem_evict_inode(struct inode *inode)
823{ 823{
824 struct shmem_inode_info *info = SHMEM_I(inode); 824 struct shmem_inode_info *info = SHMEM_I(inode);
825 825
@@ -836,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode)
836 } 836 }
837 BUG_ON(inode->i_blocks); 837 BUG_ON(inode->i_blocks);
838 shmem_free_inode(inode->i_sb); 838 shmem_free_inode(inode->i_sb);
839 clear_inode(inode); 839 end_writeback(inode);
840} 840}
841 841
842static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 842static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
@@ -933,7 +933,7 @@ found:
933 933
934 /* 934 /*
935 * Move _head_ to start search for next from here. 935 * Move _head_ to start search for next from here.
936 * But be careful: shmem_delete_inode checks list_empty without taking 936 * But be careful: shmem_evict_inode checks list_empty without taking
937 * mutex, and there's an instant in list_move_tail when info->swaplist 937 * mutex, and there's an instant in list_move_tail when info->swaplist
938 * would appear empty, if it were the only one on shmem_swaplist. We 938 * would appear empty, if it were the only one on shmem_swaplist. We
939 * could avoid doing it if inode NULL; or use this minor optimization. 939 * could avoid doing it if inode NULL; or use this minor optimization.
@@ -1223,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1223 struct shmem_sb_info *sbinfo; 1223 struct shmem_sb_info *sbinfo;
1224 struct page *filepage = *pagep; 1224 struct page *filepage = *pagep;
1225 struct page *swappage; 1225 struct page *swappage;
1226 struct page *prealloc_page = NULL;
1226 swp_entry_t *entry; 1227 swp_entry_t *entry;
1227 swp_entry_t swap; 1228 swp_entry_t swap;
1228 gfp_t gfp; 1229 gfp_t gfp;
@@ -1247,7 +1248,6 @@ repeat:
1247 filepage = find_lock_page(mapping, idx); 1248 filepage = find_lock_page(mapping, idx);
1248 if (filepage && PageUptodate(filepage)) 1249 if (filepage && PageUptodate(filepage))
1249 goto done; 1250 goto done;
1250 error = 0;
1251 gfp = mapping_gfp_mask(mapping); 1251 gfp = mapping_gfp_mask(mapping);
1252 if (!filepage) { 1252 if (!filepage) {
1253 /* 1253 /*
@@ -1258,7 +1258,19 @@ repeat:
1258 if (error) 1258 if (error)
1259 goto failed; 1259 goto failed;
1260 radix_tree_preload_end(); 1260 radix_tree_preload_end();
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 /* We don't care if this fails */
1263 prealloc_page = shmem_alloc_page(gfp, info, idx);
1264 if (prealloc_page) {
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1269 }
1270 }
1271 }
1261 } 1272 }
1273 error = 0;
1262 1274
1263 spin_lock(&info->lock); 1275 spin_lock(&info->lock);
1264 shmem_recalc_inode(inode); 1276 shmem_recalc_inode(inode);
@@ -1387,17 +1399,16 @@ repeat:
1387 shmem_swp_unmap(entry); 1399 shmem_swp_unmap(entry);
1388 sbinfo = SHMEM_SB(inode->i_sb); 1400 sbinfo = SHMEM_SB(inode->i_sb);
1389 if (sbinfo->max_blocks) { 1401 if (sbinfo->max_blocks) {
1390 spin_lock(&sbinfo->stat_lock); 1402 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
1391 if (sbinfo->free_blocks == 0 ||
1392 shmem_acct_block(info->flags)) { 1403 shmem_acct_block(info->flags)) {
1393 spin_unlock(&sbinfo->stat_lock);
1394 spin_unlock(&info->lock); 1404 spin_unlock(&info->lock);
1395 error = -ENOSPC; 1405 error = -ENOSPC;
1396 goto failed; 1406 goto failed;
1397 } 1407 }
1398 sbinfo->free_blocks--; 1408 percpu_counter_inc(&sbinfo->used_blocks);
1409 spin_lock(&inode->i_lock);
1399 inode->i_blocks += BLOCKS_PER_PAGE; 1410 inode->i_blocks += BLOCKS_PER_PAGE;
1400 spin_unlock(&sbinfo->stat_lock); 1411 spin_unlock(&inode->i_lock);
1401 } else if (shmem_acct_block(info->flags)) { 1412 } else if (shmem_acct_block(info->flags)) {
1402 spin_unlock(&info->lock); 1413 spin_unlock(&info->lock);
1403 error = -ENOSPC; 1414 error = -ENOSPC;
@@ -1407,28 +1418,38 @@ repeat:
1407 if (!filepage) { 1418 if (!filepage) {
1408 int ret; 1419 int ret;
1409 1420
1410 spin_unlock(&info->lock); 1421 if (!prealloc_page) {
1411 filepage = shmem_alloc_page(gfp, info, idx); 1422 spin_unlock(&info->lock);
1412 if (!filepage) { 1423 filepage = shmem_alloc_page(gfp, info, idx);
1413 shmem_unacct_blocks(info->flags, 1); 1424 if (!filepage) {
1414 shmem_free_blocks(inode, 1); 1425 shmem_unacct_blocks(info->flags, 1);
1415 error = -ENOMEM; 1426 shmem_free_blocks(inode, 1);
1416 goto failed; 1427 error = -ENOMEM;
1417 } 1428 goto failed;
1418 SetPageSwapBacked(filepage); 1429 }
1430 SetPageSwapBacked(filepage);
1419 1431
1420 /* Precharge page while we can wait, compensate after */ 1432 /*
1421 error = mem_cgroup_cache_charge(filepage, current->mm, 1433 * Precharge page while we can wait, compensate
1422 GFP_KERNEL); 1434 * after
1423 if (error) { 1435 */
1424 page_cache_release(filepage); 1436 error = mem_cgroup_cache_charge(filepage,
1425 shmem_unacct_blocks(info->flags, 1); 1437 current->mm, GFP_KERNEL);
1426 shmem_free_blocks(inode, 1); 1438 if (error) {
1427 filepage = NULL; 1439 page_cache_release(filepage);
1428 goto failed; 1440 shmem_unacct_blocks(info->flags, 1);
1441 shmem_free_blocks(inode, 1);
1442 filepage = NULL;
1443 goto failed;
1444 }
1445
1446 spin_lock(&info->lock);
1447 } else {
1448 filepage = prealloc_page;
1449 prealloc_page = NULL;
1450 SetPageSwapBacked(filepage);
1429 } 1451 }
1430 1452
1431 spin_lock(&info->lock);
1432 entry = shmem_swp_alloc(info, idx, sgp); 1453 entry = shmem_swp_alloc(info, idx, sgp);
1433 if (IS_ERR(entry)) 1454 if (IS_ERR(entry))
1434 error = PTR_ERR(entry); 1455 error = PTR_ERR(entry);
@@ -1469,13 +1490,19 @@ repeat:
1469 } 1490 }
1470done: 1491done:
1471 *pagep = filepage; 1492 *pagep = filepage;
1472 return 0; 1493 error = 0;
1494 goto out;
1473 1495
1474failed: 1496failed:
1475 if (*pagep != filepage) { 1497 if (*pagep != filepage) {
1476 unlock_page(filepage); 1498 unlock_page(filepage);
1477 page_cache_release(filepage); 1499 page_cache_release(filepage);
1478 } 1500 }
1501out:
1502 if (prealloc_page) {
1503 mem_cgroup_uncharge_cache_page(prealloc_page);
1504 page_cache_release(prealloc_page);
1505 }
1479 return error; 1506 return error;
1480} 1507}
1481 1508
@@ -1791,17 +1818,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1791 buf->f_type = TMPFS_MAGIC; 1818 buf->f_type = TMPFS_MAGIC;
1792 buf->f_bsize = PAGE_CACHE_SIZE; 1819 buf->f_bsize = PAGE_CACHE_SIZE;
1793 buf->f_namelen = NAME_MAX; 1820 buf->f_namelen = NAME_MAX;
1794 spin_lock(&sbinfo->stat_lock);
1795 if (sbinfo->max_blocks) { 1821 if (sbinfo->max_blocks) {
1796 buf->f_blocks = sbinfo->max_blocks; 1822 buf->f_blocks = sbinfo->max_blocks;
1797 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 1823 buf->f_bavail = buf->f_bfree =
1824 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
1798 } 1825 }
1799 if (sbinfo->max_inodes) { 1826 if (sbinfo->max_inodes) {
1800 buf->f_files = sbinfo->max_inodes; 1827 buf->f_files = sbinfo->max_inodes;
1801 buf->f_ffree = sbinfo->free_inodes; 1828 buf->f_ffree = sbinfo->free_inodes;
1802 } 1829 }
1803 /* else leave those fields 0 like simple_statfs */ 1830 /* else leave those fields 0 like simple_statfs */
1804 spin_unlock(&sbinfo->stat_lock);
1805 return 0; 1831 return 0;
1806} 1832}
1807 1833
@@ -2242,7 +2268,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2242{ 2268{
2243 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2269 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2244 struct shmem_sb_info config = *sbinfo; 2270 struct shmem_sb_info config = *sbinfo;
2245 unsigned long blocks;
2246 unsigned long inodes; 2271 unsigned long inodes;
2247 int error = -EINVAL; 2272 int error = -EINVAL;
2248 2273
@@ -2250,9 +2275,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2250 return error; 2275 return error;
2251 2276
2252 spin_lock(&sbinfo->stat_lock); 2277 spin_lock(&sbinfo->stat_lock);
2253 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2254 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2278 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2255 if (config.max_blocks < blocks) 2279 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2256 goto out; 2280 goto out;
2257 if (config.max_inodes < inodes) 2281 if (config.max_inodes < inodes)
2258 goto out; 2282 goto out;
@@ -2269,7 +2293,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2269 2293
2270 error = 0; 2294 error = 0;
2271 sbinfo->max_blocks = config.max_blocks; 2295 sbinfo->max_blocks = config.max_blocks;
2272 sbinfo->free_blocks = config.max_blocks - blocks;
2273 sbinfo->max_inodes = config.max_inodes; 2296 sbinfo->max_inodes = config.max_inodes;
2274 sbinfo->free_inodes = config.max_inodes - inodes; 2297 sbinfo->free_inodes = config.max_inodes - inodes;
2275 2298
@@ -2302,7 +2325,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
2302 2325
2303static void shmem_put_super(struct super_block *sb) 2326static void shmem_put_super(struct super_block *sb)
2304{ 2327{
2305 kfree(sb->s_fs_info); 2328 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2329
2330 percpu_counter_destroy(&sbinfo->used_blocks);
2331 kfree(sbinfo);
2306 sb->s_fs_info = NULL; 2332 sb->s_fs_info = NULL;
2307} 2333}
2308 2334
@@ -2344,7 +2370,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2344#endif 2370#endif
2345 2371
2346 spin_lock_init(&sbinfo->stat_lock); 2372 spin_lock_init(&sbinfo->stat_lock);
2347 sbinfo->free_blocks = sbinfo->max_blocks; 2373 if (percpu_counter_init(&sbinfo->used_blocks, 0))
2374 goto failed;
2348 sbinfo->free_inodes = sbinfo->max_inodes; 2375 sbinfo->free_inodes = sbinfo->max_inodes;
2349 2376
2350 sb->s_maxbytes = SHMEM_MAX_BYTES; 2377 sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2496,7 +2523,7 @@ static const struct super_operations shmem_ops = {
2496 .remount_fs = shmem_remount_fs, 2523 .remount_fs = shmem_remount_fs,
2497 .show_options = shmem_show_options, 2524 .show_options = shmem_show_options,
2498#endif 2525#endif
2499 .delete_inode = shmem_delete_inode, 2526 .evict_inode = shmem_evict_inode,
2500 .drop_inode = generic_delete_inode, 2527 .drop_inode = generic_delete_inode,
2501 .put_super = shmem_put_super, 2528 .put_super = shmem_put_super,
2502}; 2529};
diff --git a/mm/slab.c b/mm/slab.c
index dd41b74c8322..fcae9815d3b3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -394,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
394#define STATS_DEC_ACTIVE(x) do { } while (0) 394#define STATS_DEC_ACTIVE(x) do { } while (0)
395#define STATS_INC_ALLOCED(x) do { } while (0) 395#define STATS_INC_ALLOCED(x) do { } while (0)
396#define STATS_INC_GROWN(x) do { } while (0) 396#define STATS_INC_GROWN(x) do { } while (0)
397#define STATS_ADD_REAPED(x,y) do { } while (0) 397#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
398#define STATS_SET_HIGH(x) do { } while (0) 398#define STATS_SET_HIGH(x) do { } while (0)
399#define STATS_INC_ERR(x) do { } while (0) 399#define STATS_INC_ERR(x) do { } while (0)
400#define STATS_INC_NODEALLOCS(x) do { } while (0) 400#define STATS_INC_NODEALLOCS(x) do { } while (0)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 03aa2d55f1a2..1f3f9c59a73a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,8 @@ long nr_swap_pages;
47long total_swap_pages; 47long total_swap_pages;
48static int least_priority; 48static int least_priority;
49 49
50static bool swap_for_hibernation;
51
50static const char Bad_file[] = "Bad swap file entry "; 52static const char Bad_file[] = "Bad swap file entry ";
51static const char Unused_file[] = "Unused swap file entry "; 53static const char Unused_file[] = "Unused swap file entry ";
52static const char Bad_offset[] = "Bad swap offset entry "; 54static const char Bad_offset[] = "Bad swap offset entry ";
@@ -318,8 +320,10 @@ checks:
318 if (offset > si->highest_bit) 320 if (offset > si->highest_bit)
319 scan_base = offset = si->lowest_bit; 321 scan_base = offset = si->lowest_bit;
320 322
321 /* reuse swap entry of cache-only swap if not busy. */ 323 /* reuse swap entry of cache-only swap if not hibernation. */
322 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 324 if (vm_swap_full()
325 && usage == SWAP_HAS_CACHE
326 && si->swap_map[offset] == SWAP_HAS_CACHE) {
323 int swap_was_freed; 327 int swap_was_freed;
324 spin_unlock(&swap_lock); 328 spin_unlock(&swap_lock);
325 swap_was_freed = __try_to_reclaim_swap(si, offset); 329 swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -449,6 +453,8 @@ swp_entry_t get_swap_page(void)
449 spin_lock(&swap_lock); 453 spin_lock(&swap_lock);
450 if (nr_swap_pages <= 0) 454 if (nr_swap_pages <= 0)
451 goto noswap; 455 goto noswap;
456 if (swap_for_hibernation)
457 goto noswap;
452 nr_swap_pages--; 458 nr_swap_pages--;
453 459
454 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 460 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -481,28 +487,6 @@ noswap:
481 return (swp_entry_t) {0}; 487 return (swp_entry_t) {0};
482} 488}
483 489
484/* The only caller of this function is now susupend routine */
485swp_entry_t get_swap_page_of_type(int type)
486{
487 struct swap_info_struct *si;
488 pgoff_t offset;
489
490 spin_lock(&swap_lock);
491 si = swap_info[type];
492 if (si && (si->flags & SWP_WRITEOK)) {
493 nr_swap_pages--;
494 /* This is called for allocating swap entry, not cache */
495 offset = scan_swap_map(si, 1);
496 if (offset) {
497 spin_unlock(&swap_lock);
498 return swp_entry(type, offset);
499 }
500 nr_swap_pages++;
501 }
502 spin_unlock(&swap_lock);
503 return (swp_entry_t) {0};
504}
505
506static struct swap_info_struct *swap_info_get(swp_entry_t entry) 490static struct swap_info_struct *swap_info_get(swp_entry_t entry)
507{ 491{
508 struct swap_info_struct *p; 492 struct swap_info_struct *p;
@@ -762,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
762#endif 746#endif
763 747
764#ifdef CONFIG_HIBERNATION 748#ifdef CONFIG_HIBERNATION
749
750static pgoff_t hibernation_offset[MAX_SWAPFILES];
751/*
752 * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
 753 * the swap_map[] image saved to disk would be incomplete, because it
 754 * keeps changing without synchronization with the hibernation snapshot.
 755 * At resume, we just set swap_for_hibernation=false and can forget
 756 * which entries were used.
757 */
758void hibernation_freeze_swap(void)
759{
760 int i;
761
762 spin_lock(&swap_lock);
763
764 printk(KERN_INFO "PM: Freeze Swap\n");
765 swap_for_hibernation = true;
766 for (i = 0; i < MAX_SWAPFILES; i++)
767 hibernation_offset[i] = 1;
768 spin_unlock(&swap_lock);
769}
770
771void hibernation_thaw_swap(void)
772{
773 spin_lock(&swap_lock);
774 if (swap_for_hibernation) {
775 printk(KERN_INFO "PM: Thaw Swap\n");
776 swap_for_hibernation = false;
777 }
778 spin_unlock(&swap_lock);
779}
780
781/*
 782 * Because updating swap_map[] would introduce state changes that are
 783 * not captured in the saved image, we use our own simple allocator.
 784 * Please see kernel/power/swap.c; used swap entries are recorded in
 785 * an RB-tree there.
786 */
787swp_entry_t get_swap_for_hibernation(int type)
788{
789 pgoff_t off;
790 swp_entry_t val = {0};
791 struct swap_info_struct *si;
792
793 spin_lock(&swap_lock);
794
795 si = swap_info[type];
796 if (!si || !(si->flags & SWP_WRITEOK))
797 goto done;
798
799 for (off = hibernation_offset[type]; off < si->max; ++off) {
800 if (!si->swap_map[off])
801 break;
802 }
803 if (off < si->max) {
804 val = swp_entry(type, off);
805 hibernation_offset[type] = off + 1;
806 }
807done:
808 spin_unlock(&swap_lock);
809 return val;
810}
811
812void swap_free_for_hibernation(swp_entry_t ent)
813{
814 /* Nothing to do */
815}
816
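
get_swap_for_hibernation() above is deliberately simple: swap_map[] must stay frozen for the image, so it only remembers a per-device cursor and hands out the next free offset after it. A hedged userspace model of that linear-scan allocator over a frozen use map:

#include <stdio.h>

#define SWAP_MAX 16

static unsigned char swap_map[SWAP_MAX];       /* non-zero = offset in use (frozen) */
static unsigned long hibernation_offset = 1;   /* offset 0 is never handed out */

/* Return the next free offset after the cursor, or 0 if none is left. */
static unsigned long get_swap_for_hibernation(void)
{
        unsigned long off;

        for (off = hibernation_offset; off < SWAP_MAX; off++)
                if (!swap_map[off])
                        break;
        if (off >= SWAP_MAX)
                return 0;
        hibernation_offset = off + 1;   /* never hand the same offset out twice */
        return off;
}

int main(void)
{
        swap_map[1] = swap_map[2] = swap_map[4] = 1;   /* already-used slots */

        for (int i = 0; i < 4; i++)
                printf("allocated offset %lu\n", get_swap_for_hibernation());
        return 0;
}
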
765/* 817/*
766 * Find the swap type that corresponds to given device (if any). 818 * Find the swap type that corresponds to given device (if any).
767 * 819 *
diff --git a/mm/truncate.c b/mm/truncate.c
index 937571b8b233..ba887bff48c5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -541,28 +541,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
541EXPORT_SYMBOL(truncate_pagecache); 541EXPORT_SYMBOL(truncate_pagecache);
542 542
543/** 543/**
544 * truncate_setsize - update inode and pagecache for a new file size
545 * @inode: inode
546 * @newsize: new file size
547 *
 548 * truncate_setsize updates i_size and performs pagecache
 549 * truncation (if necessary) for a file size update. It will
 550 * typically be called from the filesystem's setattr function when
 551 * ATTR_SIZE is passed in.
552 *
553 * Must be called with inode_mutex held and after all filesystem
554 * specific block truncation has been performed.
555 */
556void truncate_setsize(struct inode *inode, loff_t newsize)
557{
558 loff_t oldsize;
559
560 oldsize = inode->i_size;
561 i_size_write(inode, newsize);
562
563 truncate_pagecache(inode, oldsize, newsize);
564}
565EXPORT_SYMBOL(truncate_setsize);
566
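
truncate_setsize() bundles the i_size update and the pagecache truncation so callers cannot get the ordering wrong. A toy userspace model of that ordering (the partially covered last page stays cached, as in the real code, which additionally zeroes its tail; the structures here are stand-ins, not kernel types):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define NR_CACHE_PAGES 8

struct inode {
        unsigned long long i_size;
        int page_present[NR_CACHE_PAGES];   /* toy page cache for this file */
};

/* Drop cached pages that lie entirely beyond the new size. */
static void truncate_pagecache(struct inode *inode, unsigned long long newsize)
{
        unsigned long first = (newsize + PAGE_SIZE - 1) / PAGE_SIZE;

        for (unsigned long i = first; i < NR_CACHE_PAGES; i++)
                inode->page_present[i] = 0;
}

/* i_size must be updated before the pagecache is truncated. */
static void truncate_setsize(struct inode *inode, unsigned long long newsize)
{
        inode->i_size = newsize;
        truncate_pagecache(inode, newsize);
}

int main(void)
{
        struct inode inode = { .i_size = 8 * PAGE_SIZE };

        for (int i = 0; i < NR_CACHE_PAGES; i++)
                inode.page_present[i] = 1;

        truncate_setsize(&inode, 3 * PAGE_SIZE + 100);   /* shrink the file */
        printf("i_size=%llu, pages kept: ", inode.i_size);
        for (int i = 0; i < NR_CACHE_PAGES; i++)
                if (inode.page_present[i])
                        printf("%d ", i);
        printf("\n");
        return 0;
}
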
567/**
544 * vmtruncate - unmap mappings "freed" by truncate() syscall 568 * vmtruncate - unmap mappings "freed" by truncate() syscall
545 * @inode: inode of the file used 569 * @inode: inode of the file used
546 * @offset: file offset to start truncating 570 * @offset: file offset to start truncating
547 * 571 *
548 * NOTE! We have to be ready to update the memory sharing 572 * This function is deprecated and truncate_setsize or truncate_pagecache
549 * between the file and the memory map for a potential last 573 * should be used instead, together with filesystem specific block truncation.
550 * incomplete page. Ugly, but necessary.
551 *
552 * This function is deprecated and simple_setsize or truncate_pagecache
553 * should be used instead.
554 */ 574 */
555int vmtruncate(struct inode *inode, loff_t offset) 575int vmtruncate(struct inode *inode, loff_t offset)
556{ 576{
557 int error; 577 int error;
558 578
559 error = simple_setsize(inode, offset); 579 error = inode_newsize_ok(inode, offset);
560 if (error) 580 if (error)
561 return error; 581 return error;
562 582
583 truncate_setsize(inode, offset);
563 if (inode->i_op->truncate) 584 if (inode->i_op->truncate)
564 inode->i_op->truncate(inode); 585 inode->i_op->truncate(inode);
565 586 return 0;
566 return error;
567} 587}
568EXPORT_SYMBOL(vmtruncate); 588EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/util.c b/mm/util.c
index f5712e8964be..4735ea481816 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n)
225 if (length > n) 225 if (length > n)
226 return ERR_PTR(-EINVAL); 226 return ERR_PTR(-EINVAL);
227 227
228 p = kmalloc(length, GFP_KERNEL); 228 p = memdup_user(s, length);
229 229
230 if (!p) 230 if (IS_ERR(p))
231 return ERR_PTR(-ENOMEM); 231 return p;
232
233 if (copy_from_user(p, s, length)) {
234 kfree(p);
235 return ERR_PTR(-EFAULT);
236 }
237 232
238 p[length - 1] = '\0'; 233 p[length - 1] = '\0';
239 234
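
strndup_user() now delegates the allocate-then-copy-or-fail sequence to memdup_user(), which hands back either the buffer or an encoded error pointer. A hedged userspace analogue of that helper, with errno-encoded pointers standing in for ERR_PTR()/IS_ERR() and memcpy() for copy_from_user():

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Encode small negative errors in the pointer value, like ERR_PTR()/IS_ERR(). */
static void *err_ptr(long err)      { return (void *)err; }
static int   is_err(const void *p)  { return (uintptr_t)p >= (uintptr_t)-4095; }

/* Duplicate 'len' bytes of 'src' into a fresh allocation, or return an error pointer. */
static void *memdup_user(const void *src, size_t len)
{
        void *p = malloc(len);

        if (!p)
                return err_ptr(-ENOMEM);
        memcpy(p, src, len);    /* copy_from_user() can additionally fault -> -EFAULT */
        return p;
}

int main(void)
{
        const char user_buf[] = "hello";
        char *p = memdup_user(user_buf, sizeof(user_buf));

        if (is_err(p))
                return 1;
        p[sizeof(user_buf) - 1] = '\0';    /* strndup_user() terminates the copy */
        printf("%s\n", p);
        free(p);
        return 0;
}
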
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b7e314b1009f..6b8889da69a6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
33 33
34bool vmap_lazy_unmap __read_mostly = true;
34 35
35/*** Page table manipulation functions ***/ 36/*** Page table manipulation functions ***/
36 37
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
502{ 503{
503 unsigned int log; 504 unsigned int log;
504 505
506 if (!vmap_lazy_unmap)
507 return 0;
508
505 log = fls(num_online_cpus()); 509 log = fls(num_online_cpus());
506 510
507 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 511 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -732,7 +736,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
732 node, gfp_mask); 736 node, gfp_mask);
733 if (unlikely(IS_ERR(va))) { 737 if (unlikely(IS_ERR(va))) {
734 kfree(vb); 738 kfree(vb);
735 return ERR_PTR(PTR_ERR(va)); 739 return ERR_CAST(va);
736 } 740 }
737 741
738 err = radix_tree_preload(gfp_mask); 742 err = radix_tree_preload(gfp_mask);
@@ -2437,8 +2441,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2437 unsigned int *ptr = NULL; 2441 unsigned int *ptr = NULL;
2438 int ret; 2442 int ret;
2439 2443
2440 if (NUMA_BUILD) 2444 if (NUMA_BUILD) {
2441 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2445 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2446 if (ptr == NULL)
2447 return -ENOMEM;
2448 }
2442 ret = seq_open(file, &vmalloc_op); 2449 ret = seq_open(file, &vmalloc_op);
2443 if (!ret) { 2450 if (!ret) {
2444 struct seq_file *m = file->private_data; 2451 struct seq_file *m = file->private_data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..c391c320dbaf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
48 48
49#include "internal.h" 49#include "internal.h"
50 50
51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h>
53
51struct scan_control { 54struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 55 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 56 unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
398 /* synchronous write or broken a_ops? */ 401 /* synchronous write or broken a_ops? */
399 ClearPageReclaim(page); 402 ClearPageReclaim(page);
400 } 403 }
404 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback));
401 inc_zone_page_state(page, NR_VMSCAN_WRITE); 406 inc_zone_page_state(page, NR_VMSCAN_WRITE);
402 return PAGE_SUCCESS; 407 return PAGE_SUCCESS;
403 } 408 }
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
617 return PAGEREF_RECLAIM; 622 return PAGEREF_RECLAIM;
618} 623}
619 624
625static noinline_for_stack void free_page_list(struct list_head *free_pages)
626{
627 struct pagevec freed_pvec;
628 struct page *page, *tmp;
629
630 pagevec_init(&freed_pvec, 1);
631
632 list_for_each_entry_safe(page, tmp, free_pages, lru) {
633 list_del(&page->lru);
634 if (!pagevec_add(&freed_pvec, page)) {
635 __pagevec_free(&freed_pvec);
636 pagevec_reinit(&freed_pvec);
637 }
638 }
639
640 pagevec_free(&freed_pvec);
641}
642
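
shrink_page_list() now parks every page it frees on a local free_pages list and lets free_page_list() above release them in pagevec-sized bursts at the end. The shape of that pattern as a userspace sketch, with a singly linked list and a fixed-size batch buffer standing in for the LRU list and the pagevec:

#include <stdio.h>
#include <stdlib.h>

#define BATCH 4                          /* stands in for PAGEVEC_SIZE */

struct page { struct page *next; int id; };

/* Release the whole list in batches, like free_page_list()'s pagevec loop. */
static void free_page_list(struct page *head)
{
        struct page *batch[BATCH];
        int n = 0;

        while (head) {
                struct page *page = head;

                head = head->next;
                batch[n++] = page;
                if (n == BATCH) {        /* batch full: flush it */
                        for (int i = 0; i < n; i++)
                                free(batch[i]);
                        printf("flushed %d pages\n", n);
                        n = 0;
                }
        }
        for (int i = 0; i < n; i++)      /* final partial batch */
                free(batch[i]);
        if (n)
                printf("flushed %d pages\n", n);
}

int main(void)
{
        struct page *head = NULL;

        for (int i = 0; i < 10; i++) {   /* pages "reclaimed" by the shrinker */
                struct page *p = malloc(sizeof(*p));
                p->id = i;
                p->next = head;
                head = p;
        }
        free_page_list(head);
        return 0;
}
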
620/* 643/*
621 * shrink_page_list() returns the number of reclaimed pages 644 * shrink_page_list() returns the number of reclaimed pages
622 */ 645 */
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 enum pageout_io sync_writeback) 648 enum pageout_io sync_writeback)
626{ 649{
627 LIST_HEAD(ret_pages); 650 LIST_HEAD(ret_pages);
628 struct pagevec freed_pvec; 651 LIST_HEAD(free_pages);
629 int pgactivate = 0; 652 int pgactivate = 0;
630 unsigned long nr_reclaimed = 0; 653 unsigned long nr_reclaimed = 0;
631 654
632 cond_resched(); 655 cond_resched();
633 656
634 pagevec_init(&freed_pvec, 1);
635 while (!list_empty(page_list)) { 657 while (!list_empty(page_list)) {
636 enum page_references references; 658 enum page_references references;
637 struct address_space *mapping; 659 struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 __clear_page_locked(page); 828 __clear_page_locked(page);
807free_it: 829free_it:
808 nr_reclaimed++; 830 nr_reclaimed++;
809 if (!pagevec_add(&freed_pvec, page)) { 831
810 __pagevec_free(&freed_pvec); 832 /*
811 pagevec_reinit(&freed_pvec); 833 * Is there need to periodically free_page_list? It would
812 } 834 * appear not as the counts should be low
835 */
836 list_add(&page->lru, &free_pages);
813 continue; 837 continue;
814 838
815cull_mlocked: 839cull_mlocked:
@@ -832,9 +856,10 @@ keep:
832 list_add(&page->lru, &ret_pages); 856 list_add(&page->lru, &ret_pages);
833 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
834 } 858 }
859
860 free_page_list(&free_pages);
861
835 list_splice(&ret_pages, page_list); 862 list_splice(&ret_pages, page_list);
836 if (pagevec_count(&freed_pvec))
837 __pagevec_free(&freed_pvec);
838 count_vm_events(PGACTIVATE, pgactivate); 863 count_vm_events(PGACTIVATE, pgactivate);
839 return nr_reclaimed; 864 return nr_reclaimed;
840} 865}
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
916 unsigned long *scanned, int order, int mode, int file) 941 unsigned long *scanned, int order, int mode, int file)
917{ 942{
918 unsigned long nr_taken = 0; 943 unsigned long nr_taken = 0;
944 unsigned long nr_lumpy_taken = 0;
945 unsigned long nr_lumpy_dirty = 0;
946 unsigned long nr_lumpy_failed = 0;
919 unsigned long scan; 947 unsigned long scan;
920 948
921 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 949 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
993 list_move(&cursor_page->lru, dst); 1021 list_move(&cursor_page->lru, dst);
994 mem_cgroup_del_lru(cursor_page); 1022 mem_cgroup_del_lru(cursor_page);
995 nr_taken++; 1023 nr_taken++;
1024 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++;
996 scan++; 1027 scan++;
1028 } else {
1029 if (mode == ISOLATE_BOTH &&
1030 page_count(cursor_page))
1031 nr_lumpy_failed++;
997 } 1032 }
998 } 1033 }
999 } 1034 }
1000 1035
1001 *scanned = scan; 1036 *scanned = scan;
1037
1038 trace_mm_vmscan_lru_isolate(order,
1039 nr_to_scan, scan,
1040 nr_taken,
1041 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1042 mode);
1002 return nr_taken; 1043 return nr_taken;
1003} 1044}
1004 1045
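
The isolate_lru_pages() hunk above accumulates lumpy-reclaim statistics during the scan (pages taken through order expansion, how many of those were dirty, how many contiguous blocks failed) and emits a single trace event once the scan is done. A hedged userspace sketch of that aggregate-then-report pattern follows; the page numbering, is_dirty(), can_isolate() and report_isolation() are invented for illustration only.

/*
 * Sketch of "aggregate during the scan, report once at the end".
 * Everything here is a stand-in, not kernel code.
 */
#include <stdio.h>

struct isolate_stats {
	unsigned long taken;
	unsigned long lumpy_taken;
	unsigned long lumpy_dirty;
	unsigned long lumpy_failed;
};

static int is_dirty(int page)    { return page % 3 == 0; }  /* assumption */
static int can_isolate(int page) { return page % 5 != 0; }  /* assumption */

/* Stand-in for the single tracepoint fired after the whole scan. */
static void report_isolation(const struct isolate_stats *st)
{
	printf("taken=%lu lumpy_taken=%lu lumpy_dirty=%lu lumpy_failed=%lu\n",
	       st->taken, st->lumpy_taken, st->lumpy_dirty, st->lumpy_failed);
}

int main(void)
{
	struct isolate_stats st = { 0 };

	/* Base pages 0..9, each with 3 "neighbouring" lumpy candidates. */
	for (int base = 0; base < 10; base++) {
		st.taken++;
		for (int n = 1; n <= 3; n++) {
			int cursor = base * 4 + n;

			if (can_isolate(cursor)) {
				st.taken++;
				st.lumpy_taken++;
				if (is_dirty(cursor))
					st.lumpy_dirty++;
			} else {
				st.lumpy_failed++;
			}
		}
	}
	report_isolation(&st);	/* one summary event, not one per page */
	return 0;
}
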
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1035 ClearPageActive(page); 1076 ClearPageActive(page);
1036 nr_active++; 1077 nr_active++;
1037 } 1078 }
1038 count[lru]++; 1079 if (count)
1080 count[lru]++;
1039 } 1081 }
1040 1082
1041 return nr_active; 1083 return nr_active;
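
clear_active_flags() now tolerates a NULL count array, so the synchronous-retry path can reuse it without maintaining per-LRU counts it does not need. The sketch below shows the same optional-output pattern in isolation; scan_items(), classify() and the item list are assumptions for the example only.

/*
 * Optional-output pattern: the per-class counters are only maintained
 * when the caller passes a non-NULL array.
 */
#include <stdio.h>
#include <stddef.h>

#define NR_CLASSES 4

static int classify(int item) { return item % NR_CLASSES; } /* assumption */

static unsigned long scan_items(const int *items, size_t n,
				unsigned int *count /* may be NULL */)
{
	unsigned long nr_special = 0;

	for (size_t i = 0; i < n; i++) {
		if (items[i] % 2)		/* stand-in for PageActive() */
			nr_special++;
		if (count)			/* optional bookkeeping */
			count[classify(items[i])]++;
	}
	return nr_special;
}

int main(void)
{
	int items[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	unsigned int count[NR_CLASSES] = { 0 };

	/* First pass wants the per-class breakdown... */
	printf("special=%lu\n", scan_items(items, 8, count));
	/* ...the retry pass only needs the total. */
	printf("special=%lu\n", scan_items(items, 8, NULL));
	return 0;
}
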
@@ -1112,174 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
1112} 1154}
1113 1155
1114/* 1156/*
1115 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1157 * TODO: Try merging with migration's version of putback_lru_pages
1116 * of reclaimed pages
1117 */ 1158 */
1118static unsigned long shrink_inactive_list(unsigned long max_scan, 1159static noinline_for_stack void
1119 struct zone *zone, struct scan_control *sc, 1160putback_lru_pages(struct zone *zone, struct scan_control *sc,
1120 int priority, int file) 1161 unsigned long nr_anon, unsigned long nr_file,
1162 struct list_head *page_list)
1121{ 1163{
1122 LIST_HEAD(page_list); 1164 struct page *page;
1123 struct pagevec pvec; 1165 struct pagevec pvec;
1124 unsigned long nr_scanned = 0;
1125 unsigned long nr_reclaimed = 0;
1126 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1166 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1127 1167
1128 while (unlikely(too_many_isolated(zone, file, sc))) { 1168 pagevec_init(&pvec, 1);
1129 congestion_wait(BLK_RW_ASYNC, HZ/10);
1130 1169
1131 /* We are about to die and free our memory. Return now. */ 1170 /*
1132 if (fatal_signal_pending(current)) 1171 * Put back any unfreeable pages.
1133 return SWAP_CLUSTER_MAX; 1172 */
1173 spin_lock(&zone->lru_lock);
1174 while (!list_empty(page_list)) {
1175 int lru;
1176 page = lru_to_page(page_list);
1177 VM_BUG_ON(PageLRU(page));
1178 list_del(&page->lru);
1179 if (unlikely(!page_evictable(page, NULL))) {
1180 spin_unlock_irq(&zone->lru_lock);
1181 putback_lru_page(page);
1182 spin_lock_irq(&zone->lru_lock);
1183 continue;
1184 }
1185 SetPageLRU(page);
1186 lru = page_lru(page);
1187 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++;
1191 }
1192 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock);
1194 __pagevec_release(&pvec);
1195 spin_lock_irq(&zone->lru_lock);
1196 }
1134 } 1197 }
1198 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1199 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1200
1201 spin_unlock_irq(&zone->lru_lock);
1202 pagevec_release(&pvec);
1203}
1135 1204
1205static noinline_for_stack void update_isolated_counts(struct zone *zone,
1206 struct scan_control *sc,
1207 unsigned long *nr_anon,
1208 unsigned long *nr_file,
1209 struct list_head *isolated_list)
1210{
1211 unsigned long nr_active;
1212 unsigned int count[NR_LRU_LISTS] = { 0, };
1213 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1136 1214
1137 pagevec_init(&pvec, 1); 1215 nr_active = clear_active_flags(isolated_list, count);
1216 __count_vm_events(PGDEACTIVATE, nr_active);
1138 1217
1139 lru_add_drain(); 1218 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1140 spin_lock_irq(&zone->lru_lock); 1219 -count[LRU_ACTIVE_FILE]);
1141 do { 1220 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1142 struct page *page; 1221 -count[LRU_INACTIVE_FILE]);
1143 unsigned long nr_taken; 1222 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1144 unsigned long nr_scan; 1223 -count[LRU_ACTIVE_ANON]);
1145 unsigned long nr_freed; 1224 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1146 unsigned long nr_active; 1225 -count[LRU_INACTIVE_ANON]);
1147 unsigned int count[NR_LRU_LISTS] = { 0, };
1148 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1149 unsigned long nr_anon;
1150 unsigned long nr_file;
1151 1226
1152 if (scanning_global_lru(sc)) { 1227 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1153 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, 1228 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1154 &page_list, &nr_scan, 1229 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1155 sc->order, mode, 1230 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1156 zone, 0, file);
1157 zone->pages_scanned += nr_scan;
1158 if (current_is_kswapd())
1159 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1160 nr_scan);
1161 else
1162 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1163 nr_scan);
1164 } else {
1165 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1166 &page_list, &nr_scan,
1167 sc->order, mode,
1168 zone, sc->mem_cgroup,
1169 0, file);
1170 /*
1171 * mem_cgroup_isolate_pages() keeps track of
1172 * scanned pages on its own.
1173 */
1174 }
1175 1231
1176 if (nr_taken == 0) 1232 reclaim_stat->recent_scanned[0] += *nr_anon;
1177 goto done; 1233 reclaim_stat->recent_scanned[1] += *nr_file;
1234}
1178 1235
1179 nr_active = clear_active_flags(&page_list, count); 1236/*
1180 __count_vm_events(PGDEACTIVATE, nr_active); 1237 * Returns true if the caller should wait to clean dirty/writeback pages.
1238 *
1239 * If we are direct reclaiming for contiguous pages and we do not reclaim
1240 * everything in the list, try again and wait for writeback IO to complete.
1241 * This will stall high-order allocations noticeably. Only do that when we really
1242 * need to free the pages under high memory pressure.
1243 */
1244static inline bool should_reclaim_stall(unsigned long nr_taken,
1245 unsigned long nr_freed,
1246 int priority,
1247 struct scan_control *sc)
1248{
1249 int lumpy_stall_priority;
1181 1250
1182 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1251 /* kswapd should not stall on sync IO */
1183 -count[LRU_ACTIVE_FILE]); 1252 if (current_is_kswapd())
1184 __mod_zone_page_state(zone, NR_INACTIVE_FILE, 1253 return false;
1185 -count[LRU_INACTIVE_FILE]);
1186 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1187 -count[LRU_ACTIVE_ANON]);
1188 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1189 -count[LRU_INACTIVE_ANON]);
1190 1254
1191 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1255 /* Only stall on lumpy reclaim */
1192 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1256 if (!sc->lumpy_reclaim_mode)
1193 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1257 return false;
1194 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1195 1258
1196 reclaim_stat->recent_scanned[0] += nr_anon; 1259 /* If we have reclaimed everything on the isolated list, no stall */
1197 reclaim_stat->recent_scanned[1] += nr_file; 1260 if (nr_freed == nr_taken)
1261 return false;
1198 1262
1199 spin_unlock_irq(&zone->lru_lock); 1263 /*
1264 * For high-order allocations, there are two stall thresholds.
1265 * High-cost allocations stall immediately where as lower
1266 * order allocations such as stacks require the scanning
1267 * priority to be much higher before stalling.
1268 */
1269 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1270 lumpy_stall_priority = DEF_PRIORITY;
1271 else
1272 lumpy_stall_priority = DEF_PRIORITY / 3;
1200 1273
1201 nr_scanned += nr_scan; 1274 return priority <= lumpy_stall_priority;
1202 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1275}
1203 1276
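
should_reclaim_stall() above encodes two stall thresholds: allocations above PAGE_ALLOC_COSTLY_ORDER may stall on the very first pass, while cheaper orders stall only once the scanning priority has fallen to DEF_PRIORITY / 3 or below. The standalone sketch below reproduces that decision using the kernel's usual constants of the era (DEF_PRIORITY = 12, PAGE_ALLOC_COSTLY_ORDER = 3); everything else in it is a simplified model, not kernel code.

/*
 * Standalone model of the stall decision in should_reclaim_stall().
 */
#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY		12
#define PAGE_ALLOC_COSTLY_ORDER	3

static bool should_stall(unsigned long nr_taken, unsigned long nr_freed,
			 int priority, int order,
			 bool lumpy_reclaim, bool is_kswapd)
{
	int lumpy_stall_priority;

	if (is_kswapd)			/* kswapd never does sync stalls */
		return false;
	if (!lumpy_reclaim)		/* only lumpy reclaim stalls */
		return false;
	if (nr_freed == nr_taken)	/* reclaimed everything: no need */
		return false;

	/* Costly orders stall right away, cheaper orders only under pressure. */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		lumpy_stall_priority = DEF_PRIORITY;
	else
		lumpy_stall_priority = DEF_PRIORITY / 3;

	return priority <= lumpy_stall_priority;
}

int main(void)
{
	/* order-9 (huge page): stalls even on the first pass (priority 12). */
	printf("%d\n", should_stall(32, 10, 12, 9, true, false));
	/* order-2 stack: no stall at priority 12... */
	printf("%d\n", should_stall(32, 10, 12, 2, true, false));
	/* ...but stalls once priority has dropped to 4 or below. */
	printf("%d\n", should_stall(32, 10, 4, 2, true, false));
	return 0;
}
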
1277/*
1278 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1279 * of reclaimed pages
1280 */
1281static noinline_for_stack unsigned long
1282shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1283 struct scan_control *sc, int priority, int file)
1284{
1285 LIST_HEAD(page_list);
1286 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon;
1291 unsigned long nr_file;
1292
1293 while (unlikely(too_many_isolated(zone, file, sc))) {
1294 congestion_wait(BLK_RW_ASYNC, HZ/10);
1295
1296 /* We are about to die and free our memory. Return now. */
1297 if (fatal_signal_pending(current))
1298 return SWAP_CLUSTER_MAX;
1299 }
1300
1301
1302 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock);
1304
1305 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file);
1311 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd())
1313 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1314 nr_scanned);
1315 else
1316 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1317 nr_scanned);
1318 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup,
1324 0, file);
1204 /* 1325 /*
1205 * If we are direct reclaiming for contiguous pages and we do 1326 * mem_cgroup_isolate_pages() keeps track of
1206 * not reclaim everything in the list, try again and wait 1327 * scanned pages on its own.
1207 * for IO to complete. This will stall high-order allocations
1208 * but that should be acceptable to the caller
1209 */ 1328 */
1210 if (nr_freed < nr_taken && !current_is_kswapd() && 1329 }
1211 sc->lumpy_reclaim_mode) {
1212 congestion_wait(BLK_RW_ASYNC, HZ/10);
1213 1330
1214 /* 1331 if (nr_taken == 0) {
1215 * The attempt at page out may have made some 1332 spin_unlock_irq(&zone->lru_lock);
1216 * of the pages active, mark them inactive again. 1333 return 0;
1217 */ 1334 }
1218 nr_active = clear_active_flags(&page_list, count);
1219 count_vm_events(PGDEACTIVATE, nr_active);
1220 1335
1221 nr_freed += shrink_page_list(&page_list, sc, 1336 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1222 PAGEOUT_IO_SYNC);
1223 }
1224 1337
1225 nr_reclaimed += nr_freed; 1338 spin_unlock_irq(&zone->lru_lock);
1226 1339
1227 local_irq_disable(); 1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
1228 if (current_is_kswapd()) 1341
1229 __count_vm_events(KSWAPD_STEAL, nr_freed); 1342 /* Check if we should synchronously wait for writeback */
1230 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10);
1231 1345
1232 spin_lock(&zone->lru_lock);
1233 /* 1346 /*
1234 * Put back any unfreeable pages. 1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1235 */ 1349 */
1236 while (!list_empty(&page_list)) { 1350 nr_active = clear_active_flags(&page_list, NULL);
1237 int lru; 1351 count_vm_events(PGDEACTIVATE, nr_active);
1238 page = lru_to_page(&page_list);
1239 VM_BUG_ON(PageLRU(page));
1240 list_del(&page->lru);
1241 if (unlikely(!page_evictable(page, NULL))) {
1242 spin_unlock_irq(&zone->lru_lock);
1243 putback_lru_page(page);
1244 spin_lock_irq(&zone->lru_lock);
1245 continue;
1246 }
1247 SetPageLRU(page);
1248 lru = page_lru(page);
1249 add_page_to_lru_list(zone, page, lru);
1250 if (is_active_lru(lru)) {
1251 int file = is_file_lru(lru);
1252 reclaim_stat->recent_rotated[file]++;
1253 }
1254 if (!pagevec_add(&pvec, page)) {
1255 spin_unlock_irq(&zone->lru_lock);
1256 __pagevec_release(&pvec);
1257 spin_lock_irq(&zone->lru_lock);
1258 }
1259 }
1260 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1261 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1262 1352
1263 } while (nr_scanned < max_scan); 1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 }
1264 1355
1265done: 1356 local_irq_disable();
1266 spin_unlock_irq(&zone->lru_lock); 1357 if (current_is_kswapd())
1267 pagevec_release(&pvec); 1358 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1268 return nr_reclaimed; 1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1269}
1270 1360
1271/* 1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1272 * We are about to scan this zone at a certain priority level. If that priority 1362 return nr_reclaimed;
1273 * level is smaller (ie: more urgent) than the previous priority, then note
1274 * that priority level within the zone. This is done so that when the next
1275 * process comes in to scan this zone, it will immediately start out at this
1276 * priority level rather than having to build up its own scanning priority.
1277 * Here, this priority affects only the reclaim-mapped threshold.
1278 */
1279static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1280{
1281 if (priority < zone->prev_priority)
1282 zone->prev_priority = priority;
1283} 1363}
1284 1364
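
Taken together, the rewritten shrink_inactive_list() reduces to: isolate a batch, update the isolated counters, reclaim asynchronously, optionally stall and retry synchronously, account the steals, then put back whatever survived. A compressed userspace outline of that flow is sketched below; every helper in it is a stub standing in for the kernel function named in the comment, not its real signature.

/*
 * Condensed outline of the refactored shrink_inactive_list() flow.
 * All helpers are stubs for illustration.
 */
#include <stdio.h>
#include <stdbool.h>

static unsigned long isolate(void)              { return 32; }
static unsigned long reclaim_async(void)        { return 20; }
static unsigned long reclaim_sync(void)         { return 8; }
static bool stall(unsigned long t, unsigned long f) { return f < t; }
static void putback(void)                       { puts("putback leftovers"); }

int main(void)
{
	unsigned long taken, freed;

	taken = isolate();		/* isolate_pages_global()/memcg variant */
	if (!taken)
		return 0;
	freed = reclaim_async();	/* shrink_page_list(..., PAGEOUT_IO_ASYNC) */
	if (stall(taken, freed))	/* should_reclaim_stall() */
		freed += reclaim_sync(); /* wait, then shrink_page_list(..., SYNC) */
	putback();			/* putback_lru_pages() */
	printf("reclaimed %lu of %lu\n", freed, taken);
	return 0;
}
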
1285/* 1365/*
@@ -1583,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1583 } 1663 }
1584 1664
1585 /* 1665 /*
1666 * With swappiness at 100, anonymous and file have the same priority.
1667 * This scanning priority is essentially the inverse of IO cost.
1668 */
1669 anon_prio = sc->swappiness;
1670 file_prio = 200 - sc->swappiness;
1671
1672 /*
1586 * OK, so we have swap space and a fair amount of page cache 1673 * OK, so we have swap space and a fair amount of page cache
1587 * pages. We use the recently rotated / recently scanned 1674 * pages. We use the recently rotated / recently scanned
1588 * ratios to determine how valuable each cache is. 1675 * ratios to determine how valuable each cache is.
@@ -1593,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1593 * 1680 *
1594 * anon in [0], file in [1] 1681 * anon in [0], file in [1]
1595 */ 1682 */
1683 spin_lock_irq(&zone->lru_lock);
1596 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1684 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1597 spin_lock_irq(&zone->lru_lock);
1598 reclaim_stat->recent_scanned[0] /= 2; 1685 reclaim_stat->recent_scanned[0] /= 2;
1599 reclaim_stat->recent_rotated[0] /= 2; 1686 reclaim_stat->recent_rotated[0] /= 2;
1600 spin_unlock_irq(&zone->lru_lock);
1601 } 1687 }
1602 1688
1603 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1689 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1604 spin_lock_irq(&zone->lru_lock);
1605 reclaim_stat->recent_scanned[1] /= 2; 1690 reclaim_stat->recent_scanned[1] /= 2;
1606 reclaim_stat->recent_rotated[1] /= 2; 1691 reclaim_stat->recent_rotated[1] /= 2;
1607 spin_unlock_irq(&zone->lru_lock);
1608 } 1692 }
1609 1693
1610 /* 1694 /*
1611 * With swappiness at 100, anonymous and file have the same priority.
1612 * This scanning priority is essentially the inverse of IO cost.
1613 */
1614 anon_prio = sc->swappiness;
1615 file_prio = 200 - sc->swappiness;
1616
1617 /*
1618 * The amount of pressure on anon vs file pages is inversely 1695 * The amount of pressure on anon vs file pages is inversely
1619 * proportional to the fraction of recently scanned pages on 1696 * proportional to the fraction of recently scanned pages on
1620 * each list that were recently referenced and in active use. 1697 * each list that were recently referenced and in active use.
@@ -1624,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1624 1701
1625 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1702 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1626 fp /= reclaim_stat->recent_rotated[1] + 1; 1703 fp /= reclaim_stat->recent_rotated[1] + 1;
1704 spin_unlock_irq(&zone->lru_lock);
1627 1705
1628 fraction[0] = ap; 1706 fraction[0] = ap;
1629 fraction[1] = fp; 1707 fraction[1] = fp;
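
get_scan_count() now takes the lru_lock once around both decay checks and the ap/fp computation. The weighting itself is unchanged: anon_prio and file_prio always sum to 200, and each side is scaled by the ratio of recently scanned to recently rotated pages. A worked example with swappiness = 60 and made-up reclaim_stat samples:

/*
 * Worked example of the ap/fp weighting; the recent_* numbers are
 * invented, only the formula mirrors the diff above.
 */
#include <stdio.h>

int main(void)
{
	unsigned int swappiness = 60;			/* default vm.swappiness */
	unsigned long anon_prio = swappiness;		/* 60 */
	unsigned long file_prio = 200 - swappiness;	/* 140 */

	/* Hypothetical reclaim_stat samples for one zone: [0]=anon [1]=file. */
	unsigned long recent_scanned[2] = { 1000, 4000 };
	unsigned long recent_rotated[2] = {  800,  500 };

	unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1) /
			   (recent_rotated[0] + 1);
	unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1) /
			   (recent_rotated[1] + 1);

	/* Scan pressure is split in the ratio ap : fp. */
	printf("anon weight %lu, file weight %lu -> file gets %lu%% of the scan\n",
	       ap, fp, 100 * fp / (ap + fp));
	return 0;
}
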
@@ -1729,13 +1807,12 @@ static void shrink_zone(int priority, struct zone *zone,
1729static bool shrink_zones(int priority, struct zonelist *zonelist, 1807static bool shrink_zones(int priority, struct zonelist *zonelist,
1730 struct scan_control *sc) 1808 struct scan_control *sc)
1731{ 1809{
1732 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1733 struct zoneref *z; 1810 struct zoneref *z;
1734 struct zone *zone; 1811 struct zone *zone;
1735 bool all_unreclaimable = true; 1812 bool all_unreclaimable = true;
1736 1813
1737 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1814 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1738 sc->nodemask) { 1815 gfp_zone(sc->gfp_mask), sc->nodemask) {
1739 if (!populated_zone(zone)) 1816 if (!populated_zone(zone))
1740 continue; 1817 continue;
1741 /* 1818 /*
@@ -1745,17 +1822,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
1745 if (scanning_global_lru(sc)) { 1822 if (scanning_global_lru(sc)) {
1746 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1823 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1747 continue; 1824 continue;
1748 note_zone_scanning_priority(zone, priority);
1749
1750 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1825 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1751 continue; /* Let kswapd poll it */ 1826 continue; /* Let kswapd poll it */
1752 } else {
1753 /*
1754 * Ignore cpuset limitation here. We just want to reduce
1755 * # of used pages by us regardless of memory shortage.
1756 */
1757 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1758 priority);
1759 } 1827 }
1760 1828
1761 shrink_zone(priority, zone, sc); 1829 shrink_zone(priority, zone, sc);
@@ -1787,10 +1855,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1787 bool all_unreclaimable; 1855 bool all_unreclaimable;
1788 unsigned long total_scanned = 0; 1856 unsigned long total_scanned = 0;
1789 struct reclaim_state *reclaim_state = current->reclaim_state; 1857 struct reclaim_state *reclaim_state = current->reclaim_state;
1790 unsigned long lru_pages = 0;
1791 struct zoneref *z; 1858 struct zoneref *z;
1792 struct zone *zone; 1859 struct zone *zone;
1793 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1794 unsigned long writeback_threshold; 1860 unsigned long writeback_threshold;
1795 1861
1796 get_mems_allowed(); 1862 get_mems_allowed();
@@ -1798,18 +1864,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1798 1864
1799 if (scanning_global_lru(sc)) 1865 if (scanning_global_lru(sc))
1800 count_vm_event(ALLOCSTALL); 1866 count_vm_event(ALLOCSTALL);
1801 /*
1802 * mem_cgroup will not do shrink_slab.
1803 */
1804 if (scanning_global_lru(sc)) {
1805 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1806
1807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1808 continue;
1809
1810 lru_pages += zone_reclaimable_pages(zone);
1811 }
1812 }
1813 1867
1814 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1868 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1815 sc->nr_scanned = 0; 1869 sc->nr_scanned = 0;
@@ -1821,6 +1875,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1821 * over limit cgroups 1875 * over limit cgroups
1822 */ 1876 */
1823 if (scanning_global_lru(sc)) { 1877 if (scanning_global_lru(sc)) {
1878 unsigned long lru_pages = 0;
1879 for_each_zone_zonelist(zone, z, zonelist,
1880 gfp_zone(sc->gfp_mask)) {
1881 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1882 continue;
1883
1884 lru_pages += zone_reclaimable_pages(zone);
1885 }
1886
1824 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1887 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1825 if (reclaim_state) { 1888 if (reclaim_state) {
1826 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 1889 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1924,6 @@ out:
1861 if (priority < 0) 1924 if (priority < 0)
1862 priority = 0; 1925 priority = 0;
1863 1926
1864 if (scanning_global_lru(sc)) {
1865 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1866
1867 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1868 continue;
1869
1870 zone->prev_priority = priority;
1871 }
1872 } else
1873 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1874
1875 delayacct_freepages_end(); 1927 delayacct_freepages_end();
1876 put_mems_allowed(); 1928 put_mems_allowed();
1877 1929
@@ -1888,6 +1940,7 @@ out:
1888unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 1940unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1889 gfp_t gfp_mask, nodemask_t *nodemask) 1941 gfp_t gfp_mask, nodemask_t *nodemask)
1890{ 1942{
1943 unsigned long nr_reclaimed;
1891 struct scan_control sc = { 1944 struct scan_control sc = {
1892 .gfp_mask = gfp_mask, 1945 .gfp_mask = gfp_mask,
1893 .may_writepage = !laptop_mode, 1946 .may_writepage = !laptop_mode,
@@ -1900,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1900 .nodemask = nodemask, 1953 .nodemask = nodemask,
1901 }; 1954 };
1902 1955
1903 return do_try_to_free_pages(zonelist, &sc); 1956 trace_mm_vmscan_direct_reclaim_begin(order,
1957 sc.may_writepage,
1958 gfp_mask);
1959
1960 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
1961
1962 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1963
1964 return nr_reclaimed;
1904} 1965}
1905 1966
1906#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1967#ifdef CONFIG_CGROUP_MEM_RES_CTLR
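
try_to_free_pages() now stores the result in a local so the end tracepoint can report it; the call is simply bracketed by begin/end hooks. A trivial sketch of that wrapping, with printf stand-ins rather than the real tracepoints:

/*
 * trace_begin()/trace_end() are printf stand-ins; do_reclaim() stands in
 * for do_try_to_free_pages().
 */
#include <stdio.h>

static void trace_begin(int order, int may_writepage, unsigned int gfp)
{
	printf("reclaim begin: order=%d may_writepage=%d gfp=%#x\n",
	       order, may_writepage, gfp);
}

static void trace_end(unsigned long nr_reclaimed)
{
	printf("reclaim end: nr_reclaimed=%lu\n", nr_reclaimed);
}

static unsigned long do_reclaim(void) { return 42; }	/* stand-in */

int main(void)
{
	unsigned long nr_reclaimed;

	trace_begin(0, 1, 0xd0);
	nr_reclaimed = do_reclaim();	/* capture so the end event can use it */
	trace_end(nr_reclaimed);
	return 0;
}
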
@@ -1908,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1908unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 1969unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1909 gfp_t gfp_mask, bool noswap, 1970 gfp_t gfp_mask, bool noswap,
1910 unsigned int swappiness, 1971 unsigned int swappiness,
1911 struct zone *zone, int nid) 1972 struct zone *zone)
1912{ 1973{
1913 struct scan_control sc = { 1974 struct scan_control sc = {
1975 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1914 .may_writepage = !laptop_mode, 1976 .may_writepage = !laptop_mode,
1915 .may_unmap = 1, 1977 .may_unmap = 1,
1916 .may_swap = !noswap, 1978 .may_swap = !noswap,
@@ -1918,13 +1980,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1918 .order = 0, 1980 .order = 0,
1919 .mem_cgroup = mem, 1981 .mem_cgroup = mem,
1920 }; 1982 };
1921 nodemask_t nm = nodemask_of_node(nid);
1922
1923 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1983 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1924 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1984 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1925 sc.nodemask = &nm; 1985
1926 sc.nr_reclaimed = 0; 1986 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
1927 sc.nr_scanned = 0; 1987 sc.may_writepage,
1988 sc.gfp_mask);
1989
1928 /* 1990 /*
1929 * NOTE: Although we can get the priority field, using it 1991 * NOTE: Although we can get the priority field, using it
1930 * here is not a good idea, since it limits the pages we can scan. 1992 * here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1933 * the priority and make it zero. 1995 * the priority and make it zero.
1934 */ 1996 */
1935 shrink_zone(0, zone, &sc); 1997 shrink_zone(0, zone, &sc);
1998
1999 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2000
1936 return sc.nr_reclaimed; 2001 return sc.nr_reclaimed;
1937} 2002}
1938 2003
@@ -1942,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1942 unsigned int swappiness) 2007 unsigned int swappiness)
1943{ 2008{
1944 struct zonelist *zonelist; 2009 struct zonelist *zonelist;
2010 unsigned long nr_reclaimed;
1945 struct scan_control sc = { 2011 struct scan_control sc = {
1946 .may_writepage = !laptop_mode, 2012 .may_writepage = !laptop_mode,
1947 .may_unmap = 1, 2013 .may_unmap = 1,
@@ -1956,7 +2022,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1956 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2022 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1957 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2023 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1958 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2024 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1959 return do_try_to_free_pages(zonelist, &sc); 2025
2026 trace_mm_vmscan_memcg_reclaim_begin(0,
2027 sc.may_writepage,
2028 sc.gfp_mask);
2029
2030 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2031
2032 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2033
2034 return nr_reclaimed;
1960} 2035}
1961#endif 2036#endif
1962 2037
@@ -2028,22 +2103,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2028 .order = order, 2103 .order = order,
2029 .mem_cgroup = NULL, 2104 .mem_cgroup = NULL,
2030 }; 2105 };
2031 /*
2032 * temp_priority is used to remember the scanning priority at which
2033 * this zone was successfully refilled to
2034 * free_pages == high_wmark_pages(zone).
2035 */
2036 int temp_priority[MAX_NR_ZONES];
2037
2038loop_again: 2106loop_again:
2039 total_scanned = 0; 2107 total_scanned = 0;
2040 sc.nr_reclaimed = 0; 2108 sc.nr_reclaimed = 0;
2041 sc.may_writepage = !laptop_mode; 2109 sc.may_writepage = !laptop_mode;
2042 count_vm_event(PAGEOUTRUN); 2110 count_vm_event(PAGEOUTRUN);
2043 2111
2044 for (i = 0; i < pgdat->nr_zones; i++)
2045 temp_priority[i] = DEF_PRIORITY;
2046
2047 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2112 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2048 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2113 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2049 unsigned long lru_pages = 0; 2114 unsigned long lru_pages = 0;
@@ -2103,7 +2168,6 @@ loop_again:
2103 for (i = 0; i <= end_zone; i++) { 2168 for (i = 0; i <= end_zone; i++) {
2104 struct zone *zone = pgdat->node_zones + i; 2169 struct zone *zone = pgdat->node_zones + i;
2105 int nr_slab; 2170 int nr_slab;
2106 int nid, zid;
2107 2171
2108 if (!populated_zone(zone)) 2172 if (!populated_zone(zone))
2109 continue; 2173 continue;
@@ -2111,18 +2175,14 @@ loop_again:
2111 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2175 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2112 continue; 2176 continue;
2113 2177
2114 temp_priority[i] = priority;
2115 sc.nr_scanned = 0; 2178 sc.nr_scanned = 0;
2116 note_zone_scanning_priority(zone, priority);
2117 2179
2118 nid = pgdat->node_id;
2119 zid = zone_idx(zone);
2120 /* 2180 /*
2121 * Call soft limit reclaim before calling shrink_zone. 2181 * Call soft limit reclaim before calling shrink_zone.
2122 * For now we ignore the return value 2182 * For now we ignore the return value
2123 */ 2183 */
2124 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, 2184 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2125 nid, zid); 2185
2126 /* 2186 /*
2127 * We put equal pressure on every zone, unless one 2187 * We put equal pressure on every zone, unless one
2128 * zone has way too many pages free already. 2188 * zone has way too many pages free already.
@@ -2186,16 +2246,6 @@ loop_again:
2186 break; 2246 break;
2187 } 2247 }
2188out: 2248out:
2189 /*
2190 * Note within each zone the priority level at which this zone was
2191 * brought into a happy state. So that the next thread which scans this
2192 * zone will start out at that priority level.
2193 */
2194 for (i = 0; i < pgdat->nr_zones; i++) {
2195 struct zone *zone = pgdat->node_zones + i;
2196
2197 zone->prev_priority = temp_priority[i];
2198 }
2199 if (!all_zones_ok) { 2249 if (!all_zones_ok) {
2200 cond_resched(); 2250 cond_resched();
2201 2251
@@ -2299,9 +2349,10 @@ static int kswapd(void *p)
2299 * premature sleep. If not, then go fully 2349 * premature sleep. If not, then go fully
2300 * to sleep until explicitly woken up 2350 * to sleep until explicitly woken up
2301 */ 2351 */
2302 if (!sleeping_prematurely(pgdat, order, remaining)) 2352 if (!sleeping_prematurely(pgdat, order, remaining)) {
2353 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2303 schedule(); 2354 schedule();
2304 else { 2355 } else {
2305 if (remaining) 2356 if (remaining)
2306 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 2357 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2307 else 2358 else
@@ -2321,8 +2372,10 @@ static int kswapd(void *p)
2321 * We can speed up thawing tasks if we don't call balance_pgdat 2372 * We can speed up thawing tasks if we don't call balance_pgdat
2322 * after returning from the refrigerator 2373 * after returning from the refrigerator
2323 */ 2374 */
2324 if (!ret) 2375 if (!ret) {
2376 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2325 balance_pgdat(pgdat, order); 2377 balance_pgdat(pgdat, order);
2378 }
2326 } 2379 }
2327 return 0; 2380 return 0;
2328} 2381}
@@ -2342,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2342 return; 2395 return;
2343 if (pgdat->kswapd_max_order < order) 2396 if (pgdat->kswapd_max_order < order)
2344 pgdat->kswapd_max_order = order; 2397 pgdat->kswapd_max_order = order;
2398 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2345 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2399 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2346 return; 2400 return;
2347 if (!waitqueue_active(&pgdat->kswapd_wait)) 2401 if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2644,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2590 .swappiness = vm_swappiness, 2644 .swappiness = vm_swappiness,
2591 .order = order, 2645 .order = order,
2592 }; 2646 };
2593 unsigned long slab_reclaimable; 2647 unsigned long nr_slab_pages0, nr_slab_pages1;
2594 2648
2595 disable_swap_token();
2596 cond_resched(); 2649 cond_resched();
2597 /* 2650 /*
2598 * We need to be able to allocate from the reserves for RECLAIM_SWAP 2651 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2611 */ 2664 */
2612 priority = ZONE_RECLAIM_PRIORITY; 2665 priority = ZONE_RECLAIM_PRIORITY;
2613 do { 2666 do {
2614 note_zone_scanning_priority(zone, priority);
2615 shrink_zone(priority, zone, &sc); 2667 shrink_zone(priority, zone, &sc);
2616 priority--; 2668 priority--;
2617 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 2669 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2618 } 2670 }
2619 2671
2620 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2672 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2621 if (slab_reclaimable > zone->min_slab_pages) { 2673 if (nr_slab_pages0 > zone->min_slab_pages) {
2622 /* 2674 /*
2623 * shrink_slab() does not currently allow us to determine how 2675 * shrink_slab() does not currently allow us to determine how
2624 * many pages were freed in this zone. So we take the current 2676 * many pages were freed in this zone. So we take the current
@@ -2629,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2629 * Note that shrink_slab will free memory on all zones and may 2681 * Note that shrink_slab will free memory on all zones and may
2630 * take a long time. 2682 * take a long time.
2631 */ 2683 */
2632 while (shrink_slab(sc.nr_scanned, gfp_mask, order) && 2684 for (;;) {
2633 zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 2685 unsigned long lru_pages = zone_reclaimable_pages(zone);
2634 slab_reclaimable - nr_pages) 2686
2635 ; 2687 /* No reclaimable slab or very low memory pressure */
2688 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
2689 break;
2690
2691 /* Freed enough memory */
2692 nr_slab_pages1 = zone_page_state(zone,
2693 NR_SLAB_RECLAIMABLE);
2694 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
2695 break;
2696 }
2636 2697
2637 /* 2698 /*
2638 * Update nr_reclaimed by the number of slab pages we 2699 * Update nr_reclaimed by the number of slab pages we
2639 * reclaimed from this zone. 2700 * reclaimed from this zone.
2640 */ 2701 */
2641 sc.nr_reclaimed += slab_reclaimable - 2702 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2642 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2703 if (nr_slab_pages1 < nr_slab_pages0)
2704 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
2643 } 2705 }
2644 2706
2645 p->reclaim_state = NULL; 2707 p->reclaim_state = NULL;
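
The reworked __zone_reclaim() slab path keeps calling shrink_slab() until it either reports no progress or the reclaimable-slab count has dropped by at least nr_pages from its starting value, then credits the difference to nr_reclaimed. A userspace model of that loop follows; slab_pages() and do_shrink() are stand-ins for zone_page_state(zone, NR_SLAB_RECLAIMABLE) and shrink_slab().

/*
 * Model of the retry loop: stop on no progress or once enough has been
 * freed relative to the starting count. Numbers are illustrative.
 */
#include <stdio.h>

static unsigned long slab = 1000;		/* reclaimable slab pages */

static unsigned long slab_pages(void) { return slab; }
static unsigned long do_shrink(void)		/* frees up to 64 pages */
{
	unsigned long freed = slab >= 64 ? 64 : slab;
	slab -= freed;
	return freed;
}

int main(void)
{
	unsigned long nr_pages = 256;		/* reclaim target */
	unsigned long pages0 = slab_pages(), pages1;
	unsigned long reclaimed = 0;

	for (;;) {
		if (!do_shrink())		/* no progress: stop */
			break;
		pages1 = slab_pages();
		if (pages1 + nr_pages <= pages0) /* freed enough: stop */
			break;
	}
	pages1 = slab_pages();
	if (pages1 < pages0)
		reclaimed = pages0 - pages1;
	printf("slab pages reclaimed: %lu\n", reclaimed);
	return 0;
}
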
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941d4e77..f389168f9a83 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,14 +22,14 @@
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
23EXPORT_PER_CPU_SYMBOL(vm_event_states); 23EXPORT_PER_CPU_SYMBOL(vm_event_states);
24 24
25static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) 25static void sum_vm_events(unsigned long *ret)
26{ 26{
27 int cpu; 27 int cpu;
28 int i; 28 int i;
29 29
30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
31 31
32 for_each_cpu(cpu, cpumask) { 32 for_each_online_cpu(cpu) {
33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
34 34
35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -45,7 +45,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
45void all_vm_events(unsigned long *ret) 45void all_vm_events(unsigned long *ret)
46{ 46{
47 get_online_cpus(); 47 get_online_cpus();
48 sum_vm_events(ret, cpu_online_mask); 48 sum_vm_events(ret);
49 put_online_cpus(); 49 put_online_cpus();
50} 50}
51EXPORT_SYMBOL_GPL(all_vm_events); 51EXPORT_SYMBOL_GPL(all_vm_events);
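
sum_vm_events() drops its cpumask argument and walks the online CPUs directly; all_vm_events() still brackets the walk with get_online_cpus()/put_online_cpus(). A small userspace analogue of summing per-CPU counters over an online mask, with the array sizes and the mask purely illustrative:

/*
 * Sum per-CPU event counters for online CPUs only. per_cpu_events and
 * online[] are assumptions for the example.
 */
#include <stdio.h>
#include <string.h>

#define NR_CPUS   4
#define NR_EVENTS 3

static unsigned long per_cpu_events[NR_CPUS][NR_EVENTS] = {
	{ 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 }, { 10, 11, 12 },
};
static int online[NR_CPUS] = { 1, 1, 0, 1 };	/* cpu 2 is offline */

static void sum_events(unsigned long *ret)
{
	memset(ret, 0, NR_EVENTS * sizeof(*ret));
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!online[cpu])		/* for_each_online_cpu() analogue */
			continue;
		for (int i = 0; i < NR_EVENTS; i++)
			ret[i] += per_cpu_events[cpu][i];
	}
}

int main(void)
{
	unsigned long totals[NR_EVENTS];

	sum_events(totals);
	for (int i = 0; i < NR_EVENTS; i++)
		printf("event %d: %lu\n", i, totals[i]);
	return 0;
}
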
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
853 } 853 }
854 seq_printf(m, 854 seq_printf(m,
855 "\n all_unreclaimable: %u" 855 "\n all_unreclaimable: %u"
856 "\n prev_priority: %i"
857 "\n start_pfn: %lu" 856 "\n start_pfn: %lu"
858 "\n inactive_ratio: %u", 857 "\n inactive_ratio: %u",
859 zone->all_unreclaimable, 858 zone->all_unreclaimable,
860 zone->prev_priority,
861 zone->zone_start_pfn, 859 zone->zone_start_pfn,
862 zone->inactive_ratio); 860 zone->inactive_ratio);
863 seq_putc(m, '\n'); 861 seq_putc(m, '\n');