 include/linux/mm.h |   2
 mm/internal.h      |   5
 mm/memory.c        |  17
 mm/oom_kill.c      | 151
 4 files changed, 162 insertions(+), 13 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 450fc977ed02..ed6407d1b7b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1132,6 +1132,8 @@ struct zap_details {
         struct address_space *check_mapping;    /* Check page->mapping if set */
         pgoff_t first_index;                    /* Lowest page->index to unmap */
         pgoff_t last_index;                     /* Highest page->index to unmap */
+        bool ignore_dirty;                      /* Ignore dirty pages */
+        bool check_swap_entries;                /* Check also swap entries */
 };
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
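
The two fields added here default to false for every existing zap_details user: C designated initializers zero any member that is not explicitly named, and the empty-brace form that unmap_mapping_range switches to later in this patch zeroes the whole struct. A minimal userspace sketch of that behaviour, using an illustrative stand-in struct rather than the kernel definition:

```c
/* Userspace sketch only; the struct mirrors the patch but is a stand-in, not
 * the kernel type. Builds with gcc/clang (the empty-brace initializer is a
 * GNU/C23 extension that the kernel itself relies on). */
#include <assert.h>
#include <stdbool.h>

struct zap_details_sketch {
        void *check_mapping;            /* stands in for struct address_space * */
        unsigned long first_index;
        unsigned long last_index;
        bool ignore_dirty;              /* new: skip dirty file-backed pages */
        bool check_swap_entries;        /* new: also process swap entries */
};

int main(void)
{
        /* the oom reaper opts in to both new flags */
        struct zap_details_sketch reaper = { .check_swap_entries = true,
                                             .ignore_dirty = true };
        /* an existing caller never names them, so both stay false */
        struct zap_details_sketch legacy = { .first_index = 0,
                                             .last_index = 99 };
        /* the empty-brace form zeroes everything */
        struct zap_details_sketch empty = { };

        assert(reaper.ignore_dirty && reaper.check_swap_entries);
        assert(!legacy.ignore_dirty && !legacy.check_swap_entries);
        assert(!empty.ignore_dirty && !empty.check_swap_entries);
        return 0;
}
```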
diff --git a/mm/internal.h b/mm/internal.h
index 7449392c6faa..b79abb6721cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,6 +38,11 @@
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                 unsigned long floor, unsigned long ceiling);
 
+void unmap_page_range(struct mmu_gather *tlb,
+                             struct vm_area_struct *vma,
+                             unsigned long addr, unsigned long end,
+                             struct zap_details *details);
+
 extern int __do_page_cache_readahead(struct address_space *mapping,
         struct file *filp, pgoff_t offset, unsigned long nr_to_read,
         unsigned long lookahead_size);
diff --git a/mm/memory.c b/mm/memory.c
index 81dca0083fcd..098f00d05461 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1102,6 +1102,12 @@ again:
 
                 if (!PageAnon(page)) {
                         if (pte_dirty(ptent)) {
+                                /*
+                                 * oom_reaper cannot tear down dirty
+                                 * pages
+                                 */
+                                if (unlikely(details && details->ignore_dirty))
+                                        continue;
                                 force_flush = 1;
                                 set_page_dirty(page);
                         }
@@ -1120,8 +1126,8 @@ again:
                         }
                         continue;
                 }
-                /* If details->check_mapping, we leave swap entries. */
-                if (unlikely(details))
+                /* only check swap_entries if explicitly asked for in details */
+                if (unlikely(details && !details->check_swap_entries))
                         continue;
 
                 entry = pte_to_swp_entry(ptent);
@@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
         return addr;
 }
 
-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
                 struct vm_area_struct *vma,
                 unsigned long addr, unsigned long end,
                 struct zap_details *details)
@@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
         pgd_t *pgd;
         unsigned long next;
 
-        if (details && !details->check_mapping)
-                details = NULL;
-
         BUG_ON(addr >= end);
         tlb_start_vma(tlb, vma);
         pgd = pgd_offset(vma->vm_mm, addr);
@@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
 void unmap_mapping_range(struct address_space *mapping,
                 loff_t const holebegin, loff_t const holelen, int even_cows)
 {
-        struct zap_details details;
+        struct zap_details details = { };
         pgoff_t hba = holebegin >> PAGE_SHIFT;
         pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
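
Taken together, the zap_pte_range() changes above give details-carrying callers two knobs: skip dirty file-backed ptes (the oom reaper cannot write them back) and opt back in to freeing swap entries (which the old code skipped whenever details was non-NULL). A small userspace sketch of just that decision logic, with hypothetical helper names standing in for the kernel code:

```c
/* Userspace sketch of the skip decisions the patched zap_pte_range() makes;
 * skip_dirty_file_pte() and skip_swap_entry() are illustrative helpers, not
 * kernel functions. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct details_sketch {
        bool ignore_dirty;              /* oom reaper: cannot write back dirty pages */
        bool check_swap_entries;        /* oom reaper: still free swap entries */
};

/* A present, file-backed, dirty pte is left in place only for ignore_dirty. */
static bool skip_dirty_file_pte(const struct details_sketch *details,
                                bool page_is_anon, bool pte_is_dirty)
{
        if (page_is_anon || !pte_is_dirty)
                return false;
        return details && details->ignore_dirty;
}

/* Old code: skip whenever details != NULL. New code: callers can opt back in. */
static bool skip_swap_entry(const struct details_sketch *details)
{
        return details && !details->check_swap_entries;
}

int main(void)
{
        struct details_sketch reaper = { .ignore_dirty = true,
                                         .check_swap_entries = true };

        printf("reaper skips dirty file pte: %d\n",
               skip_dirty_file_pte(&reaper, false, true));             /* 1 */
        printf("reaper skips swap entry: %d\n", skip_swap_entry(&reaper)); /* 0 */
        printf("plain unmap (details==NULL) skips swap entry: %d\n",
               skip_swap_entry(NULL));                                  /* 0 */
        return 0;
}
```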
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 06f7e1707847..f7ed6ece0719 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
 #include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
@@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
 bool oom_killer_disabled __read_mostly;
 
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static struct mm_struct *mm_to_reap;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+
+static bool __oom_reap_vmas(struct mm_struct *mm)
+{
+        struct mmu_gather tlb;
+        struct vm_area_struct *vma;
+        struct zap_details details = {.check_swap_entries = true,
+                                      .ignore_dirty = true};
+        bool ret = true;
+
+        /* We might have raced with exit path */
+        if (!atomic_inc_not_zero(&mm->mm_users))
+                return true;
+
+        if (!down_read_trylock(&mm->mmap_sem)) {
+                ret = false;
+                goto out;
+        }
+
+        tlb_gather_mmu(&tlb, mm, 0, -1);
+        for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+                if (is_vm_hugetlb_page(vma))
+                        continue;
+
+                /*
+                 * mlocked VMAs require explicit munlocking before unmap.
+                 * Let's keep it simple here and skip such VMAs.
+                 */
+                if (vma->vm_flags & VM_LOCKED)
+                        continue;
+
+                /*
+                 * Only anonymous pages have a good chance to be dropped
+                 * without additional steps which we cannot afford as we
+                 * are OOM already.
+                 *
+                 * We do not even care about fs backed pages because all
+                 * which are reclaimable have already been reclaimed and
+                 * we do not want to block exit_mmap by keeping mm ref
+                 * count elevated without a good reason.
+                 */
+                if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+                        unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+                                         &details);
+        }
+        tlb_finish_mmu(&tlb, 0, -1);
+        up_read(&mm->mmap_sem);
+out:
+        mmput(mm);
+        return ret;
+}
+
+static void oom_reap_vmas(struct mm_struct *mm)
+{
+        int attempts = 0;
+
+        /* Retry the down_read_trylock(mmap_sem) a few times */
+        while (attempts++ < 10 && !__oom_reap_vmas(mm))
+                schedule_timeout_idle(HZ/10);
+
+        /* Drop a reference taken by wake_oom_reaper */
+        mmdrop(mm);
+}
+
+static int oom_reaper(void *unused)
+{
+        while (true) {
+                struct mm_struct *mm;
+
+                wait_event_freezable(oom_reaper_wait,
+                                     (mm = READ_ONCE(mm_to_reap)));
+                oom_reap_vmas(mm);
+                WRITE_ONCE(mm_to_reap, NULL);
+        }
+
+        return 0;
+}
+
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+        struct mm_struct *old_mm;
+
+        if (!oom_reaper_th)
+                return;
+
+        /*
+         * Pin the given mm. Use mm_count instead of mm_users because
+         * we do not want to delay the address space tear down.
+         */
+        atomic_inc(&mm->mm_count);
+
+        /*
+         * Make sure that only a single mm is ever queued for the reaper
+         * because multiple are not necessary and the operation might be
+         * disruptive so better reduce it to the bare minimum.
+         */
+        old_mm = cmpxchg(&mm_to_reap, NULL, mm);
+        if (!old_mm)
+                wake_up(&oom_reaper_wait);
+        else
+                mmdrop(mm);
+}
+
+static int __init oom_init(void)
+{
+        oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+        if (IS_ERR(oom_reaper_th)) {
+                pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+                                PTR_ERR(oom_reaper_th));
+                oom_reaper_th = NULL;
+        }
+        return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+}
+#endif
+
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
@@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
         unsigned int victim_points = 0;
         static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
                                               DEFAULT_RATELIMIT_BURST);
+        bool can_oom_reap = true;
 
         /*
          * If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                         continue;
                 if (same_thread_group(p, victim))
                         continue;
-                if (unlikely(p->flags & PF_KTHREAD))
-                        continue;
-                if (is_global_init(p))
-                        continue;
-                if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+                if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+                    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+                        /*
+                         * We cannot use oom_reaper for the mm shared by this
+                         * process because it wouldn't get killed and so the
+                         * memory might be still used.
+                         */
+                        can_oom_reap = false;
                         continue;
-
+                }
                 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
         }
         rcu_read_unlock();
 
+        if (can_oom_reap)
+                wake_oom_reaper(mm);
+
         mmdrop(mm);
         put_task_struct(victim);
 }
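
For reference, wake_oom_reaper() and oom_reaper() above implement a single-slot hand-off: cmpxchg(&mm_to_reap, NULL, mm) lets at most one mm be queued, losing producers simply drop their pinning reference, and the kthread clears the slot once it is done. A minimal userspace sketch of the same pattern, assuming C11 atomics and a pthread condition variable in place of the kernel's cmpxchg() and wait queue:

```c
/* Userspace sketch of the wake_oom_reaper()/oom_reaper() hand-off; C11
 * atomics and a pthread condvar stand in for cmpxchg() and the wait queue.
 * Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(int *) slot;     /* plays the role of mm_to_reap */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;

static void *reaper(void *unused)
{
        (void)unused;
        /* one pass so the demo terminates; the kernel thread loops forever */
        pthread_mutex_lock(&lock);
        while (atomic_load(&slot) == NULL)
                pthread_cond_wait(&wake, &lock);        /* wait_event_freezable */
        pthread_mutex_unlock(&lock);

        printf("reaping item %d\n", *atomic_load(&slot));      /* oom_reap_vmas */
        atomic_store(&slot, NULL);                      /* reopen the slot */
        return NULL;
}

static void queue_work(int *item)
{
        int *expected = NULL;

        /* cmpxchg(&mm_to_reap, NULL, mm): only succeeds if the slot is empty */
        if (atomic_compare_exchange_strong(&slot, &expected, item)) {
                pthread_mutex_lock(&lock);
                pthread_cond_signal(&wake);             /* wake_up() */
                pthread_mutex_unlock(&lock);
        } else {
                printf("slot busy, dropping item %d\n", *item); /* mmdrop() */
        }
}

int main(void)
{
        pthread_t th;
        int a = 1, b = 2;

        pthread_create(&th, NULL, reaper, NULL);
        queue_work(&a);
        queue_work(&b);         /* usually finds the slot still occupied */
        pthread_join(th, NULL);
        return 0;
}
```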