 Documentation/vm/hwpoison.txt | 136
 include/linux/prctl.h         |  12
 kernel/sys.c                  |  23
 kernel/workqueue.c            |  21
 mm/memory-failure.c           |  56
 mm/memory.c                   |   3
 6 files changed, 216 insertions, 35 deletions
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
new file mode 100644
index 000000000000..3ffadf8da61f
--- /dev/null
+++ b/Documentation/vm/hwpoison.txt
@@ -0,0 +1,136 @@
What is hwpoison?

Upcoming Intel CPUs have support for recovering from some memory errors
(``MCA recovery''). This requires the OS to declare a page "poisoned",
kill the processes associated with it and avoid using it in the future.

This patchkit implements the necessary infrastructure in the VM.

To quote the overview comment:

 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a 2-bit ECC memory or
 * cache failure.
 *
 * This focuses on pages detected as corrupted in the background.
 * When the current CPU tries to consume corruption the currently
 * running process can just be killed directly instead. This implies
 * that if the error cannot be handled for some reason it's safe to
 * just ignore it, because no corruption has been consumed yet. Instead
 * when that happens another machine check will happen.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere, possibly violating some of their assumptions. This is why
 * this code has to be extremely careful. Generally it tries to use
 * normal locking rules, as in get the standard locks, even if that
 * means the error handling takes potentially a long time.
 *
 * Some of the operations here are somewhat inefficient and have
 * non-linear algorithmic complexity, because the data structures have
 * not been optimized for this case. This is in particular the case for
 * the mapping from a vma to a process. Since this case is expected to
 * be rare we hope we can get away with this.

The code consists of the high-level handler in mm/memory-failure.c,
a new page poison bit and various checks in the VM to handle poisoned
pages.

The main target right now is KVM guests, but it works for all kinds
of applications. KVM support requires a recent qemu-kvm release.

For the KVM use case a new signal type was needed so that
KVM can inject the machine check into the guest with the proper
address. In theory this also allows other applications to handle
memory failures. The expectation is that nearly all applications
won't do that, but some very specialized ones might.
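
To illustrate, a minimal sketch of such a specialized application
(not part of the patchkit; it assumes the SA_SIGINFO SIGBUS delivery
described above, with si_addr carrying the address of the poisoned
page):

	#define _GNU_SOURCE
	#include <signal.h>
	#include <string.h>
	#include <unistd.h>

	/* Hypothetical early-kill-aware application skeleton. */
	static void memory_error(int sig, siginfo_t *si, void *ctx)
	{
		/* si->si_addr is the poisoned address; only
		 * async-signal-safe functions may be called here. */
		static const char msg[] = "memory error, dropping object\n";

		write(2, msg, sizeof(msg) - 1);
		/* A real application would unmap or replace the page
		 * at si->si_addr instead of exiting. */
		_exit(1);
	}

	int main(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = memory_error;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGBUS, &sa, NULL);

		/* ... normal workload; enable early kill with prctl,
		 * see "User control" below ... */
		return 0;
	}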

---

There are two (actually three) modes memory failure recovery can be in:

vm.memory_failure_recovery sysctl set to zero:
	All memory failures cause a panic. Do not attempt recovery.
	(On x86 this can also be affected by the tolerant level of the
	MCE subsystem.)

early kill
	(can be controlled globally and per process)
	Send SIGBUS to the application as soon as the error is detected.
	This allows applications that can handle memory errors gracefully
	(e.g. by dropping the affected object) to do so.
	This is the mode used by KVM qemu.

late kill
	Send SIGBUS when the application runs into the corrupted page.
	This is best for applications unaware of memory errors, and is the
	default. Note that some pages are always handled as late kill.

---

User control:

vm.memory_failure_recovery
	See sysctl.txt

vm.memory_failure_early_kill
	Enable early kill mode globally

PR_MCE_KILL
	Set the early/late kill mode, or revert to the system default.
	arg1: PR_MCE_KILL_CLEAR: Revert to system default
	arg1: PR_MCE_KILL_SET: arg2 defines the thread-specific mode
		PR_MCE_KILL_EARLY: Early kill
		PR_MCE_KILL_LATE: Late kill
		PR_MCE_KILL_DEFAULT: Use the system global default

PR_MCE_KILL_GET
	Return the current mode; see the sketch below.
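
For example, a thread that wants early kill could do something like
this (a sketch; the fallback #defines simply mirror the values added
to include/linux/prctl.h above, for a libc that does not know them yet):

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_MCE_KILL
	# define PR_MCE_KILL         33
	# define PR_MCE_KILL_CLEAR    0
	# define PR_MCE_KILL_SET      1
	# define PR_MCE_KILL_LATE     0
	# define PR_MCE_KILL_EARLY    1
	# define PR_MCE_KILL_DEFAULT  2
	# define PR_MCE_KILL_GET     34
	#endif

	int main(void)
	{
		/* Opt this thread in to early kill. */
		if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET,
			  PR_MCE_KILL_EARLY, 0, 0) < 0)
			perror("PR_MCE_KILL");

		/* 0 = late, 1 = early, 2 = system default. */
		printf("current mode: %d\n",
		       prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));
		return 0;
	}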

---

Testing:

madvise(MADV_POISON, ....)
	(as root)
	Poison a page in the current process for testing

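A minimal injection sketch (hypothetical; MADV_POISON must come from
the patched kernel headers, so its numeric value is deliberately not
hardcoded here):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long pagesize = sysconf(_SC_PAGESIZE);
		char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		memset(p, 0xaa, pagesize);	/* fault the page in */

		/* poison it; needs root and a hwpoison kernel */
		if (madvise(p, pagesize, MADV_POISON) < 0) {
			perror("madvise");
			return 1;
		}

		p[0] = 1;	/* should now raise SIGBUS (late kill) */
		return 0;
	}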

hwpoison-inject module through debugfs

/sys/debug/hwpoison/corrupt-pfn
	Inject a hwpoison fault at the PFN echoed into this file

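The same injection can be driven from a small program instead of echo;
a sketch (the debugfs path is the one given above, adjust it to
wherever debugfs is mounted; the PFN is assumed to be taken in
decimal):

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		const char *path = "/sys/debug/hwpoison/corrupt-pfn";
		unsigned long pfn;
		FILE *f;

		if (argc != 2 || sscanf(argv[1], "%lu", &pfn) != 1) {
			fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
			return 1;
		}
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return 1;
		}
		fprintf(f, "%lu\n", pfn);
		return fclose(f) ? 1 : 0;
	}
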
Architecture-specific MCE injector

x86 has mce-inject, mce-test

Some portable hwpoison test programs are in mce-test; see below.

---

References:

http://halobates.de/mce-lc09-2.pdf
	Overview presentation from LinuxCon '09

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
	Test suite (hwpoison-specific portable tests in tsrc)

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
	x86-specific injector

---

Limitations:

- Not all page types are supported, and never will be. Most kernel-internal
  objects cannot be recovered; only LRU pages are handled for now.
- Hugepage support is currently missing.

---
Andi Kleen, Oct 2009

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 931150566ade..a3baeb2c2161 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,6 +88,18 @@
 #define PR_TASK_PERF_EVENTS_DISABLE	31
 #define PR_TASK_PERF_EVENTS_ENABLE	32
 
+/*
+ * Set early/late kill mode for hwpoison memory corruption.
+ * This influences when the process gets killed on a memory corruption.
+ */
 #define PR_MCE_KILL	33
+# define PR_MCE_KILL_CLEAR	0
+# define PR_MCE_KILL_SET	1
+
+# define PR_MCE_KILL_LATE	0
+# define PR_MCE_KILL_EARLY	1
+# define PR_MCE_KILL_DEFAULT	2
+
+#define PR_MCE_KILL_GET	34
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 1828f8d10844..ce17760d9c51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1548,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (arg4 | arg5)
 			return -EINVAL;
 		switch (arg2) {
-		case 0:
+		case PR_MCE_KILL_CLEAR:
 			if (arg3 != 0)
 				return -EINVAL;
 			current->flags &= ~PF_MCE_PROCESS;
 			break;
-		case 1:
+		case PR_MCE_KILL_SET:
 			current->flags |= PF_MCE_PROCESS;
-			if (arg3 != 0)
+			if (arg3 == PR_MCE_KILL_EARLY)
 				current->flags |= PF_MCE_EARLY;
-			else
+			else if (arg3 == PR_MCE_KILL_LATE)
 				current->flags &= ~PF_MCE_EARLY;
+			else if (arg3 == PR_MCE_KILL_DEFAULT)
+				current->flags &=
+					~(PF_MCE_EARLY|PF_MCE_PROCESS);
+			else
+				return -EINVAL;
 			break;
 		default:
 			return -EINVAL;
 		}
 		error = 0;
 		break;
-
+	case PR_MCE_KILL_GET:
+		if (arg2 | arg3 | arg4 | arg5)
+			return -EINVAL;
+		if (current->flags & PF_MCE_PROCESS)
+			error = (current->flags & PF_MCE_EARLY) ?
+				PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+		else
+			error = PR_MCE_KILL_DEFAULT;
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 47cdd7e76f2b..12328147132c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
+	int orig = -1;
 	struct work_struct *works;
 
 	works = alloc_percpu(struct work_struct);
 	if (!works)
 		return -ENOMEM;
 
+	/*
+	 * When running in keventd don't schedule a work item on itself.
+	 * Can just call directly because the work queue is already bound.
+	 * This also is faster.
+	 * Make this a generic parameter for other workqueues?
+	 */
+	if (current_is_keventd()) {
+		orig = raw_smp_processor_id();
+		INIT_WORK(per_cpu_ptr(works, orig), func);
+		func(per_cpu_ptr(works, orig));
+	}
+
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
+		if (cpu == orig)
+			continue;
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
-	for_each_online_cpu(cpu)
-		flush_work(per_cpu_ptr(works, cpu));
+	for_each_online_cpu(cpu) {
+		if (cpu != orig)
+			flush_work(per_cpu_ptr(works, cpu));
+	}
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
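
For context, the calling convention of schedule_on_each_cpu() is not
changed by this patch; a hypothetical in-kernel caller (sketch only,
not part of the commit) still looks like:

	#include <linux/kernel.h>
	#include <linux/smp.h>
	#include <linux/workqueue.h>

	/* Hypothetical callback, run once on every online CPU. */
	static void flush_local_state(struct work_struct *work)
	{
		printk(KERN_INFO "flushing state on CPU %d\n",
		       raw_smp_processor_id());
	}

	static int flush_all_cpus(void)
	{
		/* Blocks until the callback has run everywhere; returns
		 * 0 or -ENOMEM.  With the patch above, a keventd caller
		 * invokes the callback for its own CPU directly instead
		 * of queueing it. */
		return schedule_on_each_cpu(flush_local_state);
	}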
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7fc2130d2737..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -371,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	int ret = FAILED;
 	struct address_space *mapping;
 
-	if (!isolate_lru_page(p))
-		page_cache_release(p);
-
 	/*
 	 * For anonymous pages we're done the only reference left
 	 * should be the one m_f() holds.
@@ -499,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
 	ClearPageDirty(p);
 	/* Trigger EIO in shmem: */
 	ClearPageUptodate(p);
 
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = DELAYED;
-	}
-
-	return ret;
+	return DELAYED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = RECOVERED;
-	}
 	delete_from_swap_cache(p);
-	return ret;
+
+	return RECOVERED;
 }
 
 /*
@@ -612,8 +597,6 @@ static struct page_state {
 	{ 0, 0, "unknown page state", me_unknown },
 };
 
-#undef lru
-
 static void action_result(unsigned long pfn, char *msg, int result)
 {
 	struct page *page = NULL;
@@ -630,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
 			unsigned long pfn, int ref)
 {
 	int result;
+	int count;
 
 	result = ps->action(p, pfn);
 	action_result(pfn, ps->msg, result);
-	if (page_count(p) != 1 + ref)
+
+	count = page_count(p) - 1 - ref;
+	if (count != 0)
 		printk(KERN_ERR
 		       "MCE %#lx: %s page still referenced by %d users\n",
-		       pfn, ps->msg, page_count(p) - 1);
+		       pfn, ps->msg, count);
 
 	/* Could do more checks here if page looks ok */
 	/*
@@ -665,9 +651,6 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
 		return;
 
-	if (!PageLRU(p))
-		lru_add_drain_all();
-
 	/*
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
@@ -739,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+	unsigned long lru_flag;
 	struct page_state *ps;
 	struct page *p;
 	int res;
@@ -776,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	}
 
 	/*
+	 * We ignore non-LRU pages for good reasons.
+	 * - PG_locked is only well defined for LRU pages and a few others
+	 * - to avoid races with __set_page_locked()
+	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+	 * The check (unnecessarily) ignores LRU pages being isolated and
+	 * walked by the page reclaim code, however that's not a big loss.
+	 */
+	if (!PageLRU(p))
+		lru_add_drain_all();
+	lru_flag = p->flags & lru;
+	if (isolate_lru_page(p)) {
+		action_result(pfn, "non LRU", IGNORED);
+		put_page(p);
+		return -EBUSY;
+	}
+	page_cache_release(p);
+
+	/*
 	 * Lock the page and wait for writeback to finish.
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
@@ -791,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	/*
 	 * Torn down by someone else?
 	 */
-	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+	if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
 		action_result(pfn, "already truncated LRU", IGNORED);
 		res = 0;
 		goto out;
@@ -799,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
 	res = -EBUSY;
 	for (ps = error_states;; ps++) {
-		if ((p->flags & ps->mask) == ps->res) {
+		if (((p->flags | lru_flag) & ps->mask) == ps->res) {
 			res = page_action(ps, p, pfn, ref);
 			break;
 		}
diff --git a/mm/memory.c b/mm/memory.c
index 60ea601e03ea..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2542,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else if (PageHWPoison(page)) {
 		ret = VM_FAULT_HWPOISON;
 		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-		goto out;
+		goto out_release;
 	}
 
 	lock_page(page);
@@ -2614,6 +2614,7 @@ out_nomap:
 	pte_unmap_unlock(page_table, ptl);
 out_page:
 	unlock_page(page);
+out_release:
 	page_cache_release(page);
 	return ret;
 }