-rw-r--r--  Documentation/vm/hwpoison.txt | 136
-rw-r--r--  include/linux/prctl.h         |  12
-rw-r--r--  kernel/sys.c                  |  23
-rw-r--r--  kernel/workqueue.c            |  21
-rw-r--r--  mm/memory-failure.c           |  56
-rw-r--r--  mm/memory.c                   |   3
6 files changed, 216 insertions(+), 35 deletions(-)
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
new file mode 100644
index 000000000000..3ffadf8da61f
--- /dev/null
+++ b/Documentation/vm/hwpoison.txt
@@ -0,0 +1,136 @@
What is hwpoison?

Upcoming Intel CPUs have support for recovering from some memory errors
(``MCA recovery''). This requires the OS to declare a page "poisoned",
kill the processes associated with it and avoid using it in the future.

This patchkit implements the necessary infrastructure in the VM.

To quote the overview comment:

 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted usually due to a 2bit ECC memory or cache
 * failure.
 *
 * This focusses on pages detected as corrupted in the background.
 * When the current CPU tries to consume corruption the currently
 * running process can just be killed directly instead. This implies
 * that if the error cannot be handled for some reason it's safe to
 * just ignore it because no corruption has been consumed yet. Instead
 * when that happens another machine check will happen.
 *
 * Handles page cache pages in various states. The tricky part
 * here is that we can access any page asynchronous to other VM
 * users, because memory failures could happen anytime and anywhere,
 * possibly violating some of their assumptions. This is why this code
 * has to be extremely careful. Generally it tries to use normal locking
 * rules, as in get the standard locks, even if that means the
 * error handling takes potentially a long time.
 *
 * Some of the operations here are somewhat inefficient and have non
 * linear algorithmic complexity, because the data structures have not
 * been optimized for this case. This is in particular the case
 * for the mapping from a vma to a process. Since this case is expected
 * to be rare we hope we can get away with this.

The code consists of the high level handler in mm/memory-failure.c,
a new page poison bit and various checks in the VM to handle poisoned
pages.

The main target right now is KVM guests, but it works for all kinds
of applications. KVM support requires a recent qemu-kvm release.

For the KVM use case a new signal type was needed so that
KVM can inject the machine check into the guest with the proper
address. In theory this also allows other applications to handle
memory failures. The expectation is that nearly all applications
won't do that, but some very specialized ones might.

---

There are two (actually three) modes memory failure recovery can be in:

vm.memory_failure_recovery sysctl set to zero:
        All memory failures cause a panic. Do not attempt recovery.
        (on x86 this can also be affected by the tolerant level of the
        MCE subsystem)

early kill
        (can be controlled globally and per process)
        Send SIGBUS to the application as soon as the error is detected.
        This allows applications that can process memory errors in a
        gentle way (e.g. drop the affected object); see the signal
        handler sketch below.
        This is the mode used by KVM qemu.

late kill
        Send SIGBUS when the application runs into the corrupted page.
        This is best for memory error unaware applications and is the
        default.
        Note that some pages are always handled as late kill.

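As an illustration (not part of the patchkit), a minimal sketch of the
SIGBUS handler an early kill aware application could install. It assumes
the BUS_MCEERR_AO/BUS_MCEERR_AR si_code values used by the new SIGBUS
reporting and the standard sigaction(SA_SIGINFO) interface:

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* BUS_MCEERR_AO: corruption was found in the background, nothing has
     * been consumed yet, so the object around si_addr can simply be
     * dropped or rebuilt and the process keeps running. */
    static void mce_sigbus_handler(int sig, siginfo_t *si, void *ctx)
    {
            if (si->si_code == BUS_MCEERR_AO) {
                    static const char msg[] = "hwpoison: dropping object\n";

                    /* A real handler must only do async-signal-safe work
                     * with the page that contains si->si_addr. */
                    write(STDERR_FILENO, msg, sizeof(msg) - 1);
                    return;
            }
            /* BUS_MCEERR_AR (or anything else): corruption was consumed. */
            _exit(EXIT_FAILURE);
    }

    static void install_mce_handler(void)
    {
            struct sigaction sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = mce_sigbus_handler;
            sa.sa_flags = SA_SIGINFO;
            sigemptyset(&sa.sa_mask);
            sigaction(SIGBUS, &sa, NULL);
    }
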
---

User control:

vm.memory_failure_recovery
        See sysctl.txt

vm.memory_failure_early_kill
        Enable early kill mode globally

PR_MCE_KILL
        Set the early/late kill mode, or revert to the system default.
        arg1: PR_MCE_KILL_CLEAR: Revert to system default
        arg1: PR_MCE_KILL_SET: arg2 defines thread specific mode
                PR_MCE_KILL_EARLY: Early kill
                PR_MCE_KILL_LATE: Late kill
                PR_MCE_KILL_DEFAULT: Use system global default

PR_MCE_KILL_GET
        Return the current kill mode
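
For illustration, a minimal userspace sketch of this prctl interface
(error handling trimmed). It assumes the PR_MCE_KILL* constants are
visible via <sys/prctl.h>; on older C libraries they may have to be
pulled in from linux/prctl.h by hand:

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
            /* Ask for early kill: SIGBUS as soon as a corruption hitting
             * one of our pages is detected, not only when it is touched. */
            if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0) < 0)
                    perror("PR_MCE_KILL");

            /* Read back the current per-thread mode. */
            printf("kill mode: %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));

            /* Revert to the system default (vm.memory_failure_early_kill). */
            prctl(PR_MCE_KILL, PR_MCE_KILL_CLEAR, 0, 0, 0);
            return 0;
    }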


---

Testing:

madvise(MADV_POISON, ....)
        (as root)
        Poison a page in the process for testing
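
For example, a small self-contained test along these lines, run as root.
It assumes the flag is exported as MADV_HWPOISON with value 100 (the name
and value used by mainline headers); adjust the name if your tree calls
it MADV_POISON:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_HWPOISON
    #define MADV_HWPOISON 100  /* assumed value, see asm-generic/mman-common.h */
    #endif

    int main(void)
    {
            long pagesize = sysconf(_SC_PAGESIZE);
            char *p;

            /* Back the mapping with a real page before poisoning it. */
            p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED)
                    return EXIT_FAILURE;
            memset(p, 0xab, pagesize);

            /* Needs privilege; with late kill the SIGBUS only arrives
             * once the poisoned page is touched again. */
            if (madvise(p, pagesize, MADV_HWPOISON) < 0)
                    perror("madvise");

            return 0;
    }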


hwpoison-inject module through debugfs
        /sys/debug/hwpoison/corrupt-pfn

Inject a hwpoison fault at the PFN echoed into this file
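
For example, the echo can be done from a test program like this (the
path follows the text above; on many systems debugfs is mounted under
/sys/kernel/debug instead, so treat the location as an assumption):

    #include <stdio.h>

    /* Write a page frame number into the hwpoison-inject debugfs file. */
    static int inject_hwpoison(unsigned long pfn)
    {
            FILE *f = fopen("/sys/debug/hwpoison/corrupt-pfn", "w");

            if (!f)
                    return -1;
            fprintf(f, "%lu\n", pfn);
            return fclose(f);
    }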

Architecture specific MCE injector

x86 has mce-inject, mce-test

Some portable hwpoison test programs in mce-test, see below.

---

References:

http://halobates.de/mce-lc09-2.pdf
        Overview presentation from LinuxCon 09

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
        Test suite (hwpoison specific portable tests in tsrc)

git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
        x86 specific injector

---

Limitations:

- Not all page types are supported, and never will be. Most kernel internal
  objects cannot be recovered; only LRU pages are handled for now.
- Right now hugepage support is missing.

---
Andi Kleen, Oct 2009

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 931150566ade..a3baeb2c2161 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,6 +88,18 @@
 #define PR_TASK_PERF_EVENTS_DISABLE 31
 #define PR_TASK_PERF_EVENTS_ENABLE 32
 
+/*
+ * Set early/late kill mode for hwpoison memory corruption.
+ * This influences when the process gets killed on a memory corruption.
+ */
 #define PR_MCE_KILL 33
+# define PR_MCE_KILL_CLEAR   0
+# define PR_MCE_KILL_SET     1
+
+# define PR_MCE_KILL_LATE    0
+# define PR_MCE_KILL_EARLY   1
+# define PR_MCE_KILL_DEFAULT 2
+
+#define PR_MCE_KILL_GET 34
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 1828f8d10844..ce17760d9c51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1548,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                 if (arg4 | arg5)
                         return -EINVAL;
                 switch (arg2) {
-                case 0:
+                case PR_MCE_KILL_CLEAR:
                         if (arg3 != 0)
                                 return -EINVAL;
                         current->flags &= ~PF_MCE_PROCESS;
                         break;
-                case 1:
+                case PR_MCE_KILL_SET:
                         current->flags |= PF_MCE_PROCESS;
-                        if (arg3 != 0)
+                        if (arg3 == PR_MCE_KILL_EARLY)
                                 current->flags |= PF_MCE_EARLY;
-                        else
+                        else if (arg3 == PR_MCE_KILL_LATE)
                                 current->flags &= ~PF_MCE_EARLY;
+                        else if (arg3 == PR_MCE_KILL_DEFAULT)
+                                current->flags &=
+                                        ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+                        else
+                                return -EINVAL;
                         break;
                 default:
                         return -EINVAL;
                 }
                 error = 0;
                 break;
-
+        case PR_MCE_KILL_GET:
+                if (arg2 | arg3 | arg4 | arg5)
+                        return -EINVAL;
+                if (current->flags & PF_MCE_PROCESS)
+                        error = (current->flags & PF_MCE_EARLY) ?
+                                PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+                else
+                        error = PR_MCE_KILL_DEFAULT;
+                break;
         default:
                 error = -EINVAL;
                 break;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 47cdd7e76f2b..12328147132c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
         int cpu;
+        int orig = -1;
         struct work_struct *works;
 
         works = alloc_percpu(struct work_struct);
         if (!works)
                 return -ENOMEM;
 
+        /*
+         * when running in keventd don't schedule a work item on itself.
+         * Can just call directly because the work queue is already bound.
+         * This also is faster.
+         * Make this a generic parameter for other workqueues?
+         */
+        if (current_is_keventd()) {
+                orig = raw_smp_processor_id();
+                INIT_WORK(per_cpu_ptr(works, orig), func);
+                func(per_cpu_ptr(works, orig));
+        }
+
         get_online_cpus();
         for_each_online_cpu(cpu) {
                 struct work_struct *work = per_cpu_ptr(works, cpu);
 
+                if (cpu == orig)
+                        continue;
                 INIT_WORK(work, func);
                 schedule_work_on(cpu, work);
         }
-        for_each_online_cpu(cpu)
-                flush_work(per_cpu_ptr(works, cpu));
+        for_each_online_cpu(cpu) {
+                if (cpu != orig)
+                        flush_work(per_cpu_ptr(works, cpu));
+        }
         put_online_cpus();
         free_percpu(works);
         return 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7fc2130d2737..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -371,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         int ret = FAILED;
         struct address_space *mapping;
 
-        if (!isolate_lru_page(p))
-                page_cache_release(p);
-
         /*
          * For anonymous pages we're done the only reference left
          * should be the one m_f() holds.
@@ -499,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-        int ret = FAILED;
-
         ClearPageDirty(p);
         /* Trigger EIO in shmem: */
         ClearPageUptodate(p);
 
-        if (!isolate_lru_page(p)) {
-                page_cache_release(p);
-                ret = DELAYED;
-        }
-
-        return ret;
+        return DELAYED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-        int ret = FAILED;
-
-        if (!isolate_lru_page(p)) {
-                page_cache_release(p);
-                ret = RECOVERED;
-        }
         delete_from_swap_cache(p);
-        return ret;
+
+        return RECOVERED;
 }
 
 /*
@@ -612,8 +597,6 @@ static struct page_state {
         { 0, 0, "unknown page state", me_unknown },
 };
 
-#undef lru
-
 static void action_result(unsigned long pfn, char *msg, int result)
 {
         struct page *page = NULL;
@@ -630,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
                         unsigned long pfn, int ref)
 {
         int result;
+        int count;
 
         result = ps->action(p, pfn);
         action_result(pfn, ps->msg, result);
-        if (page_count(p) != 1 + ref)
+
+        count = page_count(p) - 1 - ref;
+        if (count != 0)
                 printk(KERN_ERR
                        "MCE %#lx: %s page still referenced by %d users\n",
-                       pfn, ps->msg, page_count(p) - 1);
+                       pfn, ps->msg, count);
 
         /* Could do more checks here if page looks ok */
         /*
@@ -665,9 +651,6 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
         if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
                 return;
 
-        if (!PageLRU(p))
-                lru_add_drain_all();
-
         /*
          * This check implies we don't kill processes if their pages
          * are in the swap cache early. Those are always late kills.
@@ -739,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+        unsigned long lru_flag;
         struct page_state *ps;
         struct page *p;
         int res;
@@ -776,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         }
 
         /*
+         * We ignore non-LRU pages for good reasons.
+         * - PG_locked is only well defined for LRU pages and a few others
+         * - to avoid races with __set_page_locked()
+         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+         * The check (unnecessarily) ignores LRU pages being isolated and
+         * walked by the page reclaim code, however that's not a big loss.
+         */
+        if (!PageLRU(p))
+                lru_add_drain_all();
+        lru_flag = p->flags & lru;
+        if (isolate_lru_page(p)) {
+                action_result(pfn, "non LRU", IGNORED);
+                put_page(p);
+                return -EBUSY;
+        }
+        page_cache_release(p);
+
+        /*
          * Lock the page and wait for writeback to finish.
          * It's very difficult to mess with pages currently under IO
          * and in many cases impossible, so we just avoid it here.
@@ -791,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         /*
          * Torn down by someone else?
          */
-        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+        if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
                 action_result(pfn, "already truncated LRU", IGNORED);
                 res = 0;
                 goto out;
@@ -799,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
         res = -EBUSY;
         for (ps = error_states;; ps++) {
-                if ((p->flags & ps->mask) == ps->res) {
+                if (((p->flags | lru_flag) & ps->mask) == ps->res) {
                         res = page_action(ps, p, pfn, ref);
                         break;
                 }
diff --git a/mm/memory.c b/mm/memory.c
index 60ea601e03ea..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2542,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         } else if (PageHWPoison(page)) {
                 ret = VM_FAULT_HWPOISON;
                 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-                goto out;
+                goto out_release;
         }
 
         lock_page(page);
@@ -2614,6 +2614,7 @@ out_nomap:
         pte_unmap_unlock(page_table, ptl);
 out_page:
         unlock_page(page);
+out_release:
         page_cache_release(page);
         return ret;
 }