aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBorislav Petkov <bp@suse.de>2017-03-27 05:33:02 -0400
committerIngo Molnar <mingo@kernel.org>2017-03-28 02:54:48 -0400
commit011d8261117249eab97bc86a8e1ac7731e03e319 (patch)
tree5e4a07f4ac44d81b62344ee3c8dadadf1f77cf66
parente64edfcce9c738300b4102d0739577d6ecc96d4a (diff)
RAS: Add a Corrected Errors Collector
Introduce a simple data structure for collecting correctable errors along with accessors. More detailed description in the code itself. The error decoding is done with the decoding chain now and mce_first_notifier() gets to see the error first and the CEC decides whether to log it and then the rest of the chain doesn't hear about it - basically the main reason for the CE collector - or to continue running the notifiers. When the CEC hits the action threshold, it will try to soft-offline the page containing the ECC and then the whole decoding chain gets to see the error. Signed-off-by: Borislav Petkov <bp@suse.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/20170327093304.10683-5-bp@alien8.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt6
-rw-r--r--arch/x86/include/asm/mce.h9
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c191
-rw-r--r--arch/x86/ras/Kconfig14
-rw-r--r--drivers/ras/Makefile3
-rw-r--r--drivers/ras/cec.c532
-rw-r--r--drivers/ras/debugfs.c2
-rw-r--r--drivers/ras/debugfs.h8
-rw-r--r--drivers/ras/ras.c11
-rw-r--r--include/linux/ras.h13
10 files changed, 706 insertions, 83 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2ba45caabada..927b8b8854cd 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3172,6 +3172,12 @@
3172 ramdisk_size= [RAM] Sizes of RAM disks in kilobytes 3172 ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
3173 See Documentation/blockdev/ramdisk.txt. 3173 See Documentation/blockdev/ramdisk.txt.
3174 3174
3175 ras=option[,option,...] [KNL] RAS-specific options
3176
3177 cec_disable [X86]
3178 Disable the Correctable Errors Collector,
3179 see CONFIG_RAS_CEC help text.
3180
3175 rcu_nocbs= [KNL] 3181 rcu_nocbs= [KNL]
3176 The argument is a cpu list, as described above. 3182 The argument is a cpu list, as described above.
3177 3183
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 0512dcc11750..c5ae545d27d8 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -191,10 +191,11 @@ extern struct mca_config mca_cfg;
191extern struct mca_msr_regs msr_ops; 191extern struct mca_msr_regs msr_ops;
192 192
193enum mce_notifier_prios { 193enum mce_notifier_prios {
194 MCE_PRIO_SRAO = INT_MAX, 194 MCE_PRIO_FIRST = INT_MAX,
195 MCE_PRIO_EXTLOG = INT_MAX - 1, 195 MCE_PRIO_SRAO = INT_MAX - 1,
196 MCE_PRIO_NFIT = INT_MAX - 2, 196 MCE_PRIO_EXTLOG = INT_MAX - 2,
197 MCE_PRIO_EDAC = INT_MAX - 3, 197 MCE_PRIO_NFIT = INT_MAX - 3,
198 MCE_PRIO_EDAC = INT_MAX - 4,
198 MCE_PRIO_LOWEST = 0, 199 MCE_PRIO_LOWEST = 0,
199}; 200};
200 201
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8ada093d4e40..4a907758a516 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -35,6 +35,7 @@
35#include <linux/poll.h> 35#include <linux/poll.h>
36#include <linux/nmi.h> 36#include <linux/nmi.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/ras.h>
38#include <linux/smp.h> 39#include <linux/smp.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40#include <linux/mm.h> 41#include <linux/mm.h>
@@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = {
160 161
161void mce_log(struct mce *m) 162void mce_log(struct mce *m)
162{ 163{
163 unsigned next, entry;
164
165 /* Emit the trace record: */
166 trace_mce_record(m);
167
168 if (!mce_gen_pool_add(m)) 164 if (!mce_gen_pool_add(m))
169 irq_work_queue(&mce_irq_work); 165 irq_work_queue(&mce_irq_work);
170
171 wmb();
172 for (;;) {
173 entry = mce_log_get_idx_check(mcelog_buf.next);
174 for (;;) {
175
176 /*
177 * When the buffer fills up discard new entries.
178 * Assume that the earlier errors are the more
179 * interesting ones:
180 */
181 if (entry >= MCE_LOG_LEN) {
182 set_bit(MCE_OVERFLOW,
183 (unsigned long *)&mcelog_buf.flags);
184 return;
185 }
186 /* Old left over entry. Skip: */
187 if (mcelog_buf.entry[entry].finished) {
188 entry++;
189 continue;
190 }
191 break;
192 }
193 smp_rmb();
194 next = entry + 1;
195 if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
196 break;
197 }
198 memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
199 wmb();
200 mcelog_buf.entry[entry].finished = 1;
201 wmb();
202
203 set_bit(0, &mce_need_notify);
204} 166}
205 167
206void mce_inject_log(struct mce *m) 168void mce_inject_log(struct mce *m)
@@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
213 175
214static struct notifier_block mce_srao_nb; 176static struct notifier_block mce_srao_nb;
215 177
178/*
179 * We run the default notifier if we have only the SRAO, the first and the
180 * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
181 * notifiers registered on the chain.
182 */
183#define NUM_DEFAULT_NOTIFIERS 3
216static atomic_t num_notifiers; 184static atomic_t num_notifiers;
217 185
218void mce_register_decode_chain(struct notifier_block *nb) 186void mce_register_decode_chain(struct notifier_block *nb)
@@ -522,7 +490,6 @@ static void mce_schedule_work(void)
522 490
523static void mce_irq_work_cb(struct irq_work *entry) 491static void mce_irq_work_cb(struct irq_work *entry)
524{ 492{
525 mce_notify_irq();
526 mce_schedule_work(); 493 mce_schedule_work();
527} 494}
528 495
@@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m)
565 return 1; 532 return 1;
566} 533}
567 534
535static bool memory_error(struct mce *m)
536{
537 struct cpuinfo_x86 *c = &boot_cpu_data;
538
539 if (c->x86_vendor == X86_VENDOR_AMD) {
540 /* ErrCodeExt[20:16] */
541 u8 xec = (m->status >> 16) & 0x1f;
542
543 return (xec == 0x0 || xec == 0x8);
544 } else if (c->x86_vendor == X86_VENDOR_INTEL) {
545 /*
546 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
547 *
548 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
549 * indicating a memory error. Bit 8 is used for indicating a
550 * cache hierarchy error. The combination of bit 2 and bit 3
551 * is used for indicating a `generic' cache hierarchy error
552 * But we can't just blindly check the above bits, because if
553 * bit 11 is set, then it is a bus/interconnect error - and
554 * either way the above bits just gives more detail on what
555 * bus/interconnect error happened. Note that bit 12 can be
556 * ignored, as it's the "filter" bit.
557 */
558 return (m->status & 0xef80) == BIT(7) ||
559 (m->status & 0xef00) == BIT(8) ||
560 (m->status & 0xeffc) == 0xc;
561 }
562
563 return false;
564}
565
566static bool cec_add_mce(struct mce *m)
567{
568 if (!m)
569 return false;
570
571 /* We eat only correctable DRAM errors with usable addresses. */
572 if (memory_error(m) &&
573 !(m->status & MCI_STATUS_UC) &&
574 mce_usable_address(m))
575 if (!cec_add_elem(m->addr >> PAGE_SHIFT))
576 return true;
577
578 return false;
579}
580
581static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
582 void *data)
583{
584 struct mce *m = (struct mce *)data;
585 unsigned int next, entry;
586
587 if (!m)
588 return NOTIFY_DONE;
589
590 if (cec_add_mce(m))
591 return NOTIFY_STOP;
592
593 /* Emit the trace record: */
594 trace_mce_record(m);
595
596 wmb();
597 for (;;) {
598 entry = mce_log_get_idx_check(mcelog_buf.next);
599 for (;;) {
600
601 /*
602 * When the buffer fills up discard new entries.
603 * Assume that the earlier errors are the more
604 * interesting ones:
605 */
606 if (entry >= MCE_LOG_LEN) {
607 set_bit(MCE_OVERFLOW,
608 (unsigned long *)&mcelog_buf.flags);
609 return NOTIFY_DONE;
610 }
611 /* Old left over entry. Skip: */
612 if (mcelog_buf.entry[entry].finished) {
613 entry++;
614 continue;
615 }
616 break;
617 }
618 smp_rmb();
619 next = entry + 1;
620 if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
621 break;
622 }
623 memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
624 wmb();
625 mcelog_buf.entry[entry].finished = 1;
626 wmb();
627
628 set_bit(0, &mce_need_notify);
629
630 mce_notify_irq();
631
632 return NOTIFY_DONE;
633}
634
635static struct notifier_block first_nb = {
636 .notifier_call = mce_first_notifier,
637 .priority = MCE_PRIO_FIRST,
638};
639
568static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, 640static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
569 void *data) 641 void *data)
570{ 642{
@@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
594 if (!m) 666 if (!m)
595 return NOTIFY_DONE; 667 return NOTIFY_DONE;
596 668
597 /* 669 if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
598 * Run the default notifier if we have only the SRAO
599 * notifier and us registered.
600 */
601 if (atomic_read(&num_notifiers) > 2)
602 return NOTIFY_DONE; 670 return NOTIFY_DONE;
603 671
604 /* Don't print when mcelog is running */ 672 /* Don't print when mcelog is running */
@@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i)
655 } 723 }
656} 724}
657 725
658static bool memory_error(struct mce *m)
659{
660 struct cpuinfo_x86 *c = &boot_cpu_data;
661
662 if (c->x86_vendor == X86_VENDOR_AMD) {
663 /* ErrCodeExt[20:16] */
664 u8 xec = (m->status >> 16) & 0x1f;
665
666 return (xec == 0x0 || xec == 0x8);
667 } else if (c->x86_vendor == X86_VENDOR_INTEL) {
668 /*
669 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
670 *
671 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
672 * indicating a memory error. Bit 8 is used for indicating a
673 * cache hierarchy error. The combination of bit 2 and bit 3
674 * is used for indicating a `generic' cache hierarchy error
675 * But we can't just blindly check the above bits, because if
676 * bit 11 is set, then it is a bus/interconnect error - and
677 * either way the above bits just gives more detail on what
678 * bus/interconnect error happened. Note that bit 12 can be
679 * ignored, as it's the "filter" bit.
680 */
681 return (m->status & 0xef80) == BIT(7) ||
682 (m->status & 0xef00) == BIT(8) ||
683 (m->status & 0xeffc) == 0xc;
684 }
685
686 return false;
687}
688
689DEFINE_PER_CPU(unsigned, mce_poll_count); 726DEFINE_PER_CPU(unsigned, mce_poll_count);
690 727
691/* 728/*
@@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable);
2167int __init mcheck_init(void) 2204int __init mcheck_init(void)
2168{ 2205{
2169 mcheck_intel_therm_init(); 2206 mcheck_intel_therm_init();
2207 mce_register_decode_chain(&first_nb);
2170 mce_register_decode_chain(&mce_srao_nb); 2208 mce_register_decode_chain(&mce_srao_nb);
2171 mce_register_decode_chain(&mce_default_nb); 2209 mce_register_decode_chain(&mce_default_nb);
2172 mcheck_vendor_init_severity(); 2210 mcheck_vendor_init_severity();
@@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void)
2716 static_branch_inc(&mcsafe_key); 2754 static_branch_inc(&mcsafe_key);
2717 2755
2718 mcheck_debugfs_init(); 2756 mcheck_debugfs_init();
2757 cec_init();
2719 2758
2720 /* 2759 /*
2721 * Flush out everything that has been logged during early boot, now that 2760 * Flush out everything that has been logged during early boot, now that
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 0bc60a308730..2a2d89d39af6 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
7 aspects of the MCE handling code. 7 aspects of the MCE handling code.
8 8
9 WARNING: Do not even assume this interface is staying stable! 9 WARNING: Do not even assume this interface is staying stable!
10
11config RAS_CEC
12 bool "Correctable Errors Collector"
13 depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
14 ---help---
15 This is a small cache which collects correctable memory errors per 4K
16 page PFN and counts their repeated occurrence. Once the counter for a
17 PFN overflows, we try to soft-offline that page as we take it to mean
18 that it has reached a relatively high error count and would probably
19 be best if we don't use it anymore.
20
21 Bear in mind that this is absolutely useless if your platform doesn't
22 have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
23
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile
index d7f73341ced3..7b26dd3aa5d0 100644
--- a/drivers/ras/Makefile
+++ b/drivers/ras/Makefile
@@ -1 +1,2 @@
1obj-$(CONFIG_RAS) += ras.o debugfs.o 1obj-$(CONFIG_RAS) += ras.o debugfs.o
2obj-$(CONFIG_RAS_CEC) += cec.o
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
new file mode 100644
index 000000000000..6aab46d91d33
--- /dev/null
+++ b/drivers/ras/cec.c
@@ -0,0 +1,532 @@
1#include <linux/mm.h>
2#include <linux/gfp.h>
3#include <linux/kernel.h>
4
5#include <asm/mce.h>
6
7#include "debugfs.h"
8
9/*
10 * RAS Correctable Errors Collector
11 *
12 * This is a simple gadget which collects correctable errors and counts their
13 * occurrence per physical page address.
14 *
15 * We've opted for possibly the simplest data structure to collect those - an
16 * array of the size of a memory page. It stores 512 u64's with the following
17 * structure:
18 *
19 * [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0]
20 *
21 * The generation in the two highest order bits is two bits which are set to 11b
22 * on every insertion. During the course of each entry's existence, the
23 * generation field gets decremented during spring cleaning to 10b, then 01b and
24 * then 00b.
25 *
26 * This way we're employing the natural numeric ordering to make sure that newly
27 * inserted/touched elements have higher 12-bit counts (which we've manufactured)
28 * and thus iterating over the array initially won't kick out those elements
29 * which were inserted last.
30 *
31 * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of
32 * elements entered into the array, during which, we're decaying all elements.
33 * If, after decay, an element gets inserted again, its generation is set to 11b
34 * to make sure it has higher numerical count than other, older elements and
 35 * thus emulate an LRU-like behavior when deleting elements to free up space
36 * in the page.
37 *
 38 * When an element reaches its max count of count_threshold, we try to poison
39 * it by assuming that errors triggered count_threshold times in a single page
40 * are excessive and that page shouldn't be used anymore. count_threshold is
41 * initialized to COUNT_MASK which is the maximum.
42 *
43 * That error event entry causes cec_add_elem() to return !0 value and thus
44 * signal to its callers to log the error.
45 *
46 * To the question why we've chosen a page and moving elements around with
47 * memmove(), it is because it is a very simple structure to handle and max data
48 * movement is 4K which on highly optimized modern CPUs is almost unnoticeable.
49 * We wanted to avoid the pointer traversal of more complex structures like a
50 * linked list or some sort of a balancing search tree.
51 *
52 * Deleting an element takes O(n) but since it is only a single page, it should
53 * be fast enough and it shouldn't happen all too often depending on error
54 * patterns.
55 */
56
57#undef pr_fmt
58#define pr_fmt(fmt) "RAS: " fmt
59
60/*
61 * We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long
62 * elements have stayed in the array without having been accessed again.
63 */
64#define DECAY_BITS 2
65#define DECAY_MASK ((1ULL << DECAY_BITS) - 1)
66#define MAX_ELEMS (PAGE_SIZE / sizeof(u64))
67
68/*
69 * Threshold amount of inserted elements after which we start spring
70 * cleaning.
71 */
72#define CLEAN_ELEMS (MAX_ELEMS >> DECAY_BITS)
73
74/* Bits which count the number of errors happened in this 4K page. */
75#define COUNT_BITS (PAGE_SHIFT - DECAY_BITS)
76#define COUNT_MASK ((1ULL << COUNT_BITS) - 1)
77#define FULL_COUNT_MASK (PAGE_SIZE - 1)
78
79/*
80 * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
81 */
82
83#define PFN(e) ((e) >> PAGE_SHIFT)
84#define DECAY(e) (((e) >> COUNT_BITS) & DECAY_MASK)
85#define COUNT(e) ((unsigned int)(e) & COUNT_MASK)
86#define FULL_COUNT(e) ((e) & (PAGE_SIZE - 1))
87
88static struct ce_array {
89 u64 *array; /* container page */
90 unsigned int n; /* number of elements in the array */
91
92 unsigned int decay_count; /*
93 * number of element insertions/increments
94 * since the last spring cleaning.
95 */
96
97 u64 pfns_poisoned; /*
98 * number of PFNs which got poisoned.
99 */
100
101 u64 ces_entered; /*
102 * The number of correctable errors
103 * entered into the collector.
104 */
105
106 u64 decays_done; /*
107 * Times we did spring cleaning.
108 */
109
110 union {
111 struct {
112 __u32 disabled : 1, /* cmdline disabled */
113 __resv : 31;
114 };
115 __u32 flags;
116 };
117} ce_arr;
118
119static DEFINE_MUTEX(ce_mutex);
120static u64 dfs_pfn;
121
122/* Amount of errors after which we offline */
123static unsigned int count_threshold = COUNT_MASK;
124
125/*
126 * The timer "decays" element count each timer_interval which is 24hrs by
127 * default.
128 */
129
130#define CEC_TIMER_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */
131#define CEC_TIMER_MIN_INTERVAL 1 * 60 * 60 /* 1h */
132#define CEC_TIMER_MAX_INTERVAL 30 * 24 * 60 * 60 /* one month */
133static struct timer_list cec_timer;
134static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;
135
136/*
137 * Decrement decay value. We're using DECAY_BITS bits to denote decay of an
138 * element in the array. On insertion and any access, it gets reset to max.
139 */
140static void do_spring_cleaning(struct ce_array *ca)
141{
142 int i;
143
144 for (i = 0; i < ca->n; i++) {
145 u8 decay = DECAY(ca->array[i]);
146
147 if (!decay)
148 continue;
149
150 decay--;
151
152 ca->array[i] &= ~(DECAY_MASK << COUNT_BITS);
153 ca->array[i] |= (decay << COUNT_BITS);
154 }
155 ca->decay_count = 0;
156 ca->decays_done++;
157}
158
159/*
160 * @interval in seconds
161 */
162static void cec_mod_timer(struct timer_list *t, unsigned long interval)
163{
164 unsigned long iv;
165
166 iv = interval * HZ + jiffies;
167
168 mod_timer(t, round_jiffies(iv));
169}
170
171static void cec_timer_fn(unsigned long data)
172{
173 struct ce_array *ca = (struct ce_array *)data;
174
175 do_spring_cleaning(ca);
176
177 cec_mod_timer(&cec_timer, timer_interval);
178}
179
180/*
 181 * @to: index of the smallest element which is >= than @pfn.
182 *
183 * Return the index of the pfn if found, otherwise negative value.
184 */
185static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
186{
187 u64 this_pfn;
188 int min = 0, max = ca->n;
189
190 while (min < max) {
191 int tmp = (max + min) >> 1;
192
193 this_pfn = PFN(ca->array[tmp]);
194
195 if (this_pfn < pfn)
196 min = tmp + 1;
197 else if (this_pfn > pfn)
198 max = tmp;
199 else {
200 min = tmp;
201 break;
202 }
203 }
204
205 if (to)
206 *to = min;
207
208 this_pfn = PFN(ca->array[min]);
209
210 if (this_pfn == pfn)
211 return min;
212
213 return -ENOKEY;
214}
215
216static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
217{
218 WARN_ON(!to);
219
220 if (!ca->n) {
221 *to = 0;
222 return -ENOKEY;
223 }
224 return __find_elem(ca, pfn, to);
225}
226
227static void del_elem(struct ce_array *ca, int idx)
228{
229 /* Save us a function call when deleting the last element. */
230 if (ca->n - (idx + 1))
231 memmove((void *)&ca->array[idx],
232 (void *)&ca->array[idx + 1],
233 (ca->n - (idx + 1)) * sizeof(u64));
234
235 ca->n--;
236}
237
238static u64 del_lru_elem_unlocked(struct ce_array *ca)
239{
240 unsigned int min = FULL_COUNT_MASK;
241 int i, min_idx = 0;
242
243 for (i = 0; i < ca->n; i++) {
244 unsigned int this = FULL_COUNT(ca->array[i]);
245
246 if (min > this) {
247 min = this;
248 min_idx = i;
249 }
250 }
251
252 del_elem(ca, min_idx);
253
254 return PFN(ca->array[min_idx]);
255}
256
257/*
258 * We return the 0th pfn in the error case under the assumption that it cannot
259 * be poisoned and excessive CEs in there are a serious deal anyway.
260 */
261static u64 __maybe_unused del_lru_elem(void)
262{
263 struct ce_array *ca = &ce_arr;
264 u64 pfn;
265
266 if (!ca->n)
267 return 0;
268
269 mutex_lock(&ce_mutex);
270 pfn = del_lru_elem_unlocked(ca);
271 mutex_unlock(&ce_mutex);
272
273 return pfn;
274}
275
276
277int cec_add_elem(u64 pfn)
278{
279 struct ce_array *ca = &ce_arr;
280 unsigned int to;
281 int count, ret = 0;
282
283 /*
284 * We can be called very early on the identify_cpu() path where we are
285 * not initialized yet. We ignore the error for simplicity.
286 */
287 if (!ce_arr.array || ce_arr.disabled)
288 return -ENODEV;
289
290 ca->ces_entered++;
291
292 mutex_lock(&ce_mutex);
293
294 if (ca->n == MAX_ELEMS)
295 WARN_ON(!del_lru_elem_unlocked(ca));
296
297 ret = find_elem(ca, pfn, &to);
298 if (ret < 0) {
299 /*
300 * Shift range [to-end] to make room for one more element.
301 */
302 memmove((void *)&ca->array[to + 1],
303 (void *)&ca->array[to],
304 (ca->n - to) * sizeof(u64));
305
306 ca->array[to] = (pfn << PAGE_SHIFT) |
307 (DECAY_MASK << COUNT_BITS) | 1;
308
309 ca->n++;
310
311 ret = 0;
312
313 goto decay;
314 }
315
316 count = COUNT(ca->array[to]);
317
318 if (count < count_threshold) {
319 ca->array[to] |= (DECAY_MASK << COUNT_BITS);
320 ca->array[to]++;
321
322 ret = 0;
323 } else {
324 u64 pfn = ca->array[to] >> PAGE_SHIFT;
325
326 if (!pfn_valid(pfn)) {
327 pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
328 } else {
329 /* We have reached max count for this page, soft-offline it. */
330 pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
331 memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
332 ca->pfns_poisoned++;
333 }
334
335 del_elem(ca, to);
336
337 /*
338 * Return a >0 value to denote that we've reached the offlining
339 * threshold.
340 */
341 ret = 1;
342
343 goto unlock;
344 }
345
346decay:
347 ca->decay_count++;
348
349 if (ca->decay_count >= CLEAN_ELEMS)
350 do_spring_cleaning(ca);
351
352unlock:
353 mutex_unlock(&ce_mutex);
354
355 return ret;
356}
357
358static int u64_get(void *data, u64 *val)
359{
360 *val = *(u64 *)data;
361
362 return 0;
363}
364
365static int pfn_set(void *data, u64 val)
366{
367 *(u64 *)data = val;
368
369 return cec_add_elem(val);
370}
371
372DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
373
374static int decay_interval_set(void *data, u64 val)
375{
376 *(u64 *)data = val;
377
378 if (val < CEC_TIMER_MIN_INTERVAL)
379 return -EINVAL;
380
381 if (val > CEC_TIMER_MAX_INTERVAL)
382 return -EINVAL;
383
384 timer_interval = val;
385
386 cec_mod_timer(&cec_timer, timer_interval);
387 return 0;
388}
389DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
390
391static int count_threshold_set(void *data, u64 val)
392{
393 *(u64 *)data = val;
394
395 if (val > COUNT_MASK)
396 val = COUNT_MASK;
397
398 count_threshold = val;
399
400 return 0;
401}
402DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
403
404static int array_dump(struct seq_file *m, void *v)
405{
406 struct ce_array *ca = &ce_arr;
407 u64 prev = 0;
408 int i;
409
410 mutex_lock(&ce_mutex);
411
412 seq_printf(m, "{ n: %d\n", ca->n);
413 for (i = 0; i < ca->n; i++) {
414 u64 this = PFN(ca->array[i]);
415
416 seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
417
418 WARN_ON(prev > this);
419
420 prev = this;
421 }
422
423 seq_printf(m, "}\n");
424
425 seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
426 ca->ces_entered, ca->pfns_poisoned);
427
428 seq_printf(m, "Flags: 0x%x\n", ca->flags);
429
430 seq_printf(m, "Timer interval: %lld seconds\n", timer_interval);
431 seq_printf(m, "Decays: %lld\n", ca->decays_done);
432
433 seq_printf(m, "Action threshold: %d\n", count_threshold);
434
435 mutex_unlock(&ce_mutex);
436
437 return 0;
438}
439
440static int array_open(struct inode *inode, struct file *filp)
441{
442 return single_open(filp, array_dump, NULL);
443}
444
445static const struct file_operations array_ops = {
446 .owner = THIS_MODULE,
447 .open = array_open,
448 .read = seq_read,
449 .llseek = seq_lseek,
450 .release = single_release,
451};
452
453static int __init create_debugfs_nodes(void)
454{
455 struct dentry *d, *pfn, *decay, *count, *array;
456
457 d = debugfs_create_dir("cec", ras_debugfs_dir);
458 if (!d) {
459 pr_warn("Error creating cec debugfs node!\n");
460 return -1;
461 }
462
463 pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
464 if (!pfn) {
465 pr_warn("Error creating pfn debugfs node!\n");
466 goto err;
467 }
468
469 array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
470 if (!array) {
471 pr_warn("Error creating array debugfs node!\n");
472 goto err;
473 }
474
475 decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
476 &timer_interval, &decay_interval_ops);
477 if (!decay) {
478 pr_warn("Error creating decay_interval debugfs node!\n");
479 goto err;
480 }
481
482 count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
483 &count_threshold, &count_threshold_ops);
484 if (!decay) {
485 pr_warn("Error creating count_threshold debugfs node!\n");
486 goto err;
487 }
488
489
490 return 0;
491
492err:
493 debugfs_remove_recursive(d);
494
495 return 1;
496}
497
498void __init cec_init(void)
499{
500 if (ce_arr.disabled)
501 return;
502
503 ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
504 if (!ce_arr.array) {
505 pr_err("Error allocating CE array page!\n");
506 return;
507 }
508
509 if (create_debugfs_nodes())
510 return;
511
512 setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr);
513 cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);
514
515 pr_info("Correctable Errors collector initialized.\n");
516}
517
518int __init parse_cec_param(char *str)
519{
520 if (!str)
521 return 0;
522
523 if (*str == '=')
524 str++;
525
526 if (!strncmp(str, "cec_disable", 7))
527 ce_arr.disabled = 1;
528 else
529 return 0;
530
531 return 1;
532}
diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c
index 0322acf67ea5..501603057dff 100644
--- a/drivers/ras/debugfs.c
+++ b/drivers/ras/debugfs.c
@@ -1,6 +1,6 @@
1#include <linux/debugfs.h> 1#include <linux/debugfs.h>
2 2
3static struct dentry *ras_debugfs_dir; 3struct dentry *ras_debugfs_dir;
4 4
5static atomic_t trace_count = ATOMIC_INIT(0); 5static atomic_t trace_count = ATOMIC_INIT(0);
6 6
diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h
new file mode 100644
index 000000000000..db72e4513191
--- /dev/null
+++ b/drivers/ras/debugfs.h
@@ -0,0 +1,8 @@
1#ifndef __RAS_DEBUGFS_H__
2#define __RAS_DEBUGFS_H__
3
4#include <linux/debugfs.h>
5
6extern struct dentry *ras_debugfs_dir;
7
8#endif /* __RAS_DEBUGFS_H__ */
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd362b7b6..94f8038864b4 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
27EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); 27EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
28#endif 28#endif
29EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); 29EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
30
31
32int __init parse_ras_param(char *str)
33{
34#ifdef CONFIG_RAS_CEC
35 parse_cec_param(str);
36#endif
37
38 return 1;
39}
40__setup("ras", parse_ras_param);
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 2aceeafd6fe5..ffb147185e8d 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -1,14 +1,25 @@
1#ifndef __RAS_H__ 1#ifndef __RAS_H__
2#define __RAS_H__ 2#define __RAS_H__
3 3
4#include <asm/errno.h>
5
4#ifdef CONFIG_DEBUG_FS 6#ifdef CONFIG_DEBUG_FS
5int ras_userspace_consumers(void); 7int ras_userspace_consumers(void);
6void ras_debugfs_init(void); 8void ras_debugfs_init(void);
7int ras_add_daemon_trace(void); 9int ras_add_daemon_trace(void);
8#else 10#else
9static inline int ras_userspace_consumers(void) { return 0; } 11static inline int ras_userspace_consumers(void) { return 0; }
10static inline void ras_debugfs_init(void) { return; } 12static inline void ras_debugfs_init(void) { }
11static inline int ras_add_daemon_trace(void) { return 0; } 13static inline int ras_add_daemon_trace(void) { return 0; }
12#endif 14#endif
13 15
16#ifdef CONFIG_RAS_CEC
17void __init cec_init(void);
18int __init parse_cec_param(char *str);
19int cec_add_elem(u64 pfn);
20#else
21static inline void __init cec_init(void) { }
22static inline int cec_add_elem(u64 pfn) { return -ENODEV; }
14#endif 23#endif
24
25#endif /* __RAS_H__ */