diff options
Diffstat (limited to 'drivers/edac/edac_mc.h')
-rw-r--r-- | drivers/edac/edac_mc.h | 448 |
1 files changed, 448 insertions, 0 deletions
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h new file mode 100644 index 000000000000..75ecf484a43a --- /dev/null +++ b/drivers/edac/edac_mc.h | |||
@@ -0,0 +1,448 @@ | |||
1 | /* | ||
2 | * MC kernel module | ||
3 | * (C) 2003 Linux Networx (http://lnxi.com) | ||
4 | * This file may be distributed under the terms of the | ||
5 | * GNU General Public License. | ||
6 | * | ||
7 | * Written by Thayne Harbaugh | ||
8 | * Based on work by Dan Hollis <goemon at anime dot net> and others. | ||
9 | * http://www.anime.net/~goemon/linux-ecc/ | ||
10 | * | ||
11 | * NMI handling support added by | ||
12 | * Dave Peterson <dsp@llnl.gov> <dave_peterson@pobox.com> | ||
13 | * | ||
14 | * $Id: edac_mc.h,v 1.4.2.10 2005/10/05 00:43:44 dsp_llnl Exp $ | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | |||
19 | #ifndef _EDAC_MC_H_ | ||
20 | #define _EDAC_MC_H_ | ||
21 | |||
22 | |||
23 | #include <linux/config.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/types.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/spinlock.h> | ||
28 | #include <linux/smp.h> | ||
29 | #include <linux/pci.h> | ||
30 | #include <linux/time.h> | ||
31 | #include <linux/nmi.h> | ||
32 | #include <linux/rcupdate.h> | ||
33 | #include <linux/completion.h> | ||
34 | #include <linux/kobject.h> | ||
35 | |||
36 | |||
/*
 * String length limits.  Buffers sized from these constants are declared
 * with one extra byte (presumably for the terminating NUL).
 */
#define EDAC_MC_LABEL_LEN	31
#define MC_PROC_NAME_MAX_LEN	7

/*
 * Convert a count of pages into MiB.  The shift direction depends on
 * whether a page is smaller than 1 MiB (PAGE_SHIFT < 20) or not.
 */
#if PAGE_SHIFT >= 20
#define PAGES_TO_MiB( pages )	( ( pages ) << ( PAGE_SHIFT - 20 ) )
#else /* PAGE_SHIFT < 20 */
#define PAGES_TO_MiB( pages )	( ( pages ) >> ( 20 - PAGE_SHIFT ) )
#endif
45 | |||
/*
 * Debug output helpers.
 *
 * With CONFIG_EDAC_DEBUG set, debugfN(fmt, ...) prints at KERN_DEBUG
 * whenever N <= edac_debug_level.  Without it, every debugfN() expands
 * to nothing, so its arguments are never evaluated.
 */
#ifdef CONFIG_EDAC_DEBUG
extern int edac_debug_level;

/* 'level' is parenthesised so that any expression may be passed safely. */
#define edac_debug_printk(level, fmt, args...)				\
	do {								\
		if ((level) <= edac_debug_level)			\
			printk(KERN_DEBUG fmt, ##args);			\
	} while (0)

#define debugf0( ... ) edac_debug_printk(0, __VA_ARGS__ )
#define debugf1( ... ) edac_debug_printk(1, __VA_ARGS__ )
#define debugf2( ... ) edac_debug_printk(2, __VA_ARGS__ )
#define debugf3( ... ) edac_debug_printk(3, __VA_ARGS__ )
#define debugf4( ... ) edac_debug_printk(4, __VA_ARGS__ )
#else /* !CONFIG_EDAC_DEBUG */
#define debugf0( ... )
#define debugf1( ... )
#define debugf2( ... )
#define debugf3( ... )
#define debugf4( ... )
#endif /* !CONFIG_EDAC_DEBUG */
62 | |||
63 | |||
/* Turn the argument into a string literal without expanding it. */
#define bs_str(s) #s
/* Macro-expand the argument first, then turn the result into a string. */
#define bs_xstr(s) bs_str(s)
/* KBUILD_BASENAME as a string literal. */
#define BS_MOD_STR bs_xstr(KBUILD_BASENAME)

/* Integer with only bit number 'x' set. */
#define BIT(x) (1 << (x))

/*
 * Expand to the comma-separated pair
 *   PCI_VENDOR_ID_<vend>, PCI_DEVICE_ID_<vend>_<dev>
 */
#define PCI_VEND_DEV(vend, dev) \
	PCI_VENDOR_ID_ ## vend, PCI_DEVICE_ID_ ## vend ## _ ## dev
71 | |||
72 | /* memory devices */ | ||
/*
 * Memory device types.  The DEV_Xn names presumably give the device
 * data width in bits - TODO confirm.
 */
enum dev_type {
	DEV_UNKNOWN = 0,
	DEV_X1 = 1,
	DEV_X2 = 2,
	DEV_X4 = 3,
	DEV_X8 = 4,
	DEV_X16 = 5,
	DEV_X32 = 6,		/* Do these parts exist? */
	DEV_X64 = 7		/* Do these parts exist? */
};

/* One bit per dev_type value, for use in capability bitmasks. */
#define DEV_FLAG_UNKNOWN	BIT(DEV_UNKNOWN)
#define DEV_FLAG_X1		BIT(DEV_X1)
#define DEV_FLAG_X2		BIT(DEV_X2)
#define DEV_FLAG_X4		BIT(DEV_X4)
#define DEV_FLAG_X8		BIT(DEV_X8)
#define DEV_FLAG_X16		BIT(DEV_X16)
#define DEV_FLAG_X32		BIT(DEV_X32)
#define DEV_FLAG_X64		BIT(DEV_X64)
92 | |||
93 | /* memory types */ | ||
/* Memory types a chip-select row can be populated with. */
enum mem_type {
	MEM_EMPTY = 0,		/* Empty csrow */
	MEM_RESERVED = 1,	/* Reserved csrow type */
	MEM_UNKNOWN = 2,	/* Unknown csrow type */
	MEM_FPM = 3,		/* Fast page mode */
	MEM_EDO = 4,		/* Extended data out */
	MEM_BEDO = 5,		/* Burst Extended data out */
	MEM_SDR = 6,		/* Single data rate SDRAM */
	MEM_RDR = 7,		/* Registered single data rate SDRAM */
	MEM_DDR = 8,		/* Double data rate SDRAM */
	MEM_RDDR = 9,		/* Registered Double data rate SDRAM */
	MEM_RMBS = 10		/* Rambus DRAM */
};

/* One bit per mem_type value, for use in capability bitmasks. */
#define MEM_FLAG_EMPTY		BIT(MEM_EMPTY)
#define MEM_FLAG_RESERVED	BIT(MEM_RESERVED)
#define MEM_FLAG_UNKNOWN	BIT(MEM_UNKNOWN)
#define MEM_FLAG_FPM		BIT(MEM_FPM)
#define MEM_FLAG_EDO		BIT(MEM_EDO)
#define MEM_FLAG_BEDO		BIT(MEM_BEDO)
#define MEM_FLAG_SDR		BIT(MEM_SDR)
#define MEM_FLAG_RDR		BIT(MEM_RDR)
#define MEM_FLAG_DDR		BIT(MEM_DDR)
#define MEM_FLAG_RDDR		BIT(MEM_RDDR)
#define MEM_FLAG_RMBS		BIT(MEM_RMBS)
119 | |||
120 | |||
121 | /* chipset Error Detection and Correction capabilities and mode */ | ||
/* Chipset Error Detection and Correction capabilities and mode. */
enum edac_type {
	EDAC_UNKNOWN = 0,	/* Unknown if ECC is available */
	EDAC_NONE = 1,		/* Does not support ECC */
	EDAC_RESERVED = 2,	/* Reserved ECC type */
	EDAC_PARITY = 3,	/* Detects parity errors */
	EDAC_EC = 4,		/* Error Checking - no correction */
	EDAC_SECDED = 5,	/* Single bit error correction, Double detection */
	EDAC_S2ECD2ED = 6,	/* Chipkill x2 devices - do these exist? */
	EDAC_S4ECD4ED = 7,	/* Chipkill x4 devices */
	EDAC_S8ECD8ED = 8,	/* Chipkill x8 devices */
	EDAC_S16ECD16ED = 9,	/* Chipkill x16 devices */
};

/*
 * One bit per edac_type value, for use in capability bitmasks.
 * There is no flag for EDAC_RESERVED.
 */
#define EDAC_FLAG_UNKNOWN	BIT(EDAC_UNKNOWN)
#define EDAC_FLAG_NONE		BIT(EDAC_NONE)
#define EDAC_FLAG_PARITY	BIT(EDAC_PARITY)
#define EDAC_FLAG_EC		BIT(EDAC_EC)
#define EDAC_FLAG_SECDED	BIT(EDAC_SECDED)
#define EDAC_FLAG_S2ECD2ED	BIT(EDAC_S2ECD2ED)
#define EDAC_FLAG_S4ECD4ED	BIT(EDAC_S4ECD4ED)
#define EDAC_FLAG_S8ECD8ED	BIT(EDAC_S8ECD8ED)
#define EDAC_FLAG_S16ECD16ED	BIT(EDAC_S16ECD16ED)
144 | |||
145 | |||
146 | /* scrubbing capabilities */ | ||
/* Scrubbing capabilities. */
enum scrub_type {
	SCRUB_UNKNOWN = 0,	/* Unknown if scrubber is available */
	SCRUB_NONE,		/* No scrubber */
	SCRUB_SW_PROG,		/* SW progressive (sequential) scrubbing */
	SCRUB_SW_SRC,		/* Software scrub only errors */
	SCRUB_SW_PROG_SRC,	/* Progressive software scrub from an error */
	SCRUB_SW_TUNABLE,	/* Software scrub frequency is tunable */
	SCRUB_HW_PROG,		/* HW progressive (sequential) scrubbing */
	SCRUB_HW_SRC,		/* Hardware scrub only errors */
	SCRUB_HW_PROG_SRC,	/* Progressive hardware scrub from an error */
	SCRUB_HW_TUNABLE	/* Hardware scrub frequency is tunable */
};

/*
 * One bit per scrub_type value, for use in capability bitmasks.
 * Each flag must name an enumerator that actually exists in
 * enum scrub_type above; there are no flags for SCRUB_UNKNOWN or
 * SCRUB_NONE.
 */
#define SCRUB_FLAG_SW_PROG	BIT(SCRUB_SW_PROG)
#define SCRUB_FLAG_SW_SRC	BIT(SCRUB_SW_SRC)
#define SCRUB_FLAG_SW_PROG_SRC	BIT(SCRUB_SW_PROG_SRC)
#define SCRUB_FLAG_SW_TUN	BIT(SCRUB_SW_TUNABLE)
#define SCRUB_FLAG_HW_PROG	BIT(SCRUB_HW_PROG)
#define SCRUB_FLAG_HW_SRC	BIT(SCRUB_HW_SRC)
#define SCRUB_FLAG_HW_PROG_SRC	BIT(SCRUB_HW_PROG_SRC)
#define SCRUB_FLAG_HW_TUN	BIT(SCRUB_HW_TUNABLE)
168 | |||
/* Whether the sysfs entries of a memory controller are registered. */
enum mci_sysfs_status {
	MCI_SYSFS_INACTIVE = 0,	/* sysfs entries NOT registered */
	MCI_SYSFS_ACTIVE = 1	/* sysfs entries ARE registered */
};
173 | |||
174 | /* FIXME - should have notify capabilities: NMI, LOG, PROC, etc */ | ||
175 | |||
176 | /* | ||
177 | * There are several things to be aware of that aren't at all obvious: | ||
178 | * | ||
179 | * | ||
180 | * SOCKETS, SOCKET SETS, BANKS, ROWS, CHIP-SELECT ROWS, CHANNELS, etc.. | ||
181 | * | ||
182 | * These are some of the many terms that are thrown about that don't always | ||
183 | * mean what people think they mean (Inconceivable!). In the interest of | ||
184 | * creating a common ground for discussion, terms and their definitions | ||
185 | * will be established. | ||
186 | * | ||
187 | * Memory devices: The individual chip on a memory stick. These devices | ||
188 | * commonly output 4 and 8 bits each. Grouping several | ||
189 | * of these in parallel provides 64 bits which is common | ||
190 | * for a memory stick. | ||
191 | * | ||
192 | * Memory Stick: A printed circuit board that aggregates multiple | ||
193 | * memory devices in parallel. This is the atomic | ||
194 | * memory component that is purchasable by Joe consumer | ||
195 | * and loaded into a memory socket. | ||
196 | * | ||
197 | * Socket: A physical connector on the motherboard that accepts | ||
198 | * a single memory stick. | ||
199 | * | ||
200 | * Channel: Set of memory devices on a memory stick that must be | ||
201 | * grouped in parallel with one or more additional | ||
202 | * channels from other memory sticks. This parallel | ||
203 | * grouping of the output from multiple channels is | ||
204 | * necessary for the smallest granularity of memory access. | ||
205 | * Some memory controllers are capable of single channel - | ||
206 | * which means that memory sticks can be loaded | ||
207 | * individually. Other memory controllers are only | ||
208 | * capable of dual channel - which means that memory | ||
209 | * sticks must be loaded as pairs (see "socket set"). | ||
210 | * | ||
211 | * Chip-select row: All of the memory devices that are selected together | ||
212 | * for a single, minimum grain of memory access. | ||
213 | * This selects all of the parallel memory devices across | ||
214 | * all of the parallel channels. Common chip-select rows | ||
215 | * for single channel are 64 bits, for dual channel 128 | ||
216 | * bits. | ||
217 | * | ||
218 | * Single-Ranked stick: A Single-ranked stick has 1 chip-select row of memory. | ||
219 | * Motherboards commonly drive two chip-select pins to | ||
220 | * a memory stick. A single-ranked stick will occupy | ||
221 | * only one of those rows. The other will be unused. | ||
222 | * | ||
223 | * Double-Ranked stick: A double-ranked stick has two chip-select rows which | ||
224 | * access different sets of memory devices. The two | ||
225 | * rows cannot be accessed concurrently. | ||
226 | * | ||
227 | * Double-sided stick: DEPRECATED TERM, see Double-Ranked stick. | ||
228 | * A double-sided stick has two chip-select rows which | ||
229 | * access different sets of memory devices. The two | ||
230 | * rows cannot be accessed concurrently. "Double-sided" | ||
231 | * is irrespective of the memory devices being mounted | ||
232 | * on both sides of the memory stick. | ||
233 | * | ||
234 | * Socket set: All of the memory sticks that are required for | ||
235 | * a single memory access or all of the memory sticks | ||
236 | * spanned by a chip-select row. A single socket set | ||
237 | * has two chip-select rows and if double-sided sticks | ||
238 | * are used these will occupy those chip-select rows. | ||
239 | * | ||
240 | * Bank: This term is avoided because it is unclear when | ||
241 | * needing to distinguish between chip-select rows and | ||
242 | * socket sets. | ||
243 | * | ||
244 | * Controller pages: | ||
245 | * | ||
246 | * Physical pages: | ||
247 | * | ||
248 | * Virtual pages: | ||
249 | * | ||
250 | * | ||
251 | * STRUCTURE ORGANIZATION AND CHOICES | ||
252 | * | ||
253 | * | ||
254 | * | ||
255 | * PS - I enjoyed writing all that about as much as you enjoyed reading it. | ||
256 | */ | ||
257 | |||
258 | |||
259 | struct channel_info { | ||
260 | int chan_idx; /* channel index */ | ||
261 | u32 ce_count; /* Correctable Errors for this CHANNEL */ | ||
262 | char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */ | ||
263 | struct csrow_info *csrow; /* the parent */ | ||
264 | }; | ||
265 | |||
266 | |||
267 | struct csrow_info { | ||
268 | unsigned long first_page; /* first page number in dimm */ | ||
269 | unsigned long last_page; /* last page number in dimm */ | ||
270 | unsigned long page_mask; /* used for interleaving - | ||
271 | 0UL for non intlv */ | ||
272 | u32 nr_pages; /* number of pages in csrow */ | ||
273 | u32 grain; /* granularity of reported error in bytes */ | ||
274 | int csrow_idx; /* the chip-select row */ | ||
275 | enum dev_type dtype; /* memory device type */ | ||
276 | u32 ue_count; /* Uncorrectable Errors for this csrow */ | ||
277 | u32 ce_count; /* Correctable Errors for this csrow */ | ||
278 | enum mem_type mtype; /* memory csrow type */ | ||
279 | enum edac_type edac_mode; /* EDAC mode for this csrow */ | ||
280 | struct mem_ctl_info *mci; /* the parent */ | ||
281 | |||
282 | struct kobject kobj; /* sysfs kobject for this csrow */ | ||
283 | |||
284 | /* FIXME the number of CHANNELs might need to become dynamic */ | ||
285 | u32 nr_channels; | ||
286 | struct channel_info *channels; | ||
287 | }; | ||
288 | |||
289 | |||
290 | struct mem_ctl_info { | ||
291 | struct list_head link; /* for global list of mem_ctl_info structs */ | ||
292 | unsigned long mtype_cap; /* memory types supported by mc */ | ||
293 | unsigned long edac_ctl_cap; /* Mem controller EDAC capabilities */ | ||
294 | unsigned long edac_cap; /* configuration capabilities - this is | ||
295 | closely related to edac_ctl_cap. The | ||
296 | difference is that the controller | ||
297 | may be capable of s4ecd4ed which would | ||
298 | be listed in edac_ctl_cap, but if | ||
299 | channels aren't capable of s4ecd4ed then the | ||
300 | edac_cap would not have that capability. */ | ||
301 | unsigned long scrub_cap; /* chipset scrub capabilities */ | ||
302 | enum scrub_type scrub_mode; /* current scrub mode */ | ||
303 | |||
304 | enum mci_sysfs_status sysfs_active; /* status of sysfs */ | ||
305 | |||
306 | /* pointer to edac checking routine */ | ||
307 | void (*edac_check) (struct mem_ctl_info * mci); | ||
308 | /* | ||
309 | * Remaps memory pages: controller pages to physical pages. | ||
310 | * For most MC's, this will be NULL. | ||
311 | */ | ||
312 | /* FIXME - why not send the phys page to begin with? */ | ||
313 | unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci, | ||
314 | unsigned long page); | ||
315 | int mc_idx; | ||
316 | int nr_csrows; | ||
317 | struct csrow_info *csrows; | ||
318 | /* | ||
319 | * FIXME - what about controllers on other busses? - IDs must be | ||
320 | * unique. pdev pointer should be sufficiently unique, but | ||
321 | * BUS:SLOT.FUNC numbers may not be unique. | ||
322 | */ | ||
323 | struct pci_dev *pdev; | ||
324 | const char *mod_name; | ||
325 | const char *mod_ver; | ||
326 | const char *ctl_name; | ||
327 | char proc_name[MC_PROC_NAME_MAX_LEN + 1]; | ||
328 | void *pvt_info; | ||
329 | u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */ | ||
330 | u32 ce_noinfo_count; /* Correctable Errors w/o info */ | ||
331 | u32 ue_count; /* Total Uncorrectable Errors for this MC */ | ||
332 | u32 ce_count; /* Total Correctable Errors for this MC */ | ||
333 | unsigned long start_time; /* mci load start time (in jiffies) */ | ||
334 | |||
335 | /* this stuff is for safe removal of mc devices from global list while | ||
336 | * NMI handlers may be traversing list | ||
337 | */ | ||
338 | struct rcu_head rcu; | ||
339 | struct completion complete; | ||
340 | |||
341 | /* edac sysfs device control */ | ||
342 | struct kobject edac_mci_kobj; | ||
343 | }; | ||
344 | |||
345 | |||
346 | |||
347 | /* write all or some bits in a byte-register*/ | ||
348 | static inline void pci_write_bits8(struct pci_dev *pdev, int offset, | ||
349 | u8 value, u8 mask) | ||
350 | { | ||
351 | if (mask != 0xff) { | ||
352 | u8 buf; | ||
353 | pci_read_config_byte(pdev, offset, &buf); | ||
354 | value &= mask; | ||
355 | buf &= ~mask; | ||
356 | value |= buf; | ||
357 | } | ||
358 | pci_write_config_byte(pdev, offset, value); | ||
359 | } | ||
360 | |||
361 | |||
362 | /* write all or some bits in a word-register*/ | ||
363 | static inline void pci_write_bits16(struct pci_dev *pdev, int offset, | ||
364 | u16 value, u16 mask) | ||
365 | { | ||
366 | if (mask != 0xffff) { | ||
367 | u16 buf; | ||
368 | pci_read_config_word(pdev, offset, &buf); | ||
369 | value &= mask; | ||
370 | buf &= ~mask; | ||
371 | value |= buf; | ||
372 | } | ||
373 | pci_write_config_word(pdev, offset, value); | ||
374 | } | ||
375 | |||
376 | |||
377 | /* write all or some bits in a dword-register*/ | ||
378 | static inline void pci_write_bits32(struct pci_dev *pdev, int offset, | ||
379 | u32 value, u32 mask) | ||
380 | { | ||
381 | if (mask != 0xffff) { | ||
382 | u32 buf; | ||
383 | pci_read_config_dword(pdev, offset, &buf); | ||
384 | value &= mask; | ||
385 | buf &= ~mask; | ||
386 | value |= buf; | ||
387 | } | ||
388 | pci_write_config_dword(pdev, offset, value); | ||
389 | } | ||
390 | |||
391 | |||
392 | #ifdef CONFIG_EDAC_DEBUG | ||
393 | void edac_mc_dump_channel(struct channel_info *chan); | ||
394 | void edac_mc_dump_mci(struct mem_ctl_info *mci); | ||
395 | void edac_mc_dump_csrow(struct csrow_info *csrow); | ||
396 | #endif /* CONFIG_EDAC_DEBUG */ | ||
397 | |||
398 | extern int edac_mc_add_mc(struct mem_ctl_info *mci); | ||
399 | extern int edac_mc_del_mc(struct mem_ctl_info *mci); | ||
400 | |||
401 | extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, | ||
402 | unsigned long page); | ||
403 | |||
404 | extern struct mem_ctl_info *edac_mc_find_mci_by_pdev(struct pci_dev | ||
405 | *pdev); | ||
406 | |||
407 | extern void edac_mc_scrub_block(unsigned long page, | ||
408 | unsigned long offset, u32 size); | ||
409 | |||
410 | /* | ||
411 | * The no info errors are used when error overflows are reported. | ||
412 | * There are a limited number of error logging registers that can | ||
413 | * be exausted. When all registers are exhausted and an additional | ||
414 | * error occurs then an error overflow register records that an | ||
415 | * error occured and the type of error, but doesn't have any | ||
416 | * further information. The ce/ue versions make for cleaner | ||
417 | * reporting logic and function interface - reduces conditional | ||
418 | * statement clutter and extra function arguments. | ||
419 | */ | ||
420 | extern void edac_mc_handle_ce(struct mem_ctl_info *mci, | ||
421 | unsigned long page_frame_number, | ||
422 | unsigned long offset_in_page, | ||
423 | unsigned long syndrome, | ||
424 | int row, int channel, const char *msg); | ||
425 | |||
426 | extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, | ||
427 | const char *msg); | ||
428 | |||
429 | extern void edac_mc_handle_ue(struct mem_ctl_info *mci, | ||
430 | unsigned long page_frame_number, | ||
431 | unsigned long offset_in_page, | ||
432 | int row, const char *msg); | ||
433 | |||
434 | extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, | ||
435 | const char *msg); | ||
436 | |||
437 | /* | ||
438 | * This kmalloc's and initializes all the structures. | ||
439 | * Can't be used if all structures don't have the same lifetime. | ||
440 | */ | ||
441 | extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, | ||
442 | unsigned nr_csrows, unsigned nr_chans); | ||
443 | |||
444 | /* Free an mc previously allocated by edac_mc_alloc() */ | ||
445 | extern void edac_mc_free(struct mem_ctl_info *mci); | ||
446 | |||
447 | |||
448 | #endif /* _EDAC_MC_H_ */ | ||