aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac/edac_mc.h
diff options
context:
space:
mode:
authorAlan Cox <alan@lxorguk.ukuu.org.uk>2006-01-18 20:44:13 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-18 22:20:31 -0500
commitda9bb1d27b21cb24cbb6a2efb5d3c464d357a01e (patch)
tree016b66985a651d071d3873e74b115108ddf0b3f5 /drivers/edac/edac_mc.h
parent2f768af73fea4c70f9046388a7ff648ad11f028e (diff)
[PATCH] EDAC: core EDAC support code
This is a subset of the bluesmoke project core code, stripped of the NMI work which isn't ready to merge and some of the "interesting" proc functionality that needs reworking or just has no place in kernel. It requires no core kernel changes except the added scrub functions already posted. The goal is to merge further functionality only after the core code is accepted and proven in the base kernel, and only at the point the upstream extras are really ready to merge. From: doug thompson <norsk5@xmission.com> This converts EDAC to sysfs and is the final chunk neccessary before EDAC has a stable user space API and can be considered for submission into the base kernel. Signed-off-by: Alan Cox <alan@redhat.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com> Signed-off-by: doug thompson <norsk5@xmission.com> Signed-off-by: Pavel Machek <pavel@suse.cz> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/edac/edac_mc.h')
-rw-r--r--drivers/edac/edac_mc.h448
1 files changed, 448 insertions, 0 deletions
diff --git a/drivers/edac/edac_mc.h b/drivers/edac/edac_mc.h
new file mode 100644
index 000000000000..75ecf484a43a
--- /dev/null
+++ b/drivers/edac/edac_mc.h
@@ -0,0 +1,448 @@
1/*
2 * MC kernel module
3 * (C) 2003 Linux Networx (http://lnxi.com)
4 * This file may be distributed under the terms of the
5 * GNU General Public License.
6 *
7 * Written by Thayne Harbaugh
8 * Based on work by Dan Hollis <goemon at anime dot net> and others.
9 * http://www.anime.net/~goemon/linux-ecc/
10 *
11 * NMI handling support added by
12 * Dave Peterson <dsp@llnl.gov> <dave_peterson@pobox.com>
13 *
14 * $Id: edac_mc.h,v 1.4.2.10 2005/10/05 00:43:44 dsp_llnl Exp $
15 *
16 */
17
18
19#ifndef _EDAC_MC_H_
20#define _EDAC_MC_H_
21
22
23#include <linux/config.h>
24#include <linux/kernel.h>
25#include <linux/types.h>
26#include <linux/module.h>
27#include <linux/spinlock.h>
28#include <linux/smp.h>
29#include <linux/pci.h>
30#include <linux/time.h>
31#include <linux/nmi.h>
32#include <linux/rcupdate.h>
33#include <linux/completion.h>
34#include <linux/kobject.h>
35
36
37#define EDAC_MC_LABEL_LEN 31
38#define MC_PROC_NAME_MAX_LEN 7
39
40#if PAGE_SHIFT < 20
41#define PAGES_TO_MiB( pages ) ( ( pages ) >> ( 20 - PAGE_SHIFT ) )
42#else /* PAGE_SHIFT > 20 */
43#define PAGES_TO_MiB( pages ) ( ( pages ) << ( PAGE_SHIFT - 20 ) )
44#endif
45
46#ifdef CONFIG_EDAC_DEBUG
47extern int edac_debug_level;
48#define edac_debug_printk(level, fmt, args...) \
49do { if (level <= edac_debug_level) printk(KERN_DEBUG fmt, ##args); } while(0)
50#define debugf0( ... ) edac_debug_printk(0, __VA_ARGS__ )
51#define debugf1( ... ) edac_debug_printk(1, __VA_ARGS__ )
52#define debugf2( ... ) edac_debug_printk(2, __VA_ARGS__ )
53#define debugf3( ... ) edac_debug_printk(3, __VA_ARGS__ )
54#define debugf4( ... ) edac_debug_printk(4, __VA_ARGS__ )
55#else /* !CONFIG_EDAC_DEBUG */
56#define debugf0( ... )
57#define debugf1( ... )
58#define debugf2( ... )
59#define debugf3( ... )
60#define debugf4( ... )
61#endif /* !CONFIG_EDAC_DEBUG */
62
63
64#define bs_xstr(s) bs_str(s)
65#define bs_str(s) #s
66#define BS_MOD_STR bs_xstr(KBUILD_BASENAME)
67
68#define BIT(x) (1 << (x))
69
70#define PCI_VEND_DEV(vend, dev) PCI_VENDOR_ID_ ## vend, PCI_DEVICE_ID_ ## vend ## _ ## dev
71
72/* memory devices */
73enum dev_type {
74 DEV_UNKNOWN = 0,
75 DEV_X1,
76 DEV_X2,
77 DEV_X4,
78 DEV_X8,
79 DEV_X16,
80 DEV_X32, /* Do these parts exist? */
81 DEV_X64 /* Do these parts exist? */
82};
83
84#define DEV_FLAG_UNKNOWN BIT(DEV_UNKNOWN)
85#define DEV_FLAG_X1 BIT(DEV_X1)
86#define DEV_FLAG_X2 BIT(DEV_X2)
87#define DEV_FLAG_X4 BIT(DEV_X4)
88#define DEV_FLAG_X8 BIT(DEV_X8)
89#define DEV_FLAG_X16 BIT(DEV_X16)
90#define DEV_FLAG_X32 BIT(DEV_X32)
91#define DEV_FLAG_X64 BIT(DEV_X64)
92
93/* memory types */
94enum mem_type {
95 MEM_EMPTY = 0, /* Empty csrow */
96 MEM_RESERVED, /* Reserved csrow type */
97 MEM_UNKNOWN, /* Unknown csrow type */
98 MEM_FPM, /* Fast page mode */
99 MEM_EDO, /* Extended data out */
100 MEM_BEDO, /* Burst Extended data out */
101 MEM_SDR, /* Single data rate SDRAM */
102 MEM_RDR, /* Registered single data rate SDRAM */
103 MEM_DDR, /* Double data rate SDRAM */
104 MEM_RDDR, /* Registered Double data rate SDRAM */
105 MEM_RMBS /* Rambus DRAM */
106};
107
108#define MEM_FLAG_EMPTY BIT(MEM_EMPTY)
109#define MEM_FLAG_RESERVED BIT(MEM_RESERVED)
110#define MEM_FLAG_UNKNOWN BIT(MEM_UNKNOWN)
111#define MEM_FLAG_FPM BIT(MEM_FPM)
112#define MEM_FLAG_EDO BIT(MEM_EDO)
113#define MEM_FLAG_BEDO BIT(MEM_BEDO)
114#define MEM_FLAG_SDR BIT(MEM_SDR)
115#define MEM_FLAG_RDR BIT(MEM_RDR)
116#define MEM_FLAG_DDR BIT(MEM_DDR)
117#define MEM_FLAG_RDDR BIT(MEM_RDDR)
118#define MEM_FLAG_RMBS BIT(MEM_RMBS)
119
120
121/* chipset Error Detection and Correction capabilities and mode */
122enum edac_type {
123 EDAC_UNKNOWN = 0, /* Unknown if ECC is available */
124 EDAC_NONE, /* Doesnt support ECC */
125 EDAC_RESERVED, /* Reserved ECC type */
126 EDAC_PARITY, /* Detects parity errors */
127 EDAC_EC, /* Error Checking - no correction */
128 EDAC_SECDED, /* Single bit error correction, Double detection */
129 EDAC_S2ECD2ED, /* Chipkill x2 devices - do these exist? */
130 EDAC_S4ECD4ED, /* Chipkill x4 devices */
131 EDAC_S8ECD8ED, /* Chipkill x8 devices */
132 EDAC_S16ECD16ED, /* Chipkill x16 devices */
133};
134
135#define EDAC_FLAG_UNKNOWN BIT(EDAC_UNKNOWN)
136#define EDAC_FLAG_NONE BIT(EDAC_NONE)
137#define EDAC_FLAG_PARITY BIT(EDAC_PARITY)
138#define EDAC_FLAG_EC BIT(EDAC_EC)
139#define EDAC_FLAG_SECDED BIT(EDAC_SECDED)
140#define EDAC_FLAG_S2ECD2ED BIT(EDAC_S2ECD2ED)
141#define EDAC_FLAG_S4ECD4ED BIT(EDAC_S4ECD4ED)
142#define EDAC_FLAG_S8ECD8ED BIT(EDAC_S8ECD8ED)
143#define EDAC_FLAG_S16ECD16ED BIT(EDAC_S16ECD16ED)
144
145
146/* scrubbing capabilities */
147enum scrub_type {
148 SCRUB_UNKNOWN = 0, /* Unknown if scrubber is available */
149 SCRUB_NONE, /* No scrubber */
150 SCRUB_SW_PROG, /* SW progressive (sequential) scrubbing */
151 SCRUB_SW_SRC, /* Software scrub only errors */
152 SCRUB_SW_PROG_SRC, /* Progressive software scrub from an error */
153 SCRUB_SW_TUNABLE, /* Software scrub frequency is tunable */
154 SCRUB_HW_PROG, /* HW progressive (sequential) scrubbing */
155 SCRUB_HW_SRC, /* Hardware scrub only errors */
156 SCRUB_HW_PROG_SRC, /* Progressive hardware scrub from an error */
157 SCRUB_HW_TUNABLE /* Hardware scrub frequency is tunable */
158};
159
160#define SCRUB_FLAG_SW_PROG BIT(SCRUB_SW_PROG)
161#define SCRUB_FLAG_SW_SRC BIT(SCRUB_SW_SRC_CORR)
162#define SCRUB_FLAG_SW_PROG_SRC BIT(SCRUB_SW_PROG_SRC_CORR)
163#define SCRUB_FLAG_SW_TUN BIT(SCRUB_SW_SCRUB_TUNABLE)
164#define SCRUB_FLAG_HW_PROG BIT(SCRUB_HW_PROG)
165#define SCRUB_FLAG_HW_SRC BIT(SCRUB_HW_SRC_CORR)
166#define SCRUB_FLAG_HW_PROG_SRC BIT(SCRUB_HW_PROG_SRC_CORR)
167#define SCRUB_FLAG_HW_TUN BIT(SCRUB_HW_TUNABLE)
168
169enum mci_sysfs_status {
170 MCI_SYSFS_INACTIVE = 0, /* sysfs entries NOT registered */
171 MCI_SYSFS_ACTIVE /* sysfs entries ARE registered */
172};
173
174/* FIXME - should have notify capabilities: NMI, LOG, PROC, etc */
175
176/*
177 * There are several things to be aware of that aren't at all obvious:
178 *
179 *
180 * SOCKETS, SOCKET SETS, BANKS, ROWS, CHIP-SELECT ROWS, CHANNELS, etc..
181 *
182 * These are some of the many terms that are thrown about that don't always
183 * mean what people think they mean (Inconceivable!). In the interest of
184 * creating a common ground for discussion, terms and their definitions
185 * will be established.
186 *
187 * Memory devices: The individual chip on a memory stick. These devices
188 * commonly output 4 and 8 bits each. Grouping several
189 * of these in parallel provides 64 bits which is common
190 * for a memory stick.
191 *
192 * Memory Stick: A printed circuit board that agregates multiple
193 * memory devices in parallel. This is the atomic
194 * memory component that is purchaseable by Joe consumer
195 * and loaded into a memory socket.
196 *
197 * Socket: A physical connector on the motherboard that accepts
198 * a single memory stick.
199 *
200 * Channel: Set of memory devices on a memory stick that must be
201 * grouped in parallel with one or more additional
202 * channels from other memory sticks. This parallel
203 * grouping of the output from multiple channels are
204 * necessary for the smallest granularity of memory access.
205 * Some memory controllers are capable of single channel -
206 * which means that memory sticks can be loaded
207 * individually. Other memory controllers are only
208 * capable of dual channel - which means that memory
209 * sticks must be loaded as pairs (see "socket set").
210 *
211 * Chip-select row: All of the memory devices that are selected together.
212 * for a single, minimum grain of memory access.
213 * This selects all of the parallel memory devices across
214 * all of the parallel channels. Common chip-select rows
215 * for single channel are 64 bits, for dual channel 128
216 * bits.
217 *
218 * Single-Ranked stick: A Single-ranked stick has 1 chip-select row of memmory.
219 * Motherboards commonly drive two chip-select pins to
220 * a memory stick. A single-ranked stick, will occupy
221 * only one of those rows. The other will be unused.
222 *
223 * Double-Ranked stick: A double-ranked stick has two chip-select rows which
224 * access different sets of memory devices. The two
225 * rows cannot be accessed concurrently.
226 *
227 * Double-sided stick: DEPRECATED TERM, see Double-Ranked stick.
228 * A double-sided stick has two chip-select rows which
229 * access different sets of memory devices. The two
230 * rows cannot be accessed concurrently. "Double-sided"
231 * is irrespective of the memory devices being mounted
232 * on both sides of the memory stick.
233 *
234 * Socket set: All of the memory sticks that are required for for
235 * a single memory access or all of the memory sticks
236 * spanned by a chip-select row. A single socket set
237 * has two chip-select rows and if double-sided sticks
238 * are used these will occupy those chip-select rows.
239 *
240 * Bank: This term is avoided because it is unclear when
241 * needing to distinguish between chip-select rows and
242 * socket sets.
243 *
244 * Controller pages:
245 *
246 * Physical pages:
247 *
248 * Virtual pages:
249 *
250 *
251 * STRUCTURE ORGANIZATION AND CHOICES
252 *
253 *
254 *
255 * PS - I enjoyed writing all that about as much as you enjoyed reading it.
256 */
257
258
259struct channel_info {
260 int chan_idx; /* channel index */
261 u32 ce_count; /* Correctable Errors for this CHANNEL */
262 char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */
263 struct csrow_info *csrow; /* the parent */
264};
265
266
267struct csrow_info {
268 unsigned long first_page; /* first page number in dimm */
269 unsigned long last_page; /* last page number in dimm */
270 unsigned long page_mask; /* used for interleaving -
271 0UL for non intlv */
272 u32 nr_pages; /* number of pages in csrow */
273 u32 grain; /* granularity of reported error in bytes */
274 int csrow_idx; /* the chip-select row */
275 enum dev_type dtype; /* memory device type */
276 u32 ue_count; /* Uncorrectable Errors for this csrow */
277 u32 ce_count; /* Correctable Errors for this csrow */
278 enum mem_type mtype; /* memory csrow type */
279 enum edac_type edac_mode; /* EDAC mode for this csrow */
280 struct mem_ctl_info *mci; /* the parent */
281
282 struct kobject kobj; /* sysfs kobject for this csrow */
283
284 /* FIXME the number of CHANNELs might need to become dynamic */
285 u32 nr_channels;
286 struct channel_info *channels;
287};
288
289
290struct mem_ctl_info {
291 struct list_head link; /* for global list of mem_ctl_info structs */
292 unsigned long mtype_cap; /* memory types supported by mc */
293 unsigned long edac_ctl_cap; /* Mem controller EDAC capabilities */
294 unsigned long edac_cap; /* configuration capabilities - this is
295 closely related to edac_ctl_cap. The
296 difference is that the controller
297 may be capable of s4ecd4ed which would
298 be listed in edac_ctl_cap, but if
299 channels aren't capable of s4ecd4ed then the
300 edac_cap would not have that capability. */
301 unsigned long scrub_cap; /* chipset scrub capabilities */
302 enum scrub_type scrub_mode; /* current scrub mode */
303
304 enum mci_sysfs_status sysfs_active; /* status of sysfs */
305
306 /* pointer to edac checking routine */
307 void (*edac_check) (struct mem_ctl_info * mci);
308 /*
309 * Remaps memory pages: controller pages to physical pages.
310 * For most MC's, this will be NULL.
311 */
312 /* FIXME - why not send the phys page to begin with? */
313 unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci,
314 unsigned long page);
315 int mc_idx;
316 int nr_csrows;
317 struct csrow_info *csrows;
318 /*
319 * FIXME - what about controllers on other busses? - IDs must be
320 * unique. pdev pointer should be sufficiently unique, but
321 * BUS:SLOT.FUNC numbers may not be unique.
322 */
323 struct pci_dev *pdev;
324 const char *mod_name;
325 const char *mod_ver;
326 const char *ctl_name;
327 char proc_name[MC_PROC_NAME_MAX_LEN + 1];
328 void *pvt_info;
329 u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */
330 u32 ce_noinfo_count; /* Correctable Errors w/o info */
331 u32 ue_count; /* Total Uncorrectable Errors for this MC */
332 u32 ce_count; /* Total Correctable Errors for this MC */
333 unsigned long start_time; /* mci load start time (in jiffies) */
334
335 /* this stuff is for safe removal of mc devices from global list while
336 * NMI handlers may be traversing list
337 */
338 struct rcu_head rcu;
339 struct completion complete;
340
341 /* edac sysfs device control */
342 struct kobject edac_mci_kobj;
343};
344
345
346
347/* write all or some bits in a byte-register*/
348static inline void pci_write_bits8(struct pci_dev *pdev, int offset,
349 u8 value, u8 mask)
350{
351 if (mask != 0xff) {
352 u8 buf;
353 pci_read_config_byte(pdev, offset, &buf);
354 value &= mask;
355 buf &= ~mask;
356 value |= buf;
357 }
358 pci_write_config_byte(pdev, offset, value);
359}
360
361
362/* write all or some bits in a word-register*/
363static inline void pci_write_bits16(struct pci_dev *pdev, int offset,
364 u16 value, u16 mask)
365{
366 if (mask != 0xffff) {
367 u16 buf;
368 pci_read_config_word(pdev, offset, &buf);
369 value &= mask;
370 buf &= ~mask;
371 value |= buf;
372 }
373 pci_write_config_word(pdev, offset, value);
374}
375
376
377/* write all or some bits in a dword-register*/
378static inline void pci_write_bits32(struct pci_dev *pdev, int offset,
379 u32 value, u32 mask)
380{
381 if (mask != 0xffff) {
382 u32 buf;
383 pci_read_config_dword(pdev, offset, &buf);
384 value &= mask;
385 buf &= ~mask;
386 value |= buf;
387 }
388 pci_write_config_dword(pdev, offset, value);
389}
390
391
392#ifdef CONFIG_EDAC_DEBUG
393void edac_mc_dump_channel(struct channel_info *chan);
394void edac_mc_dump_mci(struct mem_ctl_info *mci);
395void edac_mc_dump_csrow(struct csrow_info *csrow);
396#endif /* CONFIG_EDAC_DEBUG */
397
398extern int edac_mc_add_mc(struct mem_ctl_info *mci);
399extern int edac_mc_del_mc(struct mem_ctl_info *mci);
400
401extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
402 unsigned long page);
403
404extern struct mem_ctl_info *edac_mc_find_mci_by_pdev(struct pci_dev
405 *pdev);
406
407extern void edac_mc_scrub_block(unsigned long page,
408 unsigned long offset, u32 size);
409
410/*
411 * The no info errors are used when error overflows are reported.
412 * There are a limited number of error logging registers that can
413 * be exausted. When all registers are exhausted and an additional
414 * error occurs then an error overflow register records that an
415 * error occured and the type of error, but doesn't have any
416 * further information. The ce/ue versions make for cleaner
417 * reporting logic and function interface - reduces conditional
418 * statement clutter and extra function arguments.
419 */
420extern void edac_mc_handle_ce(struct mem_ctl_info *mci,
421 unsigned long page_frame_number,
422 unsigned long offset_in_page,
423 unsigned long syndrome,
424 int row, int channel, const char *msg);
425
426extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
427 const char *msg);
428
429extern void edac_mc_handle_ue(struct mem_ctl_info *mci,
430 unsigned long page_frame_number,
431 unsigned long offset_in_page,
432 int row, const char *msg);
433
434extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
435 const char *msg);
436
437/*
438 * This kmalloc's and initializes all the structures.
439 * Can't be used if all structures don't have the same lifetime.
440 */
441extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt,
442 unsigned nr_csrows, unsigned nr_chans);
443
444/* Free an mc previously allocated by edac_mc_alloc() */
445extern void edac_mc_free(struct mem_ctl_info *mci);
446
447
448#endif /* _EDAC_MC_H_ */