aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@redhat.com>2012-04-18 14:20:50 -0400
committerMauro Carvalho Chehab <mchehab@redhat.com>2012-05-28 18:10:59 -0400
commit4275be63559719c3149b19751029f1b0f1b26775 (patch)
treed215a184f4278d7bc9095f18eb4c748149e241f3 /drivers/edac
parent982216a4290543fe73ae4f0a156f3d7906bd9b73 (diff)
edac: Change internal representation to work with layers
Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core was written with the idea that memory controllers are able to directly access csrows. This is not true for FB-DIMM and RAMBUS memory controllers. Also, some recent advanced memory controllers don't present a per-csrows view. Instead, they view memories as DIMMs, instead of ranks. So, change the allocation and error report routines to allow them to work with all types of architectures. This will allow the removal of several hacks with FB-DIMM and RAMBUS memory controllers. Also, several tests were done on different platforms using different x86 drivers. TODO: multi-rank DIMMs are currently represented by multiple DIMM entries in struct dimm_info. That means that changing a label for one rank won't change the same label for the other ranks at the same DIMM. This bug has been present since the beginning of the EDAC, so it is not a big deal. However, on several drivers, it is possible to fix this issue, but it should be a per-driver fix, as the csrow => DIMM arrangement may not be equal for all. So, don't try to fix it here yet. I tried to make this patch as short as possible, preceding it with several other patches that simplified the logic here. Yet, as the internal API changes, all drivers need changes. The changes are generally bigger in the drivers for FB-DIMMs. Cc: Aristeu Rozanski <arozansk@redhat.com> Cc: Doug Thompson <norsk5@yahoo.com> Cc: Borislav Petkov <borislav.petkov@amd.com> Cc: Mark Gross <mark.gross@intel.com> Cc: Jason Uhlenkott <juhlenko@akamai.com> Cc: Tim Small <tim@buttersideup.com> Cc: Ranganathan Desikan <ravi@jetztechnologies.com> Cc: "Arvind R." 
<arvino55@gmail.com> Cc: Olof Johansson <olof@lixom.net> Cc: Egor Martovetsky <egor@pasemi.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Michal Marek <mmarek@suse.cz> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Joe Perches <joe@perches.com> Cc: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Hitoshi Mitake <h.mitake@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: "Niklas Söderlund" <niklas.soderlund@ericsson.com> Cc: Shaohui Xie <Shaohui.Xie@freescale.com> Cc: Josh Boyer <jwboyer@gmail.com> Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/edac_core.h99
-rw-r--r--drivers/edac/edac_mc.c702
2 files changed, 527 insertions, 274 deletions
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index e48ab3108ad8..1286c5e1bdc0 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -447,8 +447,12 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset,
447 447
448#endif /* CONFIG_PCI */ 448#endif /* CONFIG_PCI */
449 449
450extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, 450struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
451 unsigned nr_chans, int edac_index); 451 unsigned nr_chans, int edac_index);
452struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
453 unsigned n_layers,
454 struct edac_mc_layer *layers,
455 unsigned sz_pvt);
452extern int edac_mc_add_mc(struct mem_ctl_info *mci); 456extern int edac_mc_add_mc(struct mem_ctl_info *mci);
453extern void edac_mc_free(struct mem_ctl_info *mci); 457extern void edac_mc_free(struct mem_ctl_info *mci);
454extern struct mem_ctl_info *edac_mc_find(int idx); 458extern struct mem_ctl_info *edac_mc_find(int idx);
@@ -467,24 +471,78 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
467 * reporting logic and function interface - reduces conditional 471 * reporting logic and function interface - reduces conditional
468 * statement clutter and extra function arguments. 472 * statement clutter and extra function arguments.
469 */ 473 */
470extern void edac_mc_handle_ce(struct mem_ctl_info *mci, 474
471 unsigned long page_frame_number, 475void edac_mc_handle_error(const enum hw_event_mc_err_type type,
472 unsigned long offset_in_page, 476 struct mem_ctl_info *mci,
473 unsigned long syndrome, int row, int channel, 477 const unsigned long page_frame_number,
474 const char *msg); 478 const unsigned long offset_in_page,
475extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, 479 const unsigned long syndrome,
476 const char *msg); 480 const int layer0,
477extern void edac_mc_handle_ue(struct mem_ctl_info *mci, 481 const int layer1,
478 unsigned long page_frame_number, 482 const int layer2,
479 unsigned long offset_in_page, int row, 483 const char *msg,
480 const char *msg); 484 const char *other_detail,
481extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, 485 const void *mcelog);
482 const char *msg); 486
483extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int csrow, 487static inline void edac_mc_handle_ce(struct mem_ctl_info *mci,
484 unsigned int channel0, unsigned int channel1, 488 unsigned long page_frame_number,
485 char *msg); 489 unsigned long offset_in_page,
486extern void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci, unsigned int csrow, 490 unsigned long syndrome, int row, int channel,
487 unsigned int channel, char *msg); 491 const char *msg)
492{
493 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
494 page_frame_number, offset_in_page, syndrome,
495 row, channel, -1, msg, NULL, NULL);
496}
497
498static inline void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
499 const char *msg)
500{
501 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
502 0, 0, 0, -1, -1, -1, msg, NULL, NULL);
503}
504
505static inline void edac_mc_handle_ue(struct mem_ctl_info *mci,
506 unsigned long page_frame_number,
507 unsigned long offset_in_page, int row,
508 const char *msg)
509{
510 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
511 page_frame_number, offset_in_page, 0,
512 row, -1, -1, msg, NULL, NULL);
513}
514
515static inline void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
516 const char *msg)
517{
518 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
519 0, 0, 0, -1, -1, -1, msg, NULL, NULL);
520}
521
522static inline void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
523 unsigned int csrow,
524 unsigned int channel0,
525 unsigned int channel1,
526 char *msg)
527{
528 /*
529 *FIXME: The error can also be at channel1 (e. g. at the second
530 * channel of the same branch). The fix is to push
531 * edac_mc_handle_error() call into each driver
532 */
533 edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
534 0, 0, 0,
535 csrow, channel0, -1, msg, NULL, NULL);
536}
537
538static inline void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
539 unsigned int csrow,
540 unsigned int channel, char *msg)
541{
542 edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
543 0, 0, 0,
544 csrow, channel, -1, msg, NULL, NULL);
545}
488 546
489/* 547/*
490 * edac_device APIs 548 * edac_device APIs
@@ -496,6 +554,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
496extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, 554extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
497 int inst_nr, int block_nr, const char *msg); 555 int inst_nr, int block_nr, const char *msg);
498extern int edac_device_alloc_index(void); 556extern int edac_device_alloc_index(void);
557extern const char *edac_layer_name[];
499 558
500/* 559/*
501 * edac_pci APIs 560 * edac_pci APIs
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index ff8c0020649c..1bd237ee4ca7 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -44,9 +44,25 @@ static void edac_mc_dump_channel(struct rank_info *chan)
44 debugf4("\tchannel = %p\n", chan); 44 debugf4("\tchannel = %p\n", chan);
45 debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx); 45 debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx);
46 debugf4("\tchannel->csrow = %p\n\n", chan->csrow); 46 debugf4("\tchannel->csrow = %p\n\n", chan->csrow);
47 debugf4("\tdimm->ce_count = %d\n", chan->dimm->ce_count); 47 debugf4("\tchannel->dimm = %p\n", chan->dimm);
48 debugf4("\tdimm->label = '%s'\n", chan->dimm->label); 48}
49 debugf4("\tdimm->nr_pages = 0x%x\n", chan->dimm->nr_pages); 49
50static void edac_mc_dump_dimm(struct dimm_info *dimm)
51{
52 int i;
53
54 debugf4("\tdimm = %p\n", dimm);
55 debugf4("\tdimm->label = '%s'\n", dimm->label);
56 debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
57 debugf4("\tdimm location ");
58 for (i = 0; i < dimm->mci->n_layers; i++) {
59 printk(KERN_CONT "%d", dimm->location[i]);
60 if (i < dimm->mci->n_layers - 1)
61 printk(KERN_CONT ".");
62 }
63 printk(KERN_CONT "\n");
64 debugf4("\tdimm->grain = %d\n", dimm->grain);
65 debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
50} 66}
51 67
52static void edac_mc_dump_csrow(struct csrow_info *csrow) 68static void edac_mc_dump_csrow(struct csrow_info *csrow)
@@ -70,6 +86,8 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci)
70 debugf4("\tmci->edac_check = %p\n", mci->edac_check); 86 debugf4("\tmci->edac_check = %p\n", mci->edac_check);
71 debugf3("\tmci->nr_csrows = %d, csrows = %p\n", 87 debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
72 mci->nr_csrows, mci->csrows); 88 mci->nr_csrows, mci->csrows);
89 debugf3("\tmci->nr_dimms = %d, dimms = %p\n",
90 mci->tot_dimms, mci->dimms);
73 debugf3("\tdev = %p\n", mci->dev); 91 debugf3("\tdev = %p\n", mci->dev);
74 debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name); 92 debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
75 debugf3("\tpvt_info = %p\n\n", mci->pvt_info); 93 debugf3("\tpvt_info = %p\n\n", mci->pvt_info);
@@ -157,10 +175,12 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems)
157} 175}
158 176
159/** 177/**
160 * edac_mc_alloc: Allocate a struct mem_ctl_info structure 178 * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
161 * @size_pvt: size of private storage needed 179 * @mc_num: Memory controller number
162 * @nr_csrows: Number of CWROWS needed for this MC 180 * @n_layers: Number of MC hierarchy layers
163 * @nr_chans: Number of channels for the MC 181 * layers: Describes each layer as seen by the Memory Controller
182 * @size_pvt: size of private storage needed
183 *
164 * 184 *
165 * Everything is kmalloc'ed as one big chunk - more efficient. 185 * Everything is kmalloc'ed as one big chunk - more efficient.
166 * Only can be used if all structures have the same lifetime - otherwise 186 * Only can be used if all structures have the same lifetime - otherwise
@@ -168,22 +188,49 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems)
168 * 188 *
169 * Use edac_mc_free() to free mc structures allocated by this function. 189 * Use edac_mc_free() to free mc structures allocated by this function.
170 * 190 *
191 * NOTE: drivers handle multi-rank memories in different ways: in some
192 * drivers, one multi-rank memory stick is mapped as one entry, while, in
193 * others, a single multi-rank memory stick would be mapped into several
194 * entries. Currently, this function will allocate multiple struct dimm_info
195 * on such scenarios, as grouping the multiple ranks require drivers change.
196 *
171 * Returns: 197 * Returns:
172 * NULL allocation failed 198 * NULL allocation failed
173 * struct mem_ctl_info pointer 199 * struct mem_ctl_info pointer
174 */ 200 */
175struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, 201struct mem_ctl_info *new_edac_mc_alloc(unsigned mc_num,
176 unsigned nr_chans, int edac_index) 202 unsigned n_layers,
203 struct edac_mc_layer *layers,
204 unsigned sz_pvt)
177{ 205{
178 void *ptr = NULL;
179 struct mem_ctl_info *mci; 206 struct mem_ctl_info *mci;
180 struct csrow_info *csi, *csrow; 207 struct edac_mc_layer *layer;
208 struct csrow_info *csi, *csr;
181 struct rank_info *chi, *chp, *chan; 209 struct rank_info *chi, *chp, *chan;
182 struct dimm_info *dimm; 210 struct dimm_info *dimm;
183 void *pvt; 211 u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
184 unsigned size; 212 unsigned pos[EDAC_MAX_LAYERS];
185 int row, chn; 213 void *pvt, *ptr = NULL;
186 int err; 214 unsigned size, tot_dimms = 1, count = 1;
215 unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
216 int i, j, err, row, chn;
217 bool per_rank = false;
218
219 BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
220 /*
221 * Calculate the total amount of dimms and csrows/cschannels while
222 * in the old API emulation mode
223 */
224 for (i = 0; i < n_layers; i++) {
225 tot_dimms *= layers[i].size;
226 if (layers[i].is_virt_csrow)
227 tot_csrows *= layers[i].size;
228 else
229 tot_channels *= layers[i].size;
230
231 if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
232 per_rank = true;
233 }
187 234
188 /* Figure out the offsets of the various items from the start of an mc 235 /* Figure out the offsets of the various items from the start of an mc
189 * structure. We want the alignment of each item to be at least as 236 * structure. We want the alignment of each item to be at least as
@@ -191,12 +238,27 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
191 * hardcode everything into a single struct. 238 * hardcode everything into a single struct.
192 */ 239 */
193 mci = edac_align_ptr(&ptr, sizeof(*mci), 1); 240 mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
194 csi = edac_align_ptr(&ptr, sizeof(*csi), nr_csrows); 241 layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
195 chi = edac_align_ptr(&ptr, sizeof(*chi), nr_csrows * nr_chans); 242 csi = edac_align_ptr(&ptr, sizeof(*csi), tot_csrows);
196 dimm = edac_align_ptr(&ptr, sizeof(*dimm), nr_csrows * nr_chans); 243 chi = edac_align_ptr(&ptr, sizeof(*chi), tot_csrows * tot_channels);
244 dimm = edac_align_ptr(&ptr, sizeof(*dimm), tot_dimms);
245 for (i = 0; i < n_layers; i++) {
246 count *= layers[i].size;
247 debugf4("%s: errcount layer %d size %d\n", __func__, i, count);
248 ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
249 ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
250 tot_errcount += 2 * count;
251 }
252
253 debugf4("%s: allocating %d error counters\n", __func__, tot_errcount);
197 pvt = edac_align_ptr(&ptr, sz_pvt, 1); 254 pvt = edac_align_ptr(&ptr, sz_pvt, 1);
198 size = ((unsigned long)pvt) + sz_pvt; 255 size = ((unsigned long)pvt) + sz_pvt;
199 256
257 debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
258 __func__, size,
259 tot_dimms,
260 per_rank ? "ranks" : "dimms",
261 tot_csrows * tot_channels);
200 mci = kzalloc(size, GFP_KERNEL); 262 mci = kzalloc(size, GFP_KERNEL);
201 if (mci == NULL) 263 if (mci == NULL)
202 return NULL; 264 return NULL;
@@ -204,42 +266,87 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
204 /* Adjust pointers so they point within the memory we just allocated 266 /* Adjust pointers so they point within the memory we just allocated
205 * rather than an imaginary chunk of memory located at address 0. 267 * rather than an imaginary chunk of memory located at address 0.
206 */ 268 */
269 layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
207 csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi)); 270 csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
208 chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi)); 271 chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
209 dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm)); 272 dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
273 for (i = 0; i < n_layers; i++) {
274 mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
275 mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
276 }
210 pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; 277 pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
211 278
212 /* setup index and various internal pointers */ 279 /* setup index and various internal pointers */
213 mci->mc_idx = edac_index; 280 mci->mc_idx = mc_num;
214 mci->csrows = csi; 281 mci->csrows = csi;
215 mci->dimms = dimm; 282 mci->dimms = dimm;
283 mci->tot_dimms = tot_dimms;
216 mci->pvt_info = pvt; 284 mci->pvt_info = pvt;
217 mci->nr_csrows = nr_csrows; 285 mci->n_layers = n_layers;
286 mci->layers = layer;
287 memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
288 mci->nr_csrows = tot_csrows;
289 mci->num_cschannel = tot_channels;
290 mci->mem_is_per_rank = per_rank;
218 291
219 /* 292 /*
220 * For now, assumes that a per-csrow arrangement for dimms. 293 * Fill the csrow struct
221 * This will be latter changed.
222 */ 294 */
223 dimm = mci->dimms; 295 for (row = 0; row < tot_csrows; row++) {
224 296 csr = &csi[row];
225 for (row = 0; row < nr_csrows; row++) { 297 csr->csrow_idx = row;
226 csrow = &csi[row]; 298 csr->mci = mci;
227 csrow->csrow_idx = row; 299 csr->nr_channels = tot_channels;
228 csrow->mci = mci; 300 chp = &chi[row * tot_channels];
229 csrow->nr_channels = nr_chans; 301 csr->channels = chp;
230 chp = &chi[row * nr_chans]; 302
231 csrow->channels = chp; 303 for (chn = 0; chn < tot_channels; chn++) {
232
233 for (chn = 0; chn < nr_chans; chn++) {
234 chan = &chp[chn]; 304 chan = &chp[chn];
235 chan->chan_idx = chn; 305 chan->chan_idx = chn;
236 chan->csrow = csrow; 306 chan->csrow = csr;
307 }
308 }
309
310 /*
311 * Fill the dimm struct
312 */
313 memset(&pos, 0, sizeof(pos));
314 row = 0;
315 chn = 0;
316 debugf4("%s: initializing %d %s\n", __func__, tot_dimms,
317 per_rank ? "ranks" : "dimms");
318 for (i = 0; i < tot_dimms; i++) {
319 chan = &csi[row].channels[chn];
320 dimm = EDAC_DIMM_PTR(layer, mci->dimms, n_layers,
321 pos[0], pos[1], pos[2]);
322 dimm->mci = mci;
323
324 debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__,
325 i, per_rank ? "rank" : "dimm", (dimm - mci->dimms),
326 pos[0], pos[1], pos[2], row, chn);
327
328 /* Copy DIMM location */
329 for (j = 0; j < n_layers; j++)
330 dimm->location[j] = pos[j];
331
332 /* Link it to the csrows old API data */
333 chan->dimm = dimm;
334 dimm->csrow = row;
335 dimm->cschannel = chn;
336
337 /* Increment csrow location */
338 row++;
339 if (row == tot_csrows) {
340 row = 0;
341 chn++;
342 }
237 343
238 mci->csrows[row].channels[chn].dimm = dimm; 344 /* Increment dimm location */
239 dimm->csrow = row; 345 for (j = n_layers - 1; j >= 0; j--) {
240 dimm->csrow_channel = chn; 346 pos[j]++;
241 dimm++; 347 if (pos[j] < layers[j].size)
242 mci->nr_dimms++; 348 break;
349 pos[j] = 0;
243 } 350 }
244 } 351 }
245 352
@@ -263,6 +370,46 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
263 */ 370 */
264 return mci; 371 return mci;
265} 372}
373EXPORT_SYMBOL_GPL(new_edac_mc_alloc);
374
375/**
376 * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
377 * @mc_num: Memory controller number
378 * @n_layers: Number of layers at the MC hierarchy
379 * layers: Describes each layer as seen by the Memory Controller
380 * @size_pvt: Size of private storage needed
381 *
382 *
383 * FIXME: drivers handle multi-rank memories in different ways: some
384 * drivers map multi-ranked DIMMs as one DIMM while others
385 * as several DIMMs.
386 *
387 * Everything is kmalloc'ed as one big chunk - more efficient.
388 * It can only be used if all structures have the same lifetime - otherwise
389 * you have to allocate and initialize your own structures.
390 *
391 * Use edac_mc_free() to free mc structures allocated by this function.
392 *
393 * Returns:
394 * On failure: NULL
395 * On success: struct mem_ctl_info pointer
396 */
397
398struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
399 unsigned nr_chans, int mc_num)
400{
401 unsigned n_layers = 2;
402 struct edac_mc_layer layers[n_layers];
403
404 layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
405 layers[0].size = nr_csrows;
406 layers[0].is_virt_csrow = true;
407 layers[1].type = EDAC_MC_LAYER_CHANNEL;
408 layers[1].size = nr_chans;
409 layers[1].is_virt_csrow = false;
410
411 return new_edac_mc_alloc(mc_num, ARRAY_SIZE(layers), layers, sz_pvt);
412}
266EXPORT_SYMBOL_GPL(edac_mc_alloc); 413EXPORT_SYMBOL_GPL(edac_mc_alloc);
267 414
268/** 415/**
@@ -528,7 +675,6 @@ EXPORT_SYMBOL(edac_mc_find);
528 * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and 675 * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
529 * create sysfs entries associated with mci structure 676 * create sysfs entries associated with mci structure
530 * @mci: pointer to the mci structure to be added to the list 677 * @mci: pointer to the mci structure to be added to the list
531 * @mc_idx: A unique numeric identifier to be assigned to the 'mci' structure.
532 * 678 *
533 * Return: 679 * Return:
534 * 0 Success 680 * 0 Success
@@ -555,6 +701,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
555 edac_mc_dump_channel(&mci->csrows[i]. 701 edac_mc_dump_channel(&mci->csrows[i].
556 channels[j]); 702 channels[j]);
557 } 703 }
704 for (i = 0; i < mci->tot_dimms; i++)
705 edac_mc_dump_dimm(&mci->dimms[i]);
558 } 706 }
559#endif 707#endif
560 mutex_lock(&mem_ctls_mutex); 708 mutex_lock(&mem_ctls_mutex);
@@ -712,261 +860,307 @@ int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
712} 860}
713EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page); 861EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
714 862
715/* FIXME - setable log (warning/emerg) levels */ 863const char *edac_layer_name[] = {
716/* FIXME - integrate with evlog: http://evlog.sourceforge.net/ */ 864 [EDAC_MC_LAYER_BRANCH] = "branch",
717void edac_mc_handle_ce(struct mem_ctl_info *mci, 865 [EDAC_MC_LAYER_CHANNEL] = "channel",
718 unsigned long page_frame_number, 866 [EDAC_MC_LAYER_SLOT] = "slot",
719 unsigned long offset_in_page, unsigned long syndrome, 867 [EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
720 int row, int channel, const char *msg) 868};
869EXPORT_SYMBOL_GPL(edac_layer_name);
870
871static void edac_inc_ce_error(struct mem_ctl_info *mci,
872 bool enable_per_layer_report,
873 const int pos[EDAC_MAX_LAYERS])
721{ 874{
722 unsigned long remapped_page; 875 int i, index = 0;
723 char *label = NULL;
724 u32 grain;
725 876
726 debugf3("MC%d: %s()\n", mci->mc_idx, __func__); 877 mci->ce_count++;
727 878
728 /* FIXME - maybe make panic on INTERNAL ERROR an option */ 879 if (!enable_per_layer_report) {
729 if (row >= mci->nr_csrows || row < 0) { 880 mci->ce_noinfo_count++;
730 /* something is wrong */
731 edac_mc_printk(mci, KERN_ERR,
732 "INTERNAL ERROR: row out of range "
733 "(%d >= %d)\n", row, mci->nr_csrows);
734 edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
735 return; 881 return;
736 } 882 }
737 883
738 if (channel >= mci->csrows[row].nr_channels || channel < 0) { 884 for (i = 0; i < mci->n_layers; i++) {
739 /* something is wrong */ 885 if (pos[i] < 0)
740 edac_mc_printk(mci, KERN_ERR, 886 break;
741 "INTERNAL ERROR: channel out of range " 887 index += pos[i];
742 "(%d >= %d)\n", channel, 888 mci->ce_per_layer[i][index]++;
743 mci->csrows[row].nr_channels); 889
744 edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR"); 890 if (i < mci->n_layers - 1)
891 index *= mci->layers[i + 1].size;
892 }
893}
894
895static void edac_inc_ue_error(struct mem_ctl_info *mci,
896 bool enable_per_layer_report,
897 const int pos[EDAC_MAX_LAYERS])
898{
899 int i, index = 0;
900
901 mci->ue_count++;
902
903 if (!enable_per_layer_report) {
904 mci->ce_noinfo_count++;
745 return; 905 return;
746 } 906 }
747 907
748 label = mci->csrows[row].channels[channel].dimm->label; 908 for (i = 0; i < mci->n_layers; i++) {
749 grain = mci->csrows[row].channels[channel].dimm->grain; 909 if (pos[i] < 0)
910 break;
911 index += pos[i];
912 mci->ue_per_layer[i][index]++;
750 913
751 if (edac_mc_get_log_ce()) 914 if (i < mci->n_layers - 1)
752 /* FIXME - put in DIMM location */ 915 index *= mci->layers[i + 1].size;
753 edac_mc_printk(mci, KERN_WARNING, 916 }
754 "CE page 0x%lx, offset 0x%lx, grain %d, syndrome " 917}
755 "0x%lx, row %d, channel %d, label \"%s\": %s\n",
756 page_frame_number, offset_in_page,
757 grain, syndrome, row, channel,
758 label, msg);
759 918
760 mci->ce_count++; 919static void edac_ce_error(struct mem_ctl_info *mci,
761 mci->csrows[row].ce_count++; 920 const int pos[EDAC_MAX_LAYERS],
762 mci->csrows[row].channels[channel].dimm->ce_count++; 921 const char *msg,
763 mci->csrows[row].channels[channel].ce_count++; 922 const char *location,
923 const char *label,
924 const char *detail,
925 const char *other_detail,
926 const bool enable_per_layer_report,
927 const unsigned long page_frame_number,
928 const unsigned long offset_in_page,
929 u32 grain)
930{
931 unsigned long remapped_page;
932
933 if (edac_mc_get_log_ce()) {
934 if (other_detail && *other_detail)
935 edac_mc_printk(mci, KERN_WARNING,
936 "CE %s on %s (%s%s - %s)\n",
937 msg, label, location,
938 detail, other_detail);
939 else
940 edac_mc_printk(mci, KERN_WARNING,
941 "CE %s on %s (%s%s)\n",
942 msg, label, location,
943 detail);
944 }
945 edac_inc_ce_error(mci, enable_per_layer_report, pos);
764 946
765 if (mci->scrub_mode & SCRUB_SW_SRC) { 947 if (mci->scrub_mode & SCRUB_SW_SRC) {
766 /* 948 /*
767 * Some MC's can remap memory so that it is still available 949 * Some memory controllers (called MCs below) can remap
768 * at a different address when PCI devices map into memory. 950 * memory so that it is still available at a different
769 * MC's that can't do this lose the memory where PCI devices 951 * address when PCI devices map into memory.
770 * are mapped. This mapping is MC dependent and so we call 952 * MC's that can't do this, lose the memory where PCI
771 * back into the MC driver for it to map the MC page to 953 * devices are mapped. This mapping is MC-dependent
772 * a physical (CPU) page which can then be mapped to a virtual 954 * and so we call back into the MC driver for it to
773 * page - which can then be scrubbed. 955 * map the MC page to a physical (CPU) page which can
774 */ 956 * then be mapped to a virtual page - which can then
957 * be scrubbed.
958 */
775 remapped_page = mci->ctl_page_to_phys ? 959 remapped_page = mci->ctl_page_to_phys ?
776 mci->ctl_page_to_phys(mci, page_frame_number) : 960 mci->ctl_page_to_phys(mci, page_frame_number) :
777 page_frame_number; 961 page_frame_number;
778 962
779 edac_mc_scrub_block(remapped_page, offset_in_page, grain); 963 edac_mc_scrub_block(remapped_page,
964 offset_in_page, grain);
780 } 965 }
781} 966}
782EXPORT_SYMBOL_GPL(edac_mc_handle_ce);
783 967
784void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg) 968static void edac_ue_error(struct mem_ctl_info *mci,
969 const int pos[EDAC_MAX_LAYERS],
970 const char *msg,
971 const char *location,
972 const char *label,
973 const char *detail,
974 const char *other_detail,
975 const bool enable_per_layer_report)
785{ 976{
786 if (edac_mc_get_log_ce()) 977 if (edac_mc_get_log_ue()) {
787 edac_mc_printk(mci, KERN_WARNING, 978 if (other_detail && *other_detail)
788 "CE - no information available: %s\n", msg); 979 edac_mc_printk(mci, KERN_WARNING,
980 "UE %s on %s (%s%s - %s)\n",
981 msg, label, location, detail,
982 other_detail);
983 else
984 edac_mc_printk(mci, KERN_WARNING,
985 "UE %s on %s (%s%s)\n",
986 msg, label, location, detail);
987 }
789 988
790 mci->ce_noinfo_count++; 989 if (edac_mc_get_panic_on_ue()) {
791 mci->ce_count++; 990 if (other_detail && *other_detail)
991 panic("UE %s on %s (%s%s - %s)\n",
992 msg, label, location, detail, other_detail);
993 else
994 panic("UE %s on %s (%s%s)\n",
995 msg, label, location, detail);
996 }
997
998 edac_inc_ue_error(mci, enable_per_layer_report, pos);
792} 999}
793EXPORT_SYMBOL_GPL(edac_mc_handle_ce_no_info);
794 1000
795void edac_mc_handle_ue(struct mem_ctl_info *mci, 1001#define OTHER_LABEL " or "
796 unsigned long page_frame_number, 1002void edac_mc_handle_error(const enum hw_event_mc_err_type type,
797 unsigned long offset_in_page, int row, const char *msg) 1003 struct mem_ctl_info *mci,
1004 const unsigned long page_frame_number,
1005 const unsigned long offset_in_page,
1006 const unsigned long syndrome,
1007 const int layer0,
1008 const int layer1,
1009 const int layer2,
1010 const char *msg,
1011 const char *other_detail,
1012 const void *mcelog)
798{ 1013{
799 int len = EDAC_MC_LABEL_LEN * 4; 1014 /* FIXME: too much for stack: move it to some pre-alocated area */
800 char labels[len + 1]; 1015 char detail[80], location[80];
801 char *pos = labels; 1016 char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
802 int chan; 1017 char *p;
803 int chars; 1018 int row = -1, chan = -1;
804 char *label = NULL; 1019 int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 };
1020 int i;
805 u32 grain; 1021 u32 grain;
1022 bool enable_per_layer_report = false;
806 1023
807 debugf3("MC%d: %s()\n", mci->mc_idx, __func__); 1024 debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
808 1025
809 /* FIXME - maybe make panic on INTERNAL ERROR an option */ 1026 /*
810 if (row >= mci->nr_csrows || row < 0) { 1027 * Check if the event report is consistent and if the memory
811 /* something is wrong */ 1028 * location is known. If it is known, enable_per_layer_report will be
812 edac_mc_printk(mci, KERN_ERR, 1029 * true, the DIMM(s) label info will be filled and the per-layer
813 "INTERNAL ERROR: row out of range " 1030 * error counters will be incremented.
814 "(%d >= %d)\n", row, mci->nr_csrows); 1031 */
815 edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR"); 1032 for (i = 0; i < mci->n_layers; i++) {
816 return; 1033 if (pos[i] >= (int)mci->layers[i].size) {
817 } 1034 if (type == HW_EVENT_ERR_CORRECTED)
818 1035 p = "CE";
819 grain = mci->csrows[row].channels[0].dimm->grain; 1036 else
820 label = mci->csrows[row].channels[0].dimm->label; 1037 p = "UE";
821 chars = snprintf(pos, len + 1, "%s", label); 1038
822 len -= chars; 1039 edac_mc_printk(mci, KERN_ERR,
823 pos += chars; 1040 "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
824 1041 edac_layer_name[mci->layers[i].type],
825 for (chan = 1; (chan < mci->csrows[row].nr_channels) && (len > 0); 1042 pos[i], mci->layers[i].size);
826 chan++) { 1043 /*
827 label = mci->csrows[row].channels[chan].dimm->label; 1044 * Instead of just returning it, let's use what's
828 chars = snprintf(pos, len + 1, ":%s", label); 1045 * known about the error. The increment routines and
829 len -= chars; 1046 * the DIMM filter logic will do the right thing by
830 pos += chars; 1047 * pointing the likely damaged DIMMs.
1048 */
1049 pos[i] = -1;
1050 }
1051 if (pos[i] >= 0)
1052 enable_per_layer_report = true;
831 } 1053 }
832 1054
833 if (edac_mc_get_log_ue()) 1055 /*
834 edac_mc_printk(mci, KERN_EMERG, 1056 * Get the dimm label/grain that applies to the match criteria.
835 "UE page 0x%lx, offset 0x%lx, grain %d, row %d, " 1057 * As the error algorithm may not be able to point to just one memory
836 "labels \"%s\": %s\n", page_frame_number, 1058 * stick, the logic here will get all possible labels that could
837 offset_in_page, grain, row, labels, msg); 1059 * pottentially be affected by the error.
838 1060 * On FB-DIMM memory controllers, for uncorrected errors, it is common
839 if (edac_mc_get_panic_on_ue()) 1061 * to have only the MC channel and the MC dimm (also called "branch")
840 panic("EDAC MC%d: UE page 0x%lx, offset 0x%lx, grain %d, " 1062 * but the channel is not known, as the memory is arranged in pairs,
841 "row %d, labels \"%s\": %s\n", mci->mc_idx, 1063 * where each memory belongs to a separate channel within the same
842 page_frame_number, offset_in_page, 1064 * branch.
843 grain, row, labels, msg); 1065 */
844 1066 grain = 0;
845 mci->ue_count++; 1067 p = label;
846 mci->csrows[row].ue_count++; 1068 *p = '\0';
847} 1069 for (i = 0; i < mci->tot_dimms; i++) {
848EXPORT_SYMBOL_GPL(edac_mc_handle_ue); 1070 struct dimm_info *dimm = &mci->dimms[i];
849
850void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
851{
852 if (edac_mc_get_panic_on_ue())
853 panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);
854 1071
855 if (edac_mc_get_log_ue()) 1072 if (layer0 >= 0 && layer0 != dimm->location[0])
856 edac_mc_printk(mci, KERN_WARNING, 1073 continue;
857 "UE - no information available: %s\n", msg); 1074 if (layer1 >= 0 && layer1 != dimm->location[1])
858 mci->ue_noinfo_count++; 1075 continue;
859 mci->ue_count++; 1076 if (layer2 >= 0 && layer2 != dimm->location[2])
860} 1077 continue;
861EXPORT_SYMBOL_GPL(edac_mc_handle_ue_no_info);
862 1078
863/************************************************************* 1079 /* get the max grain, over the error match range */
864 * On Fully Buffered DIMM modules, this help function is 1080 if (dimm->grain > grain)
865 * called to process UE events 1081 grain = dimm->grain;
866 */
867void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
868 unsigned int csrow,
869 unsigned int channela,
870 unsigned int channelb, char *msg)
871{
872 int len = EDAC_MC_LABEL_LEN * 4;
873 char labels[len + 1];
874 char *pos = labels;
875 int chars;
876 char *label;
877
878 if (csrow >= mci->nr_csrows) {
879 /* something is wrong */
880 edac_mc_printk(mci, KERN_ERR,
881 "INTERNAL ERROR: row out of range (%d >= %d)\n",
882 csrow, mci->nr_csrows);
883 edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
884 return;
885 }
886 1082
887 if (channela >= mci->csrows[csrow].nr_channels) { 1083 /*
888 /* something is wrong */ 1084 * If the error is memory-controller wide, there's no need to
889 edac_mc_printk(mci, KERN_ERR, 1085 * seek for the affected DIMMs because the whole
890 "INTERNAL ERROR: channel-a out of range " 1086 * channel/memory controller/... may be affected.
891 "(%d >= %d)\n", 1087 * Also, don't show errors for empty DIMM slots.
892 channela, mci->csrows[csrow].nr_channels); 1088 */
893 edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR"); 1089 if (enable_per_layer_report && dimm->nr_pages) {
894 return; 1090 if (p != label) {
1091 strcpy(p, OTHER_LABEL);
1092 p += strlen(OTHER_LABEL);
1093 }
1094 strcpy(p, dimm->label);
1095 p += strlen(p);
1096 *p = '\0';
1097
1098 /*
1099 * get csrow/channel of the DIMM, in order to allow
1100 * incrementing the compat API counters
1101 */
1102 debugf4("%s: %s csrows map: (%d,%d)\n",
1103 __func__,
1104 mci->mem_is_per_rank ? "rank" : "dimm",
1105 dimm->csrow, dimm->cschannel);
1106
1107 if (row == -1)
1108 row = dimm->csrow;
1109 else if (row >= 0 && row != dimm->csrow)
1110 row = -2;
1111
1112 if (chan == -1)
1113 chan = dimm->cschannel;
1114 else if (chan >= 0 && chan != dimm->cschannel)
1115 chan = -2;
1116 }
895 } 1117 }
896 1118
897 if (channelb >= mci->csrows[csrow].nr_channels) { 1119 if (!enable_per_layer_report) {
898 /* something is wrong */ 1120 strcpy(label, "any memory");
899 edac_mc_printk(mci, KERN_ERR, 1121 } else {
900 "INTERNAL ERROR: channel-b out of range " 1122 debugf4("%s: csrow/channel to increment: (%d,%d)\n",
901 "(%d >= %d)\n", 1123 __func__, row, chan);
902 channelb, mci->csrows[csrow].nr_channels); 1124 if (p == label)
903 edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR"); 1125 strcpy(label, "unknown memory");
904 return; 1126 if (type == HW_EVENT_ERR_CORRECTED) {
1127 if (row >= 0) {
1128 mci->csrows[row].ce_count++;
1129 if (chan >= 0)
1130 mci->csrows[row].channels[chan].ce_count++;
1131 }
1132 } else
1133 if (row >= 0)
1134 mci->csrows[row].ue_count++;
905 } 1135 }
906 1136
907 mci->ue_count++; 1137 /* Fill the RAM location data */
908 mci->csrows[csrow].ue_count++; 1138 p = location;
909 1139 for (i = 0; i < mci->n_layers; i++) {
910 /* Generate the DIMM labels from the specified channels */ 1140 if (pos[i] < 0)
911 label = mci->csrows[csrow].channels[channela].dimm->label; 1141 continue;
912 chars = snprintf(pos, len + 1, "%s", label);
913 len -= chars;
914 pos += chars;
915
916 chars = snprintf(pos, len + 1, "-%s",
917 mci->csrows[csrow].channels[channelb].dimm->label);
918
919 if (edac_mc_get_log_ue())
920 edac_mc_printk(mci, KERN_EMERG,
921 "UE row %d, channel-a= %d channel-b= %d "
922 "labels \"%s\": %s\n", csrow, channela, channelb,
923 labels, msg);
924
925 if (edac_mc_get_panic_on_ue())
926 panic("UE row %d, channel-a= %d channel-b= %d "
927 "labels \"%s\": %s\n", csrow, channela,
928 channelb, labels, msg);
929}
930EXPORT_SYMBOL(edac_mc_handle_fbd_ue);
931
932/*************************************************************
933 * On Fully Buffered DIMM modules, this help function is
934 * called to process CE events
935 */
936void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
937 unsigned int csrow, unsigned int channel, char *msg)
938{
939 char *label = NULL;
940 1142
941 /* Ensure boundary values */ 1143 p += sprintf(p, "%s:%d ",
942 if (csrow >= mci->nr_csrows) { 1144 edac_layer_name[mci->layers[i].type],
943 /* something is wrong */ 1145 pos[i]);
944 edac_mc_printk(mci, KERN_ERR,
945 "INTERNAL ERROR: row out of range (%d >= %d)\n",
946 csrow, mci->nr_csrows);
947 edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
948 return;
949 } 1146 }
950 if (channel >= mci->csrows[csrow].nr_channels) {
951 /* something is wrong */
952 edac_mc_printk(mci, KERN_ERR,
953 "INTERNAL ERROR: channel out of range (%d >= %d)\n",
954 channel, mci->csrows[csrow].nr_channels);
955 edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
956 return;
957 }
958
959 label = mci->csrows[csrow].channels[channel].dimm->label;
960 1147
961 if (edac_mc_get_log_ce()) 1148 /* Memory type dependent details about the error */
962 /* FIXME - put in DIMM location */ 1149 if (type == HW_EVENT_ERR_CORRECTED) {
963 edac_mc_printk(mci, KERN_WARNING, 1150 snprintf(detail, sizeof(detail),
964 "CE row %d, channel %d, label \"%s\": %s\n", 1151 "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx",
965 csrow, channel, label, msg); 1152 page_frame_number, offset_in_page,
1153 grain, syndrome);
1154 edac_ce_error(mci, pos, msg, location, label, detail,
1155 other_detail, enable_per_layer_report,
1156 page_frame_number, offset_in_page, grain);
1157 } else {
1158 snprintf(detail, sizeof(detail),
1159 "page:0x%lx offset:0x%lx grain:%d",
1160 page_frame_number, offset_in_page, grain);
966 1161
967 mci->ce_count++; 1162 edac_ue_error(mci, pos, msg, location, label, detail,
968 mci->csrows[csrow].ce_count++; 1163 other_detail, enable_per_layer_report);
969 mci->csrows[csrow].channels[channel].dimm->ce_count++; 1164 }
970 mci->csrows[csrow].channels[channel].ce_count++;
971} 1165}
972EXPORT_SYMBOL(edac_mc_handle_fbd_ce); 1166EXPORT_SYMBOL_GPL(edac_mc_handle_error);