Diffstat (limited to 'drivers/edac/edac_mc.c')
-rw-r--r--  drivers/edac/edac_mc.c  716
1 file changed, 470 insertions(+), 246 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index feef7733fae7..10f375032e96 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -43,9 +43,26 @@ static void edac_mc_dump_channel(struct rank_info *chan)
 {
 	debugf4("\tchannel = %p\n", chan);
 	debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx);
-	debugf4("\tchannel->ce_count = %d\n", chan->ce_count);
-	debugf4("\tchannel->label = '%s'\n", chan->label);
 	debugf4("\tchannel->csrow = %p\n\n", chan->csrow);
+	debugf4("\tchannel->dimm = %p\n", chan->dimm);
+}
+
+static void edac_mc_dump_dimm(struct dimm_info *dimm)
+{
+	int i;
+
+	debugf4("\tdimm = %p\n", dimm);
+	debugf4("\tdimm->label = '%s'\n", dimm->label);
+	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
+	debugf4("\tdimm location ");
+	for (i = 0; i < dimm->mci->n_layers; i++) {
+		printk(KERN_CONT "%d", dimm->location[i]);
+		if (i < dimm->mci->n_layers - 1)
+			printk(KERN_CONT ".");
+	}
+	printk(KERN_CONT "\n");
+	debugf4("\tdimm->grain = %d\n", dimm->grain);
+	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
 }
 
 static void edac_mc_dump_csrow(struct csrow_info *csrow)
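
The new edac_mc_dump_dimm() prints dimm->location[] as one dot-separated coordinate per configured layer, so a DIMM at channel 1, slot 0 on a two-layer controller dumps as "1.0". A minimal standalone sketch of the same rendering, assuming plain C with printf() in place of the kernel's debugf4()/printk(KERN_CONT ...):

	#include <stdio.h>

	/* Render a per-layer DIMM location the way edac_mc_dump_dimm() does. */
	static void print_location(const int *location, int n_layers)
	{
		int i;

		for (i = 0; i < n_layers; i++) {
			printf("%d", location[i]);
			if (i < n_layers - 1)
				printf(".");
		}
		printf("\n");	/* e.g. "1.0" */
	}
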
@@ -55,7 +72,6 @@ static void edac_mc_dump_csrow(struct csrow_info *csrow)
 	debugf4("\tcsrow->first_page = 0x%lx\n", csrow->first_page);
 	debugf4("\tcsrow->last_page = 0x%lx\n", csrow->last_page);
 	debugf4("\tcsrow->page_mask = 0x%lx\n", csrow->page_mask);
-	debugf4("\tcsrow->nr_pages = 0x%x\n", csrow->nr_pages);
 	debugf4("\tcsrow->nr_channels = %d\n", csrow->nr_channels);
 	debugf4("\tcsrow->channels = %p\n", csrow->channels);
 	debugf4("\tcsrow->mci = %p\n\n", csrow->mci);
@@ -70,6 +86,8 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci)
 	debugf4("\tmci->edac_check = %p\n", mci->edac_check);
 	debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
 		mci->nr_csrows, mci->csrows);
+	debugf3("\tmci->nr_dimms = %d, dimms = %p\n",
+		mci->tot_dimms, mci->dimms);
 	debugf3("\tdev = %p\n", mci->dev);
 	debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
 	debugf3("\tpvt_info = %p\n\n", mci->pvt_info);
@@ -101,18 +119,37 @@ const char *edac_mem_types[] = {
 };
 EXPORT_SYMBOL_GPL(edac_mem_types);
 
-/* 'ptr' points to a possibly unaligned item X such that sizeof(X) is 'size'.
- * Adjust 'ptr' so that its alignment is at least as stringent as what the
- * compiler would provide for X and return the aligned result.
+/**
+ * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
+ * @p: pointer to a pointer with the memory offset to be used. At
+ *	return, this will be incremented to point to the next offset
+ * @size: Size of the data structure to be reserved
+ * @n_elems: Number of elements that should be reserved
  *
  * If 'size' is a constant, the compiler will optimize this whole function
- * down to either a no-op or the addition of a constant to the value of 'ptr'.
+ * down to either a no-op or the addition of a constant to the value of '*p'.
+ *
+ * The 'p' pointer is absolutely needed to keep the proper advancing
+ * further in memory to the proper offsets when allocating the struct along
+ * with its embedded structs, as edac_device_alloc_ctl_info() does it
+ * above, for example.
+ *
+ * At return, the pointer 'p' will be incremented to be used on a next call
+ * to this function.
  */
-void *edac_align_ptr(void *ptr, unsigned size)
+void *edac_align_ptr(void **p, unsigned size, int n_elems)
 {
 	unsigned align, r;
+	void *ptr = *p;
+
+	*p += size * n_elems;
 
-	/* Here we assume that the alignment of a "long long" is the most
+	/*
+	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
+	 * 'size'.  Adjust 'p' so that its alignment is at least as
+	 * stringent as what the compiler would provide for X and return
+	 * the aligned result.
+	 * Here we assume that the alignment of a "long long" is the most
 	 * stringent alignment that the compiler will ever provide by default.
 	 * As far as I know, this is a reasonable assumption.
 	 */
@@ -132,14 +169,18 @@ void *edac_align_ptr(void *ptr, unsigned size)
 	if (r == 0)
 		return (char *)ptr;
 
+	*p += align - r;
+
 	return (void *)(((unsigned long)ptr) + align - r);
 }
 
 /**
- * edac_mc_alloc: Allocate a struct mem_ctl_info structure
- * @size_pvt:	size of private storage needed
- * @nr_csrows:	Number of CWROWS needed for this MC
- * @nr_chans:	Number of channels for the MC
+ * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
+ * @mc_num:		Memory controller number
+ * @n_layers:		Number of MC hierarchy layers
+ * layers:		Describes each layer as seen by the Memory Controller
+ * @size_pvt:		size of private storage needed
+ *
  *
  * Everything is kmalloc'ed as one big chunk - more efficient.
  * Only can be used if all structures have the same lifetime - otherwise
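
Taken together, the two hunks above turn edac_align_ptr() into an offset planner for single-shot allocations: a first pass runs against a NULL base, so every returned value is really an aligned offset and *p accumulates the total size; after a single kzalloc(), the offsets are rebased onto the real block. A self-contained userspace sketch of that pattern, with made-up struct names (align_ptr() mirrors the kernel body above but is not the kernel API):

	#include <stdlib.h>

	struct hdr  { int nr; };
	struct item { long long v; };

	/* Mirrors the edac_align_ptr() body above, using char * arithmetic. */
	static void *align_ptr(void **p, size_t size, int n_elems)
	{
		size_t align, r;
		char *ptr = *p;

		*p = (char *)*p + size * n_elems;

		if (size > sizeof(long))
			align = sizeof(long long);
		else if (size > sizeof(int))
			align = sizeof(long);
		else if (size > sizeof(short))
			align = sizeof(int);
		else if (size > sizeof(char))
			align = sizeof(short);
		else
			return ptr;

		r = (size_t)ptr % align;
		if (r == 0)
			return ptr;

		*p = (char *)*p + (align - r);
		return ptr + (align - r);
	}

	int main(void)
	{
		void *ptr = NULL;	/* pass 1: plan offsets from address 0 */
		struct hdr *h;
		struct item *it;
		size_t size;
		char *base;

		h  = align_ptr(&ptr, sizeof(*h), 1);
		it = align_ptr(&ptr, sizeof(*it), 16);
		size = (size_t)ptr;	/* total bytes, padding included */

		base = calloc(1, size);	/* one allocation for everything */
		if (!base)
			return 1;

		/* pass 2: rebase the planned offsets onto the real block */
		h  = (struct hdr *)(base + (size_t)h);
		it = (struct item *)(base + (size_t)it);

		h->nr = 16;
		it[15].v = 42;

		free(base);
		return 0;
	}

Note that the alignment padding is charged to *p as well, so a later call can never hand out bytes that an earlier aligned element already claimed.
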
@@ -147,32 +188,77 @@ void *edac_align_ptr(void *ptr, unsigned size)
  *
  * Use edac_mc_free() to free mc structures allocated by this function.
  *
+ * NOTE: drivers handle multi-rank memories in different ways: in some
+ * drivers, one multi-rank memory stick is mapped as one entry, while, in
+ * others, a single multi-rank memory stick would be mapped into several
+ * entries. Currently, this function will allocate multiple struct dimm_info
+ * on such scenarios, as grouping the multiple ranks require drivers change.
+ *
  * Returns:
- *	NULL	allocation failed
- *	struct mem_ctl_info pointer
+ *	On failure: NULL
+ *	On success: struct mem_ctl_info pointer
  */
-struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
-				unsigned nr_chans, int edac_index)
+struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
+				   unsigned n_layers,
+				   struct edac_mc_layer *layers,
+				   unsigned sz_pvt)
 {
 	struct mem_ctl_info *mci;
-	struct csrow_info *csi, *csrow;
+	struct edac_mc_layer *layer;
+	struct csrow_info *csi, *csr;
 	struct rank_info *chi, *chp, *chan;
-	void *pvt;
-	unsigned size;
-	int row, chn;
-	int err;
+	struct dimm_info *dimm;
+	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
+	unsigned pos[EDAC_MAX_LAYERS];
+	unsigned size, tot_dimms = 1, count = 1;
+	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+	void *pvt, *p, *ptr = NULL;
+	int i, j, err, row, chn, n, len;
+	bool per_rank = false;
+
+	BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
+	/*
+	 * Calculate the total amount of dimms and csrows/cschannels while
+	 * in the old API emulation mode
+	 */
+	for (i = 0; i < n_layers; i++) {
+		tot_dimms *= layers[i].size;
+		if (layers[i].is_virt_csrow)
+			tot_csrows *= layers[i].size;
+		else
+			tot_channels *= layers[i].size;
+
+		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
+			per_rank = true;
+	}
 
 	/* Figure out the offsets of the various items from the start of an mc
 	 * structure.  We want the alignment of each item to be at least as
 	 * stringent as what the compiler would provide if we could simply
 	 * hardcode everything into a single struct.
 	 */
-	mci = (struct mem_ctl_info *)0;
-	csi = edac_align_ptr(&mci[1], sizeof(*csi));
-	chi = edac_align_ptr(&csi[nr_csrows], sizeof(*chi));
-	pvt = edac_align_ptr(&chi[nr_chans * nr_csrows], sz_pvt);
+	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+	csi = edac_align_ptr(&ptr, sizeof(*csi), tot_csrows);
+	chi = edac_align_ptr(&ptr, sizeof(*chi), tot_csrows * tot_channels);
+	dimm = edac_align_ptr(&ptr, sizeof(*dimm), tot_dimms);
+	for (i = 0; i < n_layers; i++) {
+		count *= layers[i].size;
+		debugf4("%s: errcount layer %d size %d\n", __func__, i, count);
+		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
+		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
+		tot_errcount += 2 * count;
+	}
+
+	debugf4("%s: allocating %d error counters\n", __func__, tot_errcount);
+	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
 	size = ((unsigned long)pvt) + sz_pvt;
 
+	debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
+		__func__, size,
+		tot_dimms,
+		per_rank ? "ranks" : "dimms",
+		tot_csrows * tot_channels);
 	mci = kzalloc(size, GFP_KERNEL);
 	if (mci == NULL)
 		return NULL;
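
Everything above is derived from the layer descriptions: each layer's size multiplies into tot_dimms, layers flagged is_virt_csrow multiply into the emulated csrow count, and the remaining layers into the emulated channel count. A hypothetical driver-side setup for a 2-csrow x 2-channel controller (struct my_pvt stands in for the driver's private data and is not a real type):

	/* Hypothetical driver probe fragment. */
	struct edac_mc_layer layers[2];
	struct mem_ctl_info *mci;

	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
	layers[0].size = 2;			/* -> tot_csrows = 2 */
	layers[0].is_virt_csrow = true;
	layers[1].type = EDAC_MC_LAYER_CHANNEL;
	layers[1].size = 2;			/* -> tot_channels = 2 */
	layers[1].is_virt_csrow = false;

	/* tot_dimms = 2 * 2 = 4; per_rank = true (chip-select layer present) */
	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
			    sizeof(struct my_pvt));
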
@@ -180,28 +266,103 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
 	/* Adjust pointers so they point within the memory we just allocated
 	 * rather than an imaginary chunk of memory located at address 0.
 	 */
+	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
 	csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
 	chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
+	dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
+	for (i = 0; i < n_layers; i++) {
+		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
+		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
+	}
 	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
 
 	/* setup index and various internal pointers */
-	mci->mc_idx = edac_index;
+	mci->mc_idx = mc_num;
 	mci->csrows = csi;
+	mci->dimms  = dimm;
+	mci->tot_dimms = tot_dimms;
 	mci->pvt_info = pvt;
-	mci->nr_csrows = nr_csrows;
-
-	for (row = 0; row < nr_csrows; row++) {
-		csrow = &csi[row];
-		csrow->csrow_idx = row;
-		csrow->mci = mci;
-		csrow->nr_channels = nr_chans;
-		chp = &chi[row * nr_chans];
-		csrow->channels = chp;
+	mci->n_layers = n_layers;
+	mci->layers = layer;
+	memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
+	mci->nr_csrows = tot_csrows;
+	mci->num_cschannel = tot_channels;
+	mci->mem_is_per_rank = per_rank;
 
-		for (chn = 0; chn < nr_chans; chn++) {
+	/*
+	 * Fill the csrow struct
+	 */
+	for (row = 0; row < tot_csrows; row++) {
+		csr = &csi[row];
+		csr->csrow_idx = row;
+		csr->mci = mci;
+		csr->nr_channels = tot_channels;
+		chp = &chi[row * tot_channels];
+		csr->channels = chp;
+
+		for (chn = 0; chn < tot_channels; chn++) {
 			chan = &chp[chn];
 			chan->chan_idx = chn;
-			chan->csrow = csrow;
+			chan->csrow = csr;
+		}
+	}
+
+	/*
+	 * Fill the dimm struct
+	 */
+	memset(&pos, 0, sizeof(pos));
+	row = 0;
+	chn = 0;
+	debugf4("%s: initializing %d %s\n", __func__, tot_dimms,
+		per_rank ? "ranks" : "dimms");
+	for (i = 0; i < tot_dimms; i++) {
+		chan = &csi[row].channels[chn];
+		dimm = EDAC_DIMM_PTR(layer, mci->dimms, n_layers,
+			       pos[0], pos[1], pos[2]);
+		dimm->mci = mci;
+
+		debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__,
+			i, per_rank ? "rank" : "dimm", (dimm - mci->dimms),
+			pos[0], pos[1], pos[2], row, chn);
+
+		/*
+		 * Copy DIMM location and initialize it.
+		 */
+		len = sizeof(dimm->label);
+		p = dimm->label;
+		n = snprintf(p, len, "mc#%u", mc_num);
+		p += n;
+		len -= n;
+		for (j = 0; j < n_layers; j++) {
+			n = snprintf(p, len, "%s#%u",
+				     edac_layer_name[layers[j].type],
+				     pos[j]);
+			p += n;
+			len -= n;
+			dimm->location[j] = pos[j];
+
+			if (len <= 0)
+				break;
+		}
+
+		/* Link it to the csrows old API data */
+		chan->dimm = dimm;
+		dimm->csrow = row;
+		dimm->cschannel = chn;
+
+		/* Increment csrow location */
+		row++;
+		if (row == tot_csrows) {
+			row = 0;
+			chn++;
+		}
+
+		/* Increment dimm location */
+		for (j = n_layers - 1; j >= 0; j--) {
+			pos[j]++;
+			if (pos[j] < layers[j].size)
+				break;
+			pos[j] = 0;
 		}
 	}
 
@@ -490,7 +651,6 @@ EXPORT_SYMBOL(edac_mc_find);
  * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
  *	create sysfs entries associated with mci structure
  * @mci: pointer to the mci structure to be added to the list
- * @mc_idx: A unique numeric identifier to be assigned to the 'mci' structure.
  *
  * Return:
  *	0	Success
@@ -517,6 +677,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 			edac_mc_dump_channel(&mci->csrows[i].
 					     channels[j]);
 		}
+		for (i = 0; i < mci->tot_dimms; i++)
+			edac_mc_dump_dimm(&mci->dimms[i]);
 	}
 #endif
 	mutex_lock(&mem_ctls_mutex);
@@ -636,15 +798,19 @@ static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
 int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
 {
 	struct csrow_info *csrows = mci->csrows;
-	int row, i;
+	int row, i, j, n;
 
 	debugf1("MC%d: %s(): 0x%lx\n", mci->mc_idx, __func__, page);
 	row = -1;
 
 	for (i = 0; i < mci->nr_csrows; i++) {
 		struct csrow_info *csrow = &csrows[i];
-
-		if (csrow->nr_pages == 0)
+		n = 0;
+		for (j = 0; j < csrow->nr_channels; j++) {
+			struct dimm_info *dimm = csrow->channels[j].dimm;
+			n += dimm->nr_pages;
+		}
+		if (n == 0)
 			continue;
 
 		debugf3("MC%d: %s(): first(0x%lx) page(0x%lx) last(0x%lx) "
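
With nr_pages gone from struct csrow_info, a csrow's page count is now derived by summing the pages of the DIMMs on its channels, as the loop above does. The same summation as a stand-alone helper (an illustrative sketch, not part of the patch):

	static u32 csrow_nr_pages(const struct csrow_info *csrow)
	{
		int j;
		u32 n = 0;

		for (j = 0; j < csrow->nr_channels; j++)
			n += csrow->channels[j].dimm->nr_pages;
		return n;
	}
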
@@ -670,249 +836,307 @@ int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
 }
 EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
 
-/* FIXME - setable log (warning/emerg) levels */
-/* FIXME - integrate with evlog: http://evlog.sourceforge.net/ */
-void edac_mc_handle_ce(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page, unsigned long syndrome,
-		int row, int channel, const char *msg)
-{
-	unsigned long remapped_page;
+const char *edac_layer_name[] = {
+	[EDAC_MC_LAYER_BRANCH] = "branch",
+	[EDAC_MC_LAYER_CHANNEL] = "channel",
+	[EDAC_MC_LAYER_SLOT] = "slot",
+	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
+};
+EXPORT_SYMBOL_GPL(edac_layer_name);
 
-	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
+static void edac_inc_ce_error(struct mem_ctl_info *mci,
+			      bool enable_per_layer_report,
+			      const int pos[EDAC_MAX_LAYERS])
+{
+	int i, index = 0;
 
-	/* FIXME - maybe make panic on INTERNAL ERROR an option */
-	if (row >= mci->nr_csrows || row < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range "
-			"(%d >= %d)\n", row, mci->nr_csrows);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	mci->ce_mc++;
 
-	if (channel >= mci->csrows[row].nr_channels || channel < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel out of range "
-			"(%d >= %d)\n", channel,
-			mci->csrows[row].nr_channels);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
+	if (!enable_per_layer_report) {
+		mci->ce_noinfo_count++;
 		return;
 	}
 
-	if (edac_mc_get_log_ce())
-		/* FIXME - put in DIMM location */
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE page 0x%lx, offset 0x%lx, grain %d, syndrome "
-			"0x%lx, row %d, channel %d, label \"%s\": %s\n",
-			page_frame_number, offset_in_page,
-			mci->csrows[row].grain, syndrome, row, channel,
-			mci->csrows[row].channels[channel].label, msg);
-
-	mci->ce_count++;
-	mci->csrows[row].ce_count++;
-	mci->csrows[row].channels[channel].ce_count++;
-
-	if (mci->scrub_mode & SCRUB_SW_SRC) {
-		/*
-		 * Some MC's can remap memory so that it is still available
-		 * at a different address when PCI devices map into memory.
-		 * MC's that can't do this lose the memory where PCI devices
-		 * are mapped. This mapping is MC dependent and so we call
-		 * back into the MC driver for it to map the MC page to
-		 * a physical (CPU) page which can then be mapped to a virtual
-		 * page - which can then be scrubbed.
-		 */
-		remapped_page = mci->ctl_page_to_phys ?
-			mci->ctl_page_to_phys(mci, page_frame_number) :
-			page_frame_number;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			break;
+		index += pos[i];
+		mci->ce_per_layer[i][index]++;
 
-		edac_mc_scrub_block(remapped_page, offset_in_page,
-				mci->csrows[row].grain);
+		if (i < mci->n_layers - 1)
+			index *= mci->layers[i + 1].size;
 	}
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ce);
 
-void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg)
+static void edac_inc_ue_error(struct mem_ctl_info *mci,
+			      bool enable_per_layer_report,
+			      const int pos[EDAC_MAX_LAYERS])
 {
-	if (edac_mc_get_log_ce())
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE - no information available: %s\n", msg);
+	int i, index = 0;
 
-	mci->ce_noinfo_count++;
-	mci->ce_count++;
-}
-EXPORT_SYMBOL_GPL(edac_mc_handle_ce_no_info);
+	mci->ue_mc++;
 
-void edac_mc_handle_ue(struct mem_ctl_info *mci,
-			unsigned long page_frame_number,
-			unsigned long offset_in_page, int row, const char *msg)
-{
-	int len = EDAC_MC_LABEL_LEN * 4;
-	char labels[len + 1];
-	char *pos = labels;
-	int chan;
-	int chars;
-
-	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
-
-	/* FIXME - maybe make panic on INTERNAL ERROR an option */
-	if (row >= mci->nr_csrows || row < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range "
-			"(%d >= %d)\n", row, mci->nr_csrows);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
+	if (!enable_per_layer_report) {
+		mci->ce_noinfo_count++;
 		return;
 	}
 
-	chars = snprintf(pos, len + 1, "%s",
-			 mci->csrows[row].channels[0].label);
-	len -= chars;
-	pos += chars;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			break;
+		index += pos[i];
+		mci->ue_per_layer[i][index]++;
 
-	for (chan = 1; (chan < mci->csrows[row].nr_channels) && (len > 0);
-	     chan++) {
-		chars = snprintf(pos, len + 1, ":%s",
-				 mci->csrows[row].channels[chan].label);
-		len -= chars;
-		pos += chars;
+		if (i < mci->n_layers - 1)
+			index *= mci->layers[i + 1].size;
 	}
+}
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_EMERG,
-			"UE page 0x%lx, offset 0x%lx, grain %d, row %d, "
-			"labels \"%s\": %s\n", page_frame_number,
-			offset_in_page, mci->csrows[row].grain, row,
-			labels, msg);
+static void edac_ce_error(struct mem_ctl_info *mci,
+			  const int pos[EDAC_MAX_LAYERS],
+			  const char *msg,
+			  const char *location,
+			  const char *label,
+			  const char *detail,
+			  const char *other_detail,
+			  const bool enable_per_layer_report,
+			  const unsigned long page_frame_number,
+			  const unsigned long offset_in_page,
+			  u32 grain)
+{
+	unsigned long remapped_page;
 
-	if (edac_mc_get_panic_on_ue())
-		panic("EDAC MC%d: UE page 0x%lx, offset 0x%lx, grain %d, "
-			"row %d, labels \"%s\": %s\n", mci->mc_idx,
-			page_frame_number, offset_in_page,
-			mci->csrows[row].grain, row, labels, msg);
+	if (edac_mc_get_log_ce()) {
+		if (other_detail && *other_detail)
+			edac_mc_printk(mci, KERN_WARNING,
+				       "CE %s on %s (%s%s - %s)\n",
+				       msg, label, location,
+				       detail, other_detail);
+		else
+			edac_mc_printk(mci, KERN_WARNING,
+				       "CE %s on %s (%s%s)\n",
+				       msg, label, location,
+				       detail);
+	}
+	edac_inc_ce_error(mci, enable_per_layer_report, pos);
 
-	mci->ue_count++;
-	mci->csrows[row].ue_count++;
+	if (mci->scrub_mode & SCRUB_SW_SRC) {
+		/*
+		 * Some memory controllers (called MCs below) can remap
+		 * memory so that it is still available at a different
+		 * address when PCI devices map into memory.
+		 * MC's that can't do this, lose the memory where PCI
+		 * devices are mapped. This mapping is MC-dependent
+		 * and so we call back into the MC driver for it to
+		 * map the MC page to a physical (CPU) page which can
+		 * then be mapped to a virtual page - which can then
+		 * be scrubbed.
+		 */
+		remapped_page = mci->ctl_page_to_phys ?
+			mci->ctl_page_to_phys(mci, page_frame_number) :
+			page_frame_number;
+
+		edac_mc_scrub_block(remapped_page,
+				    offset_in_page, grain);
+	}
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ue);
 
-void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
+static void edac_ue_error(struct mem_ctl_info *mci,
+			  const int pos[EDAC_MAX_LAYERS],
+			  const char *msg,
+			  const char *location,
+			  const char *label,
+			  const char *detail,
+			  const char *other_detail,
+			  const bool enable_per_layer_report)
 {
-	if (edac_mc_get_panic_on_ue())
-		panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);
+	if (edac_mc_get_log_ue()) {
+		if (other_detail && *other_detail)
+			edac_mc_printk(mci, KERN_WARNING,
+				       "UE %s on %s (%s%s - %s)\n",
+				       msg, label, location, detail,
+				       other_detail);
+		else
+			edac_mc_printk(mci, KERN_WARNING,
+				       "UE %s on %s (%s%s)\n",
+				       msg, label, location, detail);
+	}
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_WARNING,
-			"UE - no information available: %s\n", msg);
-	mci->ue_noinfo_count++;
-	mci->ue_count++;
+	if (edac_mc_get_panic_on_ue()) {
+		if (other_detail && *other_detail)
+			panic("UE %s on %s (%s%s - %s)\n",
+			      msg, label, location, detail, other_detail);
+		else
+			panic("UE %s on %s (%s%s)\n",
+			      msg, label, location, detail);
+	}
+
+	edac_inc_ue_error(mci, enable_per_layer_report, pos);
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ue_no_info);
 
-/*************************************************************
- * On Fully Buffered DIMM modules, this help function is
- * called to process UE events
- */
-void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
-			unsigned int csrow,
-			unsigned int channela,
-			unsigned int channelb, char *msg)
+#define OTHER_LABEL " or "
+void edac_mc_handle_error(const enum hw_event_mc_err_type type,
+			  struct mem_ctl_info *mci,
+			  const unsigned long page_frame_number,
+			  const unsigned long offset_in_page,
+			  const unsigned long syndrome,
+			  const int layer0,
+			  const int layer1,
+			  const int layer2,
+			  const char *msg,
+			  const char *other_detail,
+			  const void *mcelog)
 {
-	int len = EDAC_MC_LABEL_LEN * 4;
-	char labels[len + 1];
-	char *pos = labels;
-	int chars;
+	/* FIXME: too much for stack: move it to some pre-alocated area */
+	char detail[80], location[80];
+	char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
+	char *p;
+	int row = -1, chan = -1;
+	int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 };
+	int i;
+	u32 grain;
+	bool enable_per_layer_report = false;
 
-	if (csrow >= mci->nr_csrows) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range (%d >= %d)\n",
-			csrow, mci->nr_csrows);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
-	if (channela >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel-a out of range "
-			"(%d >= %d)\n",
-			channela, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
+	/*
+	 * Check if the event report is consistent and if the memory
+	 * location is known. If it is known, enable_per_layer_report will be
+	 * true, the DIMM(s) label info will be filled and the per-layer
+	 * error counters will be incremented.
+	 */
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] >= (int)mci->layers[i].size) {
+			if (type == HW_EVENT_ERR_CORRECTED)
+				p = "CE";
+			else
+				p = "UE";
+
+			edac_mc_printk(mci, KERN_ERR,
+				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
+				       edac_layer_name[mci->layers[i].type],
+				       pos[i], mci->layers[i].size);
+			/*
+			 * Instead of just returning it, let's use what's
+			 * known about the error. The increment routines and
+			 * the DIMM filter logic will do the right thing by
+			 * pointing the likely damaged DIMMs.
+			 */
+			pos[i] = -1;
+		}
+		if (pos[i] >= 0)
+			enable_per_layer_report = true;
 	}
 
-	if (channelb >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel-b out of range "
-			"(%d >= %d)\n",
-			channelb, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	/*
+	 * Get the dimm label/grain that applies to the match criteria.
+	 * As the error algorithm may not be able to point to just one memory
+	 * stick, the logic here will get all possible labels that could
+	 * pottentially be affected by the error.
+	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
+	 * to have only the MC channel and the MC dimm (also called "branch")
+	 * but the channel is not known, as the memory is arranged in pairs,
+	 * where each memory belongs to a separate channel within the same
+	 * branch.
+	 */
+	grain = 0;
+	p = label;
+	*p = '\0';
+	for (i = 0; i < mci->tot_dimms; i++) {
+		struct dimm_info *dimm = &mci->dimms[i];
 
-	mci->ue_count++;
-	mci->csrows[csrow].ue_count++;
+		if (layer0 >= 0 && layer0 != dimm->location[0])
+			continue;
+		if (layer1 >= 0 && layer1 != dimm->location[1])
+			continue;
+		if (layer2 >= 0 && layer2 != dimm->location[2])
+			continue;
 
-	/* Generate the DIMM labels from the specified channels */
-	chars = snprintf(pos, len + 1, "%s",
-			 mci->csrows[csrow].channels[channela].label);
-	len -= chars;
-	pos += chars;
-	chars = snprintf(pos, len + 1, "-%s",
-			 mci->csrows[csrow].channels[channelb].label);
+		/* get the max grain, over the error match range */
+		if (dimm->grain > grain)
+			grain = dimm->grain;
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_EMERG,
-			"UE row %d, channel-a= %d channel-b= %d "
-			"labels \"%s\": %s\n", csrow, channela, channelb,
-			labels, msg);
+		/*
+		 * If the error is memory-controller wide, there's no need to
+		 * seek for the affected DIMMs because the whole
+		 * channel/memory controller/... may be affected.
+		 * Also, don't show errors for empty DIMM slots.
+		 */
+		if (enable_per_layer_report && dimm->nr_pages) {
+			if (p != label) {
+				strcpy(p, OTHER_LABEL);
+				p += strlen(OTHER_LABEL);
+			}
+			strcpy(p, dimm->label);
+			p += strlen(p);
+			*p = '\0';
+
+			/*
+			 * get csrow/channel of the DIMM, in order to allow
+			 * incrementing the compat API counters
+			 */
+			debugf4("%s: %s csrows map: (%d,%d)\n",
+				__func__,
+				mci->mem_is_per_rank ? "rank" : "dimm",
+				dimm->csrow, dimm->cschannel);
+
+			if (row == -1)
+				row = dimm->csrow;
+			else if (row >= 0 && row != dimm->csrow)
+				row = -2;
+
+			if (chan == -1)
+				chan = dimm->cschannel;
+			else if (chan >= 0 && chan != dimm->cschannel)
+				chan = -2;
+		}
+	}
 
-	if (edac_mc_get_panic_on_ue())
-		panic("UE row %d, channel-a= %d channel-b= %d "
-			"labels \"%s\": %s\n", csrow, channela,
-			channelb, labels, msg);
-}
-EXPORT_SYMBOL(edac_mc_handle_fbd_ue);
+	if (!enable_per_layer_report) {
+		strcpy(label, "any memory");
+	} else {
+		debugf4("%s: csrow/channel to increment: (%d,%d)\n",
+			__func__, row, chan);
+		if (p == label)
+			strcpy(label, "unknown memory");
+		if (type == HW_EVENT_ERR_CORRECTED) {
+			if (row >= 0) {
+				mci->csrows[row].ce_count++;
+				if (chan >= 0)
+					mci->csrows[row].channels[chan].ce_count++;
+			}
+		} else
+			if (row >= 0)
+				mci->csrows[row].ue_count++;
+	}
 
-/*************************************************************
- * On Fully Buffered DIMM modules, this help function is
- * called to process CE events
- */
-void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
-			unsigned int csrow, unsigned int channel, char *msg)
-{
+	/* Fill the RAM location data */
+	p = location;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			continue;
 
-	/* Ensure boundary values */
-	if (csrow >= mci->nr_csrows) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range (%d >= %d)\n",
-			csrow, mci->nr_csrows);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
-	if (channel >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel out of range (%d >= %d)\n",
-			channel, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
+		p += sprintf(p, "%s:%d ",
+			     edac_layer_name[mci->layers[i].type],
+			     pos[i]);
 	}
 
-	if (edac_mc_get_log_ce())
-		/* FIXME - put in DIMM location */
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE row %d, channel %d, label \"%s\": %s\n",
-			csrow, channel,
-			mci->csrows[csrow].channels[channel].label, msg);
+	/* Memory type dependent details about the error */
+	if (type == HW_EVENT_ERR_CORRECTED) {
+		snprintf(detail, sizeof(detail),
+			 "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx",
+			 page_frame_number, offset_in_page,
+			 grain, syndrome);
+		edac_ce_error(mci, pos, msg, location, label, detail,
+			      other_detail, enable_per_layer_report,
+			      page_frame_number, offset_in_page, grain);
+	} else {
+		snprintf(detail, sizeof(detail),
+			 "page:0x%lx offset:0x%lx grain:%d",
+			 page_frame_number, offset_in_page, grain);
 
-	mci->ce_count++;
-	mci->csrows[csrow].ce_count++;
-	mci->csrows[csrow].channels[channel].ce_count++;
+		edac_ue_error(mci, pos, msg, location, label, detail,
+			      other_detail, enable_per_layer_report);
+	}
 }
-EXPORT_SYMBOL(edac_mc_handle_fbd_ce);
+EXPORT_SYMBOL_GPL(edac_mc_handle_error);
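
With the old per-topology handlers folded into this single entry point, a driver reports an event by passing its layer coordinates; any layer it cannot resolve is passed as -1, which widens the DIMM match above instead of dropping the report. A hypothetical call for a corrected error at csrow 1, channel 0 on a two-layer controller (all values are made up):

	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
			     page_frame_number, offset_in_page, syndrome,
			     1,		/* layer0: csrow */
			     0,		/* layer1: channel */
			     -1,	/* layer2: not used by this MC */
			     "read error", "", NULL);

For that report, with layer sizes {2, 2}, edac_inc_ce_error() sets index = pos[0] = 1 and bumps ce_per_layer[0][1], then scales index to 1 * 2 = 2, adds pos[1] = 0, and bumps ce_per_layer[1][2], that is, the counters for csrow 1 as a whole and for the (csrow 1, channel 0) pair.
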