Diffstat (limited to 'drivers/edac/edac_mc.c')
-rw-r--r--  drivers/edac/edac_mc.c | 716 +++++++++++++++++++++++++++++-----------------
1 file changed, 470 insertions(+), 246 deletions(-)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index feef7733fae7..10f375032e96 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -43,9 +43,26 @@ static void edac_mc_dump_channel(struct rank_info *chan)
 {
 	debugf4("\tchannel = %p\n", chan);
 	debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx);
-	debugf4("\tchannel->ce_count = %d\n", chan->ce_count);
-	debugf4("\tchannel->label = '%s'\n", chan->label);
 	debugf4("\tchannel->csrow = %p\n\n", chan->csrow);
+	debugf4("\tchannel->dimm = %p\n", chan->dimm);
+}
+
+static void edac_mc_dump_dimm(struct dimm_info *dimm)
+{
+	int i;
+
+	debugf4("\tdimm = %p\n", dimm);
+	debugf4("\tdimm->label = '%s'\n", dimm->label);
+	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
+	debugf4("\tdimm location ");
+	for (i = 0; i < dimm->mci->n_layers; i++) {
+		printk(KERN_CONT "%d", dimm->location[i]);
+		if (i < dimm->mci->n_layers - 1)
+			printk(KERN_CONT ".");
+	}
+	printk(KERN_CONT "\n");
+	debugf4("\tdimm->grain = %d\n", dimm->grain);
+	debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
 }
 
 static void edac_mc_dump_csrow(struct csrow_info *csrow)
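For a controller described by three layers, the new location loop prints the per-layer indexes dot-separated; a DIMM at a hypothetical position would be reported as:

	dimm location 1.0.2

one index per layer, from the topmost layer down.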
@@ -55,7 +72,6 @@ static void edac_mc_dump_csrow(struct csrow_info *csrow)
 	debugf4("\tcsrow->first_page = 0x%lx\n", csrow->first_page);
 	debugf4("\tcsrow->last_page = 0x%lx\n", csrow->last_page);
 	debugf4("\tcsrow->page_mask = 0x%lx\n", csrow->page_mask);
-	debugf4("\tcsrow->nr_pages = 0x%x\n", csrow->nr_pages);
 	debugf4("\tcsrow->nr_channels = %d\n", csrow->nr_channels);
 	debugf4("\tcsrow->channels = %p\n", csrow->channels);
 	debugf4("\tcsrow->mci = %p\n\n", csrow->mci);
@@ -70,6 +86,8 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci)
 	debugf4("\tmci->edac_check = %p\n", mci->edac_check);
 	debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
 		mci->nr_csrows, mci->csrows);
+	debugf3("\tmci->nr_dimms = %d, dimms = %p\n",
+		mci->tot_dimms, mci->dimms);
 	debugf3("\tdev = %p\n", mci->dev);
 	debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
 	debugf3("\tpvt_info = %p\n\n", mci->pvt_info);
@@ -101,18 +119,37 @@ const char *edac_mem_types[] = {
 };
 EXPORT_SYMBOL_GPL(edac_mem_types);
 
-/* 'ptr' points to a possibly unaligned item X such that sizeof(X) is 'size'.
- * Adjust 'ptr' so that its alignment is at least as stringent as what the
- * compiler would provide for X and return the aligned result.
+/**
+ * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation
+ * @p:		pointer to a pointer with the memory offset to be used. At
+ *		return, this will be incremented to point to the next offset
+ * @size:	Size of the data structure to be reserved
+ * @n_elems:	Number of elements that should be reserved
  *
  * If 'size' is a constant, the compiler will optimize this whole function
- * down to either a no-op or the addition of a constant to the value of 'ptr'.
+ * down to either a no-op or the addition of a constant to the value of '*p'.
+ *
+ * The 'p' pointer is absolutely needed to keep the proper advancing
+ * further in memory to the proper offsets when allocating the struct along
+ * with its embedded structs, as edac_device_alloc_ctl_info() does it
+ * above, for example.
+ *
+ * At return, the pointer 'p' will be incremented to be used on a next call
+ * to this function.
  */
-void *edac_align_ptr(void *ptr, unsigned size)
+void *edac_align_ptr(void **p, unsigned size, int n_elems)
 {
 	unsigned align, r;
+	void *ptr = *p;
+
+	*p += size * n_elems;
 
-	/* Here we assume that the alignment of a "long long" is the most
+	/*
+	 * 'p' can possibly be an unaligned item X such that sizeof(X) is
+	 * 'size'.  Adjust 'p' so that its alignment is at least as
+	 * stringent as what the compiler would provide for X and return
+	 * the aligned result.
+	 * Here we assume that the alignment of a "long long" is the most
 	 * stringent alignment that the compiler will ever provide by default.
 	 * As far as I know, this is a reasonable assumption.
 	 */
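The reworked edac_align_ptr() now advances the caller's cursor by itself, so the usual two-pass, single-allocation idiom reads like the following sketch (not part of the patch; struct foo, struct bar and nelems are hypothetical placeholders):

	void *ptr = NULL, *base;
	struct foo *a;
	struct bar *b;
	unsigned size;

	/* pass 1: compute aligned offsets against an imaginary base at 0 */
	a = edac_align_ptr(&ptr, sizeof(*a), 1);
	b = edac_align_ptr(&ptr, sizeof(*b), nelems);
	size = ((unsigned long)b) + nelems * sizeof(*b);

	base = kzalloc(size, GFP_KERNEL);

	/* pass 2: relocate the offsets into the real allocation */
	a = (struct foo *)(((char *)base) + ((unsigned long)a));
	b = (struct bar *)(((char *)base) + ((unsigned long)b));

This mirrors what edac_mc_alloc() does below.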
@@ -132,14 +169,18 @@ void *edac_align_ptr(void *ptr, unsigned size)
 	if (r == 0)
 		return (char *)ptr;
 
+	*p += align - r;
+
 	return (void *)(((unsigned long)ptr) + align - r);
 }
 
 /**
- * edac_mc_alloc: Allocate a struct mem_ctl_info structure
- * @size_pvt:	size of private storage needed
- * @nr_csrows:	Number of CWROWS needed for this MC
- * @nr_chans:	Number of channels for the MC
+ * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
+ * @mc_num:		Memory controller number
+ * @n_layers:		Number of MC hierarchy layers
+ * layers:		Describes each layer as seen by the Memory Controller
+ * @size_pvt:		size of private storage needed
+ *
  *
  * Everything is kmalloc'ed as one big chunk - more efficient.
  * Only can be used if all structures have the same lifetime - otherwise
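Under the new signature, a driver first describes its hierarchy and then allocates. A minimal sketch of a caller (not from this patch; the 8x2 geometry and struct mydrv_pvt are invented for illustration):

	struct edac_mc_layer layers[2];
	struct mem_ctl_info *mci;

	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
	layers[0].size = 8;			/* 8 chip-select rows */
	layers[0].is_virt_csrow = true;		/* maps onto the legacy csrow */
	layers[1].type = EDAC_MC_LAYER_CHANNEL;
	layers[1].size = 2;			/* 2 channels per row */
	layers[1].is_virt_csrow = false;

	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers,
			    sizeof(struct mydrv_pvt));
	if (!mci)
		return -ENOMEM;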
@@ -147,32 +188,77 @@ void *edac_align_ptr(void *ptr, unsigned size)
  *
  * Use edac_mc_free() to free mc structures allocated by this function.
  *
+ * NOTE: drivers handle multi-rank memories in different ways: in some
+ * drivers, one multi-rank memory stick is mapped as one entry, while, in
+ * others, a single multi-rank memory stick would be mapped into several
+ * entries. Currently, this function will allocate multiple struct dimm_info
+ * on such scenarios, as grouping the multiple ranks require drivers change.
+ *
  * Returns:
- *	NULL	allocation failed
- *	struct mem_ctl_info pointer
+ *	On failure: NULL
+ *	On success: struct mem_ctl_info pointer
  */
-struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
-				unsigned nr_chans, int edac_index)
+struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
+				   unsigned n_layers,
+				   struct edac_mc_layer *layers,
+				   unsigned sz_pvt)
 {
 	struct mem_ctl_info *mci;
-	struct csrow_info *csi, *csrow;
+	struct edac_mc_layer *layer;
+	struct csrow_info *csi, *csr;
 	struct rank_info *chi, *chp, *chan;
-	void *pvt;
-	unsigned size;
-	int row, chn;
-	int err;
+	struct dimm_info *dimm;
+	u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
+	unsigned pos[EDAC_MAX_LAYERS];
+	unsigned size, tot_dimms = 1, count = 1;
+	unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
+	void *pvt, *p, *ptr = NULL;
+	int i, j, err, row, chn, n, len;
+	bool per_rank = false;
+
+	BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
+	/*
+	 * Calculate the total amount of dimms and csrows/cschannels while
+	 * in the old API emulation mode
+	 */
+	for (i = 0; i < n_layers; i++) {
+		tot_dimms *= layers[i].size;
+		if (layers[i].is_virt_csrow)
+			tot_csrows *= layers[i].size;
+		else
+			tot_channels *= layers[i].size;
+
+		if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
+			per_rank = true;
+	}
 
 	/* Figure out the offsets of the various items from the start of an mc
 	 * structure.  We want the alignment of each item to be at least as
 	 * stringent as what the compiler would provide if we could simply
 	 * hardcode everything into a single struct.
 	 */
-	mci = (struct mem_ctl_info *)0;
-	csi = edac_align_ptr(&mci[1], sizeof(*csi));
-	chi = edac_align_ptr(&csi[nr_csrows], sizeof(*chi));
-	pvt = edac_align_ptr(&chi[nr_chans * nr_csrows], sz_pvt);
+	mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
+	layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
+	csi = edac_align_ptr(&ptr, sizeof(*csi), tot_csrows);
+	chi = edac_align_ptr(&ptr, sizeof(*chi), tot_csrows * tot_channels);
+	dimm = edac_align_ptr(&ptr, sizeof(*dimm), tot_dimms);
+	for (i = 0; i < n_layers; i++) {
+		count *= layers[i].size;
+		debugf4("%s: errcount layer %d size %d\n", __func__, i, count);
+		ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
+		ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
+		tot_errcount += 2 * count;
+	}
+
+	debugf4("%s: allocating %d error counters\n", __func__, tot_errcount);
+	pvt = edac_align_ptr(&ptr, sz_pvt, 1);
 	size = ((unsigned long)pvt) + sz_pvt;
 
+	debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
+		__func__, size,
+		tot_dimms,
+		per_rank ? "ranks" : "dimms",
+		tot_csrows * tot_channels);
 	mci = kzalloc(size, GFP_KERNEL);
 	if (mci == NULL)
 		return NULL;
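For the hypothetical 8x2 layout above, the sizing pass yields tot_dimms = 16, tot_csrows = 8, tot_channels = 2 and per_rank = true; the counter loop reserves 8 CE/UE counters for layer 0 and 16 for layer 1 (tot_errcount = 48), all carved out of the single kzalloc() together with the mci, layer, csrow, rank and dimm arrays.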
@@ -180,28 +266,103 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
 	/* Adjust pointers so they point within the memory we just allocated
 	 * rather than an imaginary chunk of memory located at address 0.
 	 */
+	layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
 	csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
 	chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
+	dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
+	for (i = 0; i < n_layers; i++) {
+		mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
+		mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
+	}
 	pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
 
 	/* setup index and various internal pointers */
-	mci->mc_idx = edac_index;
+	mci->mc_idx = mc_num;
 	mci->csrows = csi;
+	mci->dimms = dimm;
+	mci->tot_dimms = tot_dimms;
 	mci->pvt_info = pvt;
-	mci->nr_csrows = nr_csrows;
-
-	for (row = 0; row < nr_csrows; row++) {
-		csrow = &csi[row];
-		csrow->csrow_idx = row;
-		csrow->mci = mci;
-		csrow->nr_channels = nr_chans;
-		chp = &chi[row * nr_chans];
-		csrow->channels = chp;
+	mci->n_layers = n_layers;
+	mci->layers = layer;
+	memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
+	mci->nr_csrows = tot_csrows;
+	mci->num_cschannel = tot_channels;
+	mci->mem_is_per_rank = per_rank;
 
-		for (chn = 0; chn < nr_chans; chn++) {
+	/*
+	 * Fill the csrow struct
+	 */
+	for (row = 0; row < tot_csrows; row++) {
+		csr = &csi[row];
+		csr->csrow_idx = row;
+		csr->mci = mci;
+		csr->nr_channels = tot_channels;
+		chp = &chi[row * tot_channels];
+		csr->channels = chp;
+
+		for (chn = 0; chn < tot_channels; chn++) {
 			chan = &chp[chn];
 			chan->chan_idx = chn;
-			chan->csrow = csrow;
+			chan->csrow = csr;
+		}
+	}
+
+	/*
+	 * Fill the dimm struct
+	 */
+	memset(&pos, 0, sizeof(pos));
+	row = 0;
+	chn = 0;
+	debugf4("%s: initializing %d %s\n", __func__, tot_dimms,
+		per_rank ? "ranks" : "dimms");
+	for (i = 0; i < tot_dimms; i++) {
+		chan = &csi[row].channels[chn];
+		dimm = EDAC_DIMM_PTR(layer, mci->dimms, n_layers,
+			       pos[0], pos[1], pos[2]);
+		dimm->mci = mci;
+
+		debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__,
+			i, per_rank ? "rank" : "dimm", (dimm - mci->dimms),
+			pos[0], pos[1], pos[2], row, chn);
+
+		/*
+		 * Copy DIMM location and initialize it.
+		 */
+		len = sizeof(dimm->label);
+		p = dimm->label;
+		n = snprintf(p, len, "mc#%u", mc_num);
+		p += n;
+		len -= n;
+		for (j = 0; j < n_layers; j++) {
+			n = snprintf(p, len, "%s#%u",
+				     edac_layer_name[layers[j].type],
+				     pos[j]);
+			p += n;
+			len -= n;
+			dimm->location[j] = pos[j];
+
+			if (len <= 0)
+				break;
+		}
+
+		/* Link it to the csrows old API data */
+		chan->dimm = dimm;
+		dimm->csrow = row;
+		dimm->cschannel = chn;
+
+		/* Increment csrow location */
+		row++;
+		if (row == tot_csrows) {
+			row = 0;
+			chn++;
+		}
+
+		/* Increment dimm location */
+		for (j = n_layers - 1; j >= 0; j--) {
+			pos[j]++;
+			if (pos[j] < layers[j].size)
+				break;
+			pos[j] = 0;
 		}
 	}
 
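Continuing the 8x2 example: the fill loop advances the two numbering schemes side by side, so dimm 0 gets location 0.0 and csrow 0/channel 0, dimm 1 gets location 0.1 and csrow 1/channel 0, and dimm 8 gets location 4.0 and csrow 0/channel 1; the location counter moves last-layer-first, while the legacy pair moves csrow-first. The label built for dimm 8 would be "mc#0csrow#4channel#0".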
@@ -490,7 +651,6 @@ EXPORT_SYMBOL(edac_mc_find);
  * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
  *	create sysfs entries associated with mci structure
  * @mci: pointer to the mci structure to be added to the list
- * @mc_idx: A unique numeric identifier to be assigned to the 'mci' structure.
  *
  * Return:
  *	0	Success
@@ -517,6 +677,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 				edac_mc_dump_channel(&mci->csrows[i].
 						channels[j]);
 		}
+		for (i = 0; i < mci->tot_dimms; i++)
+			edac_mc_dump_dimm(&mci->dimms[i]);
 	}
 #endif
 	mutex_lock(&mem_ctls_mutex);
@@ -636,15 +798,19 @@ static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
 int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
 {
 	struct csrow_info *csrows = mci->csrows;
-	int row, i;
+	int row, i, j, n;
 
 	debugf1("MC%d: %s(): 0x%lx\n", mci->mc_idx, __func__, page);
 	row = -1;
 
 	for (i = 0; i < mci->nr_csrows; i++) {
 		struct csrow_info *csrow = &csrows[i];
-
-		if (csrow->nr_pages == 0)
+		n = 0;
+		for (j = 0; j < csrow->nr_channels; j++) {
+			struct dimm_info *dimm = csrow->channels[j].dimm;
+			n += dimm->nr_pages;
+		}
+		if (n == 0)
 			continue;
 
 		debugf3("MC%d: %s(): first(0x%lx) page(0x%lx) last(0x%lx) "
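Since csrow->nr_pages is gone, the per-csrow page count is now derived on demand from the attached DIMMs. A hypothetical helper equivalent to the loop inlined above:

	static inline u32 csrow_nr_pages(struct csrow_info *csrow)
	{
		u32 n = 0;
		int j;

		for (j = 0; j < csrow->nr_channels; j++)
			n += csrow->channels[j].dimm->nr_pages;
		return n;
	}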
@@ -670,249 +836,307 @@ int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page)
 }
 EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
 
-/* FIXME - setable log (warning/emerg) levels */
-/* FIXME - integrate with evlog: http://evlog.sourceforge.net/ */
-void edac_mc_handle_ce(struct mem_ctl_info *mci,
-		unsigned long page_frame_number,
-		unsigned long offset_in_page, unsigned long syndrome,
-		int row, int channel, const char *msg)
-{
-	unsigned long remapped_page;
+const char *edac_layer_name[] = {
+	[EDAC_MC_LAYER_BRANCH] = "branch",
+	[EDAC_MC_LAYER_CHANNEL] = "channel",
+	[EDAC_MC_LAYER_SLOT] = "slot",
+	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
+};
+EXPORT_SYMBOL_GPL(edac_layer_name);
 
-	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
+static void edac_inc_ce_error(struct mem_ctl_info *mci,
+			      bool enable_per_layer_report,
+			      const int pos[EDAC_MAX_LAYERS])
+{
+	int i, index = 0;
 
-	/* FIXME - maybe make panic on INTERNAL ERROR an option */
-	if (row >= mci->nr_csrows || row < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range "
-			"(%d >= %d)\n", row, mci->nr_csrows);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	mci->ce_mc++;
 
-	if (channel >= mci->csrows[row].nr_channels || channel < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel out of range "
-			"(%d >= %d)\n", channel,
-			mci->csrows[row].nr_channels);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
+	if (!enable_per_layer_report) {
+		mci->ce_noinfo_count++;
 		return;
 	}
 
-	if (edac_mc_get_log_ce())
-		/* FIXME - put in DIMM location */
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE page 0x%lx, offset 0x%lx, grain %d, syndrome "
-			"0x%lx, row %d, channel %d, label \"%s\": %s\n",
-			page_frame_number, offset_in_page,
-			mci->csrows[row].grain, syndrome, row, channel,
-			mci->csrows[row].channels[channel].label, msg);
-
-	mci->ce_count++;
-	mci->csrows[row].ce_count++;
-	mci->csrows[row].channels[channel].ce_count++;
-
-	if (mci->scrub_mode & SCRUB_SW_SRC) {
-		/*
-		 * Some MC's can remap memory so that it is still available
-		 * at a different address when PCI devices map into memory.
-		 * MC's that can't do this lose the memory where PCI devices
-		 * are mapped. This mapping is MC dependent and so we call
-		 * back into the MC driver for it to map the MC page to
-		 * a physical (CPU) page which can then be mapped to a virtual
-		 * page - which can then be scrubbed.
-		 */
-		remapped_page = mci->ctl_page_to_phys ?
-			mci->ctl_page_to_phys(mci, page_frame_number) :
-			page_frame_number;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			break;
+		index += pos[i];
+		mci->ce_per_layer[i][index]++;
 
-		edac_mc_scrub_block(remapped_page, offset_in_page,
-				mci->csrows[row].grain);
+		if (i < mci->n_layers - 1)
+			index *= mci->layers[i + 1].size;
 	}
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ce);
 
-void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg)
+static void edac_inc_ue_error(struct mem_ctl_info *mci,
+			      bool enable_per_layer_report,
+			      const int pos[EDAC_MAX_LAYERS])
 {
-	if (edac_mc_get_log_ce())
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE - no information available: %s\n", msg);
+	int i, index = 0;
 
-	mci->ce_noinfo_count++;
-	mci->ce_count++;
-}
-EXPORT_SYMBOL_GPL(edac_mc_handle_ce_no_info);
+	mci->ue_mc++;
 
-void edac_mc_handle_ue(struct mem_ctl_info *mci,
-			unsigned long page_frame_number,
-			unsigned long offset_in_page, int row, const char *msg)
-{
-	int len = EDAC_MC_LABEL_LEN * 4;
-	char labels[len + 1];
-	char *pos = labels;
-	int chan;
-	int chars;
-
-	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
-
-	/* FIXME - maybe make panic on INTERNAL ERROR an option */
-	if (row >= mci->nr_csrows || row < 0) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range "
-			"(%d >= %d)\n", row, mci->nr_csrows);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
+	if (!enable_per_layer_report) {
+		mci->ce_noinfo_count++;
 		return;
 	}
 
-	chars = snprintf(pos, len + 1, "%s",
-			 mci->csrows[row].channels[0].label);
-	len -= chars;
-	pos += chars;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			break;
+		index += pos[i];
+		mci->ue_per_layer[i][index]++;
 
-	for (chan = 1; (chan < mci->csrows[row].nr_channels) && (len > 0);
-	     chan++) {
-		chars = snprintf(pos, len + 1, ":%s",
-				 mci->csrows[row].channels[chan].label);
-		len -= chars;
-		pos += chars;
+		if (i < mci->n_layers - 1)
+			index *= mci->layers[i + 1].size;
 	}
+}
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_EMERG,
-			"UE page 0x%lx, offset 0x%lx, grain %d, row %d, "
-			"labels \"%s\": %s\n", page_frame_number,
-			offset_in_page, mci->csrows[row].grain, row,
-			labels, msg);
+static void edac_ce_error(struct mem_ctl_info *mci,
+			  const int pos[EDAC_MAX_LAYERS],
+			  const char *msg,
+			  const char *location,
+			  const char *label,
+			  const char *detail,
+			  const char *other_detail,
+			  const bool enable_per_layer_report,
+			  const unsigned long page_frame_number,
+			  const unsigned long offset_in_page,
+			  u32 grain)
+{
+	unsigned long remapped_page;
 
-	if (edac_mc_get_panic_on_ue())
-		panic("EDAC MC%d: UE page 0x%lx, offset 0x%lx, grain %d, "
-			"row %d, labels \"%s\": %s\n", mci->mc_idx,
-			page_frame_number, offset_in_page,
-			mci->csrows[row].grain, row, labels, msg);
+	if (edac_mc_get_log_ce()) {
+		if (other_detail && *other_detail)
+			edac_mc_printk(mci, KERN_WARNING,
+				       "CE %s on %s (%s%s - %s)\n",
+				       msg, label, location,
+				       detail, other_detail);
+		else
+			edac_mc_printk(mci, KERN_WARNING,
+				       "CE %s on %s (%s%s)\n",
+				       msg, label, location,
+				       detail);
+	}
+	edac_inc_ce_error(mci, enable_per_layer_report, pos);
 
-	mci->ue_count++;
-	mci->csrows[row].ue_count++;
+	if (mci->scrub_mode & SCRUB_SW_SRC) {
+		/*
+		 * Some memory controllers (called MCs below) can remap
+		 * memory so that it is still available at a different
+		 * address when PCI devices map into memory.
+		 * MC's that can't do this, lose the memory where PCI
+		 * devices are mapped. This mapping is MC-dependent
+		 * and so we call back into the MC driver for it to
+		 * map the MC page to a physical (CPU) page which can
+		 * then be mapped to a virtual page - which can then
+		 * be scrubbed.
+		 */
+		remapped_page = mci->ctl_page_to_phys ?
+			mci->ctl_page_to_phys(mci, page_frame_number) :
+			page_frame_number;
+
+		edac_mc_scrub_block(remapped_page,
+				    offset_in_page, grain);
+	}
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ue);
 
-void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
+static void edac_ue_error(struct mem_ctl_info *mci,
+			  const int pos[EDAC_MAX_LAYERS],
+			  const char *msg,
+			  const char *location,
+			  const char *label,
+			  const char *detail,
+			  const char *other_detail,
+			  const bool enable_per_layer_report)
 {
-	if (edac_mc_get_panic_on_ue())
-		panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);
+	if (edac_mc_get_log_ue()) {
+		if (other_detail && *other_detail)
+			edac_mc_printk(mci, KERN_WARNING,
+				       "UE %s on %s (%s%s - %s)\n",
+				       msg, label, location, detail,
+				       other_detail);
+		else
+			edac_mc_printk(mci, KERN_WARNING,
+				       "UE %s on %s (%s%s)\n",
+				       msg, label, location, detail);
+	}
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_WARNING,
-			"UE - no information available: %s\n", msg);
-	mci->ue_noinfo_count++;
-	mci->ue_count++;
+	if (edac_mc_get_panic_on_ue()) {
+		if (other_detail && *other_detail)
+			panic("UE %s on %s (%s%s - %s)\n",
+			      msg, label, location, detail, other_detail);
+		else
+			panic("UE %s on %s (%s%s)\n",
+			      msg, label, location, detail);
+	}
+
+	edac_inc_ue_error(mci, enable_per_layer_report, pos);
 }
-EXPORT_SYMBOL_GPL(edac_mc_handle_ue_no_info);
 
-/*************************************************************
- * On Fully Buffered DIMM modules, this help function is
- * called to process UE events
- */
-void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
-			unsigned int csrow,
-			unsigned int channela,
-			unsigned int channelb, char *msg)
+#define OTHER_LABEL " or "
+void edac_mc_handle_error(const enum hw_event_mc_err_type type,
+			  struct mem_ctl_info *mci,
+			  const unsigned long page_frame_number,
+			  const unsigned long offset_in_page,
+			  const unsigned long syndrome,
+			  const int layer0,
+			  const int layer1,
+			  const int layer2,
+			  const char *msg,
+			  const char *other_detail,
+			  const void *mcelog)
 {
-	int len = EDAC_MC_LABEL_LEN * 4;
-	char labels[len + 1];
-	char *pos = labels;
-	int chars;
+	/* FIXME: too much for stack: move it to some pre-alocated area */
+	char detail[80], location[80];
+	char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
+	char *p;
+	int row = -1, chan = -1;
+	int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 };
+	int i;
+	u32 grain;
+	bool enable_per_layer_report = false;
 
-	if (csrow >= mci->nr_csrows) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range (%d >= %d)\n",
-			csrow, mci->nr_csrows);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
-	if (channela >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel-a out of range "
-			"(%d >= %d)\n",
-			channela, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
+	/*
+	 * Check if the event report is consistent and if the memory
+	 * location is known. If it is known, enable_per_layer_report will be
+	 * true, the DIMM(s) label info will be filled and the per-layer
+	 * error counters will be incremented.
+	 */
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] >= (int)mci->layers[i].size) {
+			if (type == HW_EVENT_ERR_CORRECTED)
+				p = "CE";
+			else
+				p = "UE";
+
+			edac_mc_printk(mci, KERN_ERR,
+				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
+				       edac_layer_name[mci->layers[i].type],
+				       pos[i], mci->layers[i].size);
+			/*
+			 * Instead of just returning it, let's use what's
+			 * known about the error. The increment routines and
+			 * the DIMM filter logic will do the right thing by
+			 * pointing the likely damaged DIMMs.
+			 */
+			pos[i] = -1;
+		}
+		if (pos[i] >= 0)
+			enable_per_layer_report = true;
 	}
 
-	if (channelb >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel-b out of range "
-			"(%d >= %d)\n",
-			channelb, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
+	/*
+	 * Get the dimm label/grain that applies to the match criteria.
+	 * As the error algorithm may not be able to point to just one memory
+	 * stick, the logic here will get all possible labels that could
+	 * pottentially be affected by the error.
+	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
+	 * to have only the MC channel and the MC dimm (also called "branch")
+	 * but the channel is not known, as the memory is arranged in pairs,
+	 * where each memory belongs to a separate channel within the same
+	 * branch.
+	 */
+	grain = 0;
+	p = label;
+	*p = '\0';
+	for (i = 0; i < mci->tot_dimms; i++) {
+		struct dimm_info *dimm = &mci->dimms[i];
 
-	mci->ue_count++;
-	mci->csrows[csrow].ue_count++;
+		if (layer0 >= 0 && layer0 != dimm->location[0])
+			continue;
+		if (layer1 >= 0 && layer1 != dimm->location[1])
+			continue;
+		if (layer2 >= 0 && layer2 != dimm->location[2])
+			continue;
 
-	/* Generate the DIMM labels from the specified channels */
-	chars = snprintf(pos, len + 1, "%s",
-			 mci->csrows[csrow].channels[channela].label);
-	len -= chars;
-	pos += chars;
-	chars = snprintf(pos, len + 1, "-%s",
-			 mci->csrows[csrow].channels[channelb].label);
+		/* get the max grain, over the error match range */
+		if (dimm->grain > grain)
+			grain = dimm->grain;
 
-	if (edac_mc_get_log_ue())
-		edac_mc_printk(mci, KERN_EMERG,
-			"UE row %d, channel-a= %d channel-b= %d "
-			"labels \"%s\": %s\n", csrow, channela, channelb,
-			labels, msg);
+		/*
+		 * If the error is memory-controller wide, there's no need to
+		 * seek for the affected DIMMs because the whole
+		 * channel/memory controller/... may be affected.
+		 * Also, don't show errors for empty DIMM slots.
+		 */
+		if (enable_per_layer_report && dimm->nr_pages) {
+			if (p != label) {
+				strcpy(p, OTHER_LABEL);
+				p += strlen(OTHER_LABEL);
+			}
+			strcpy(p, dimm->label);
+			p += strlen(p);
+			*p = '\0';
+
+			/*
+			 * get csrow/channel of the DIMM, in order to allow
+			 * incrementing the compat API counters
+			 */
+			debugf4("%s: %s csrows map: (%d,%d)\n",
+				__func__,
+				mci->mem_is_per_rank ? "rank" : "dimm",
+				dimm->csrow, dimm->cschannel);
+
+			if (row == -1)
+				row = dimm->csrow;
+			else if (row >= 0 && row != dimm->csrow)
+				row = -2;
+
+			if (chan == -1)
+				chan = dimm->cschannel;
+			else if (chan >= 0 && chan != dimm->cschannel)
+				chan = -2;
+		}
+	}
 
-	if (edac_mc_get_panic_on_ue())
-		panic("UE row %d, channel-a= %d channel-b= %d "
-			"labels \"%s\": %s\n", csrow, channela,
-			channelb, labels, msg);
-}
-EXPORT_SYMBOL(edac_mc_handle_fbd_ue);
+	if (!enable_per_layer_report) {
+		strcpy(label, "any memory");
+	} else {
+		debugf4("%s: csrow/channel to increment: (%d,%d)\n",
+			__func__, row, chan);
+		if (p == label)
+			strcpy(label, "unknown memory");
+		if (type == HW_EVENT_ERR_CORRECTED) {
+			if (row >= 0) {
+				mci->csrows[row].ce_count++;
+				if (chan >= 0)
+					mci->csrows[row].channels[chan].ce_count++;
+			}
+		} else
+			if (row >= 0)
+				mci->csrows[row].ue_count++;
+	}
 
-/*************************************************************
- * On Fully Buffered DIMM modules, this help function is
- * called to process CE events
- */
-void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
-			unsigned int csrow, unsigned int channel, char *msg)
-{
+	/* Fill the RAM location data */
+	p = location;
+	for (i = 0; i < mci->n_layers; i++) {
+		if (pos[i] < 0)
+			continue;
 
-	/* Ensure boundary values */
-	if (csrow >= mci->nr_csrows) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: row out of range (%d >= %d)\n",
-			csrow, mci->nr_csrows);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
-	}
-	if (channel >= mci->csrows[csrow].nr_channels) {
-		/* something is wrong */
-		edac_mc_printk(mci, KERN_ERR,
-			"INTERNAL ERROR: channel out of range (%d >= %d)\n",
-			channel, mci->csrows[csrow].nr_channels);
-		edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
-		return;
+		p += sprintf(p, "%s:%d ",
+			     edac_layer_name[mci->layers[i].type],
+			     pos[i]);
 	}
 
-	if (edac_mc_get_log_ce())
-		/* FIXME - put in DIMM location */
-		edac_mc_printk(mci, KERN_WARNING,
-			"CE row %d, channel %d, label \"%s\": %s\n",
-			csrow, channel,
-			mci->csrows[csrow].channels[channel].label, msg);
+	/* Memory type dependent details about the error */
+	if (type == HW_EVENT_ERR_CORRECTED) {
+		snprintf(detail, sizeof(detail),
+			 "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx",
+			 page_frame_number, offset_in_page,
+			 grain, syndrome);
+		edac_ce_error(mci, pos, msg, location, label, detail,
+			      other_detail, enable_per_layer_report,
+			      page_frame_number, offset_in_page, grain);
+	} else {
+		snprintf(detail, sizeof(detail),
+			 "page:0x%lx offset:0x%lx grain:%d",
+			 page_frame_number, offset_in_page, grain);
 
-	mci->ce_count++;
-	mci->csrows[csrow].ce_count++;
-	mci->csrows[csrow].channels[channel].ce_count++;
+		edac_ue_error(mci, pos, msg, location, label, detail,
+			      other_detail, enable_per_layer_report);
+	}
 }
-EXPORT_SYMBOL(edac_mc_handle_fbd_ce);
+EXPORT_SYMBOL_GPL(edac_mc_handle_error);
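Taken together, a driver report against the hypothetical 8x2 controller collapses into one call; a sketch with invented decoded values (layer2 passed as -1 because only two layers exist):

	edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
			     page, offset, syndrome,
			     4, 0, -1,		/* csrow 4, channel 0 */
			     "read error", "", NULL);

With those positions, edac_inc_ce_error() bumps ce_per_layer[0][4] and ce_per_layer[1][8] (index 4 * 2 + 0), and the DIMM filter also credits the matching legacy csrow/channel counters.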