aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDoug Thompson <dougthompson@xmission.com>2009-05-06 11:55:27 -0400
committerBorislav Petkov <borislav.petkov@amd.com>2009-06-10 06:18:59 -0400
commitd27bf6fa369ca0272df10558d2f290d6fc72e675 (patch)
treeb43a34237e44dd567a34b3a3d2fd233905baf566
parentb1289d6f9d23abab396077abb65d5a23a775cdb0 (diff)
amd64_edac: add error decoding logic
Borislav: - fold amd64_error_info_valid() into its only user - fix/cleanup comments - fix function return value patterns - cleanup debug calls Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com> Signed-off-by: Doug Thompson <dougthompson@xmission.com> Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
-rw-r--r--drivers/edac/amd64_edac.c425
1 files changed, 425 insertions, 0 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index feb4986ea76d..09991c8a6ee3 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2031,3 +2031,428 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
2031 debugf0("syndrome(%x) not found\n", syndrome); 2031 debugf0("syndrome(%x) not found\n", syndrome);
2032 return -1; 2032 return -1;
2033} 2033}
2034
2035/*
2036 * Check for valid error in the NB Status High register. If so, proceed to read
2037 * NB Status Low, NB Address Low and NB Address High registers and store data
2038 * into error structure.
2039 *
2040 * Returns:
2041 * - 1: if hardware regs contains valid error info
2042 * - 0: if no valid error is indicated
2043 */
2044static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
2045 struct amd64_error_info_regs *regs)
2046{
2047 struct amd64_pvt *pvt;
2048 struct pci_dev *misc_f3_ctl;
2049 int err = 0;
2050
2051 pvt = mci->pvt_info;
2052 misc_f3_ctl = pvt->misc_f3_ctl;
2053
2054 err = pci_read_config_dword(misc_f3_ctl, K8_NBSH, &regs->nbsh);
2055 if (err)
2056 goto err_reg;
2057
2058 if (!(regs->nbsh & K8_NBSH_VALID_BIT))
2059 return 0;
2060
2061 /* valid error, read remaining error information registers */
2062 err = pci_read_config_dword(misc_f3_ctl, K8_NBSL, &regs->nbsl);
2063 if (err)
2064 goto err_reg;
2065
2066 err = pci_read_config_dword(misc_f3_ctl, K8_NBEAL, &regs->nbeal);
2067 if (err)
2068 goto err_reg;
2069
2070 err = pci_read_config_dword(misc_f3_ctl, K8_NBEAH, &regs->nbeah);
2071 if (err)
2072 goto err_reg;
2073
2074 err = pci_read_config_dword(misc_f3_ctl, K8_NBCFG, &regs->nbcfg);
2075 if (err)
2076 goto err_reg;
2077
2078 return 1;
2079
2080err_reg:
2081 debugf0("Reading error info register failed\n");
2082 return 0;
2083}
2084
2085/*
2086 * This function is called to retrieve the error data from hardware and store it
2087 * in the info structure.
2088 *
2089 * Returns:
2090 * - 1: if a valid error is found
2091 * - 0: if no error is found
2092 */
2093static int amd64_get_error_info(struct mem_ctl_info *mci,
2094 struct amd64_error_info_regs *info)
2095{
2096 struct amd64_pvt *pvt;
2097 struct amd64_error_info_regs regs;
2098
2099 pvt = mci->pvt_info;
2100
2101 if (!amd64_get_error_info_regs(mci, info))
2102 return 0;
2103
2104 /*
2105 * Here's the problem with the K8's EDAC reporting: There are four
2106 * registers which report pieces of error information. They are shared
2107 * between CEs and UEs. Furthermore, contrary to what is stated in the
2108 * BKDG, the overflow bit is never used! Every error always updates the
2109 * reporting registers.
2110 *
2111 * Can you see the race condition? All four error reporting registers
2112 * must be read before a new error updates them! There is no way to read
2113 * all four registers atomically. The best than can be done is to detect
2114 * that a race has occured and then report the error without any kind of
2115 * precision.
2116 *
2117 * What is still positive is that errors are still reported and thus
2118 * problems can still be detected - just not localized because the
2119 * syndrome and address are spread out across registers.
2120 *
2121 * Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
2122 * UEs and CEs should have separate register sets with proper overflow
2123 * bits that are used! At very least the problem can be fixed by
2124 * honoring the ErrValid bit in 'nbsh' and not updating registers - just
2125 * set the overflow bit - unless the current error is CE and the new
2126 * error is UE which would be the only situation for overwriting the
2127 * current values.
2128 */
2129
2130 regs = *info;
2131
2132 /* Use info from the second read - most current */
2133 if (unlikely(!amd64_get_error_info_regs(mci, info)))
2134 return 0;
2135
2136 /* clear the error bits in hardware */
2137 pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
2138
2139 /* Check for the possible race condition */
2140 if ((regs.nbsh != info->nbsh) ||
2141 (regs.nbsl != info->nbsl) ||
2142 (regs.nbeah != info->nbeah) ||
2143 (regs.nbeal != info->nbeal)) {
2144 amd64_mc_printk(mci, KERN_WARNING,
2145 "hardware STATUS read access race condition "
2146 "detected!\n");
2147 return 0;
2148 }
2149 return 1;
2150}
2151
2152static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
2153 struct amd64_error_info_regs *info)
2154{
2155 u32 err_code;
2156 u32 ec_tt; /* error code transaction type (2b) */
2157 u32 ec_ll; /* error code cache level (2b) */
2158
2159 err_code = EXTRACT_ERROR_CODE(info->nbsl);
2160 ec_ll = EXTRACT_LL_CODE(err_code);
2161 ec_tt = EXTRACT_TT_CODE(err_code);
2162
2163 amd64_mc_printk(mci, KERN_ERR,
2164 "GART TLB event: transaction type(%s), "
2165 "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
2166}
2167
2168static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
2169 struct amd64_error_info_regs *info)
2170{
2171 u32 err_code;
2172 u32 ec_rrrr; /* error code memory transaction (4b) */
2173 u32 ec_tt; /* error code transaction type (2b) */
2174 u32 ec_ll; /* error code cache level (2b) */
2175
2176 err_code = EXTRACT_ERROR_CODE(info->nbsl);
2177 ec_ll = EXTRACT_LL_CODE(err_code);
2178 ec_tt = EXTRACT_TT_CODE(err_code);
2179 ec_rrrr = EXTRACT_RRRR_CODE(err_code);
2180
2181 amd64_mc_printk(mci, KERN_ERR,
2182 "cache hierarchy error: memory transaction type(%s), "
2183 "transaction type(%s), cache level(%s)\n",
2184 rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
2185}
2186
2187
2188/*
2189 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
2190 * ADDRESS and process.
2191 */
2192static void amd64_handle_ce(struct mem_ctl_info *mci,
2193 struct amd64_error_info_regs *info)
2194{
2195 struct amd64_pvt *pvt = mci->pvt_info;
2196 u64 SystemAddress;
2197
2198 /* Ensure that the Error Address is VALID */
2199 if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
2200 amd64_mc_printk(mci, KERN_ERR,
2201 "HW has no ERROR_ADDRESS available\n");
2202 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
2203 return;
2204 }
2205
2206 SystemAddress = extract_error_address(mci, info);
2207
2208 amd64_mc_printk(mci, KERN_ERR,
2209 "CE ERROR_ADDRESS= 0x%llx\n", SystemAddress);
2210
2211 pvt->ops->map_sysaddr_to_csrow(mci, info, SystemAddress);
2212}
2213
2214/* Handle any Un-correctable Errors (UEs) */
2215static void amd64_handle_ue(struct mem_ctl_info *mci,
2216 struct amd64_error_info_regs *info)
2217{
2218 int csrow;
2219 u64 SystemAddress;
2220 u32 page, offset;
2221 struct mem_ctl_info *log_mci, *src_mci = NULL;
2222
2223 log_mci = mci;
2224
2225 if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
2226 amd64_mc_printk(mci, KERN_CRIT,
2227 "HW has no ERROR_ADDRESS available\n");
2228 edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
2229 return;
2230 }
2231
2232 SystemAddress = extract_error_address(mci, info);
2233
2234 /*
2235 * Find out which node the error address belongs to. This may be
2236 * different from the node that detected the error.
2237 */
2238 src_mci = find_mc_by_sys_addr(mci, SystemAddress);
2239 if (!src_mci) {
2240 amd64_mc_printk(mci, KERN_CRIT,
2241 "ERROR ADDRESS (0x%lx) value NOT mapped to a MC\n",
2242 (unsigned long)SystemAddress);
2243 edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
2244 return;
2245 }
2246
2247 log_mci = src_mci;
2248
2249 csrow = sys_addr_to_csrow(log_mci, SystemAddress);
2250 if (csrow < 0) {
2251 amd64_mc_printk(mci, KERN_CRIT,
2252 "ERROR_ADDRESS (0x%lx) value NOT mapped to 'csrow'\n",
2253 (unsigned long)SystemAddress);
2254 edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
2255 } else {
2256 error_address_to_page_and_offset(SystemAddress, &page, &offset);
2257 edac_mc_handle_ue(log_mci, page, offset, csrow, EDAC_MOD_STR);
2258 }
2259}
2260
2261static void amd64_decode_bus_error(struct mem_ctl_info *mci,
2262 struct amd64_error_info_regs *info)
2263{
2264 u32 err_code, ext_ec;
2265 u32 ec_pp; /* error code participating processor (2p) */
2266 u32 ec_to; /* error code timed out (1b) */
2267 u32 ec_rrrr; /* error code memory transaction (4b) */
2268 u32 ec_ii; /* error code memory or I/O (2b) */
2269 u32 ec_ll; /* error code cache level (2b) */
2270
2271 ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
2272 err_code = EXTRACT_ERROR_CODE(info->nbsl);
2273
2274 ec_ll = EXTRACT_LL_CODE(err_code);
2275 ec_ii = EXTRACT_II_CODE(err_code);
2276 ec_rrrr = EXTRACT_RRRR_CODE(err_code);
2277 ec_to = EXTRACT_TO_CODE(err_code);
2278 ec_pp = EXTRACT_PP_CODE(err_code);
2279
2280 amd64_mc_printk(mci, KERN_ERR,
2281 "BUS ERROR:\n"
2282 " time-out(%s) mem or i/o(%s)\n"
2283 " participating processor(%s)\n"
2284 " memory transaction type(%s)\n"
2285 " cache level(%s) Error Found by: %s\n",
2286 to_msgs[ec_to],
2287 ii_msgs[ec_ii],
2288 pp_msgs[ec_pp],
2289 rrrr_msgs[ec_rrrr],
2290 ll_msgs[ec_ll],
2291 (info->nbsh & K8_NBSH_ERR_SCRUBER) ?
2292 "Scrubber" : "Normal Operation");
2293
2294 /* If this was an 'observed' error, early out */
2295 if (ec_pp == K8_NBSL_PP_OBS)
2296 return; /* We aren't the node involved */
2297
2298 /* Parse out the extended error code for ECC events */
2299 switch (ext_ec) {
2300 /* F10 changed to one Extended ECC error code */
2301 case F10_NBSL_EXT_ERR_RES: /* Reserved field */
2302 case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */
2303 break;
2304
2305 default:
2306 amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
2307 "handling for this error\n");
2308 return;
2309 }
2310
2311 if (info->nbsh & K8_NBSH_CECC)
2312 amd64_handle_ce(mci, info);
2313 else if (info->nbsh & K8_NBSH_UECC)
2314 amd64_handle_ue(mci, info);
2315
2316 /*
2317 * If main error is CE then overflow must be CE. If main error is UE
2318 * then overflow is unknown. We'll call the overflow a CE - if
2319 * panic_on_ue is set then we're already panic'ed and won't arrive
2320 * here. Else, then apparently someone doesn't think that UE's are
2321 * catastrophic.
2322 */
2323 if (info->nbsh & K8_NBSH_OVERFLOW)
2324 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
2325 "Error Overflow set");
2326}
2327
2328int amd64_process_error_info(struct mem_ctl_info *mci,
2329 struct amd64_error_info_regs *info,
2330 int handle_errors)
2331{
2332 struct amd64_pvt *pvt;
2333 struct amd64_error_info_regs *regs;
2334 u32 err_code, ext_ec;
2335 int gart_tlb_error = 0;
2336
2337 pvt = mci->pvt_info;
2338
2339 /* If caller doesn't want us to process the error, return */
2340 if (!handle_errors)
2341 return 1;
2342
2343 regs = info;
2344
2345 debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
2346 debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
2347 pvt->mc_node_id, regs->nbeah, regs->nbeal);
2348 debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n",
2349 regs->nbsh, regs->nbsl);
2350 debugf1(" Valid Error=%s Overflow=%s\n",
2351 (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
2352 (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
2353 debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n",
2354 (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
2355 "True" : "False",
2356 (regs->nbsh & K8_NBSH_ERR_ENABLE) ?
2357 "True" : "False");
2358 debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
2359 (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
2360 "True" : "False",
2361 (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
2362 "True" : "False",
2363 (regs->nbsh & K8_NBSH_PCC) ?
2364 "True" : "False");
2365 debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n",
2366 (regs->nbsh & K8_NBSH_CECC) ?
2367 "True" : "False",
2368 (regs->nbsh & K8_NBSH_UECC) ?
2369 "True" : "False",
2370 (regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
2371 "True" : "False");
2372 debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
2373 (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
2374 (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
2375 (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
2376 (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
2377
2378
2379 err_code = EXTRACT_ERROR_CODE(regs->nbsl);
2380
2381 /* Determine which error type:
2382 * 1) GART errors - non-fatal, developmental events
2383 * 2) MEMORY errors
2384 * 3) BUS errors
2385 * 4) Unknown error
2386 */
2387 if (TEST_TLB_ERROR(err_code)) {
2388 /*
2389 * GART errors are intended to help graphics driver developers
2390 * to detect bad GART PTEs. It is recommended by AMD to disable
2391 * GART table walk error reporting by default[1] (currently
2392 * being disabled in mce_cpu_quirks()) and according to the
2393 * comment in mce_cpu_quirks(), such GART errors can be
2394 * incorrectly triggered. We may see these errors anyway and
2395 * unless requested by the user, they won't be reported.
2396 *
2397 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
2398 * AMD NPT family 0Fh processors
2399 */
2400 if (report_gart_errors == 0)
2401 return 1;
2402
2403 /*
2404 * Only if GART error reporting is requested should we generate
2405 * any logs.
2406 */
2407 gart_tlb_error = 1;
2408
2409 debugf1("GART TLB error\n");
2410 amd64_decode_gart_tlb_error(mci, info);
2411 } else if (TEST_MEM_ERROR(err_code)) {
2412 debugf1("Memory/Cache error\n");
2413 amd64_decode_mem_cache_error(mci, info);
2414 } else if (TEST_BUS_ERROR(err_code)) {
2415 debugf1("Bus (Link/DRAM) error\n");
2416 amd64_decode_bus_error(mci, info);
2417 } else {
2418 /* shouldn't reach here! */
2419 amd64_mc_printk(mci, KERN_WARNING,
2420 "%s(): unknown MCE error 0x%x\n", __func__,
2421 err_code);
2422 }
2423
2424 ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
2425 amd64_mc_printk(mci, KERN_ERR,
2426 "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
2427
2428 if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
2429 ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
2430 (ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
2431 EXTRACT_LDT_LINK(info->nbsh)) {
2432
2433 amd64_mc_printk(mci, KERN_ERR,
2434 "Error on hypertransport link: %s\n",
2435 htlink_msgs[
2436 EXTRACT_LDT_LINK(info->nbsh)]);
2437 }
2438
2439 /*
2440 * Check the UE bit of the NB status high register, if set generate some
2441 * logs. If NOT a GART error, then process the event as a NO-INFO event.
2442 * If it was a GART error, skip that process.
2443 */
2444 if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
2445 amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
2446 if (!gart_tlb_error)
2447 edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
2448 }
2449
2450 if (regs->nbsh & K8_NBSH_PCC)
2451 amd64_mc_printk(mci, KERN_CRIT,
2452 "PCC (processor context corrupt) set\n");
2453
2454 return 1;
2455}
2456EXPORT_SYMBOL_GPL(amd64_process_error_info);
2457
2458