author      Doug Thompson <dougthompson@xmission.com>    2009-05-06 11:55:27 -0400
committer   Borislav Petkov <borislav.petkov@amd.com>    2009-06-10 06:18:59 -0400
commit      d27bf6fa369ca0272df10558d2f290d6fc72e675 (patch)
tree        b43a34237e44dd567a34b3a3d2fd233905baf566 /drivers/edac/amd64_edac.c
parent      b1289d6f9d23abab396077abb65d5a23a775cdb0 (diff)
amd64_edac: add error decoding logic
Borislav:
- fold amd64_error_info_valid() into its only user
- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls
Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Diffstat (limited to 'drivers/edac/amd64_edac.c')
-rw-r--r--   drivers/edac/amd64_edac.c   425
1 file changed, 425 insertions(+), 0 deletions(-)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index feb4986ea76d..09991c8a6ee3 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2031,3 +2031,428 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
         debugf0("syndrome(%x) not found\n", syndrome);
         return -1;
 }
+
+/*
+ * Check for valid error in the NB Status High register. If so, proceed to read
+ * NB Status Low, NB Address Low and NB Address High registers and store data
+ * into error structure.
+ *
+ * Returns:
+ *	- 1: if hardware regs contain valid error info
+ *	- 0: if no valid error is indicated
+ */
+static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
+				     struct amd64_error_info_regs *regs)
+{
+	struct amd64_pvt *pvt;
+	struct pci_dev *misc_f3_ctl;
+	int err = 0;
+
+	pvt = mci->pvt_info;
+	misc_f3_ctl = pvt->misc_f3_ctl;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBSH, &regs->nbsh);
+	if (err)
+		goto err_reg;
+
+	if (!(regs->nbsh & K8_NBSH_VALID_BIT))
+		return 0;
+
+	/* valid error, read remaining error information registers */
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBSL, &regs->nbsl);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAL, &regs->nbeal);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAH, &regs->nbeah);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBCFG, &regs->nbcfg);
+	if (err)
+		goto err_reg;
+
+	return 1;
+
+err_reg:
+	debugf0("Reading error info register failed\n");
+	return 0;
+}
+
+/*
+ * This function is called to retrieve the error data from hardware and store it
+ * in the info structure.
+ *
+ * Returns:
+ *	- 1: if a valid error is found
+ *	- 0: if no error is found
+ */
+static int amd64_get_error_info(struct mem_ctl_info *mci,
+				struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt;
+	struct amd64_error_info_regs regs;
+
+	pvt = mci->pvt_info;
+
+	if (!amd64_get_error_info_regs(mci, info))
+		return 0;
+
+	/*
+	 * Here's the problem with the K8's EDAC reporting: There are four
+	 * registers which report pieces of error information. They are shared
+	 * between CEs and UEs. Furthermore, contrary to what is stated in the
+	 * BKDG, the overflow bit is never used! Every error always updates the
+	 * reporting registers.
+	 *
+	 * Can you see the race condition? All four error reporting registers
+	 * must be read before a new error updates them! There is no way to read
+	 * all four registers atomically. The best that can be done is to detect
+	 * that a race has occurred and then report the error without any kind of
+	 * precision.
+	 *
+	 * What is still positive is that errors are still reported and thus
+	 * problems can still be detected - just not localized because the
+	 * syndrome and address are spread out across registers.
+	 *
+	 * Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
+	 * UEs and CEs should have separate register sets with proper overflow
+	 * bits that are used! At the very least the problem can be fixed by
+	 * honoring the ErrValid bit in 'nbsh' and not updating registers - just
+	 * set the overflow bit - unless the current error is CE and the new
+	 * error is UE which would be the only situation for overwriting the
+	 * current values.
+	 */
+
+	regs = *info;
+
+	/* Use info from the second read - most current */
+	if (unlikely(!amd64_get_error_info_regs(mci, info)))
+		return 0;
+
+	/* clear the error bits in hardware */
+	pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
+
+	/* Check for the possible race condition */
+	if ((regs.nbsh != info->nbsh) ||
+	    (regs.nbsl != info->nbsl) ||
+	    (regs.nbeah != info->nbeah) ||
+	    (regs.nbeal != info->nbeal)) {
+		amd64_mc_printk(mci, KERN_WARNING,
+				"hardware STATUS read access race condition "
+				"detected!\n");
+		return 0;
+	}
+	return 1;
+}
+
+static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
+					       struct amd64_error_info_regs *info)
+{
+	u32 err_code;
+	u32 ec_tt;	/* error code transaction type (2b) */
+	u32 ec_ll;	/* error code cache level (2b) */
+
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_tt = EXTRACT_TT_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+			"GART TLB event: transaction type(%s), "
+			"cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
+}
+
+static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
+						struct amd64_error_info_regs *info)
+{
+	u32 err_code;
+	u32 ec_rrrr;	/* error code memory transaction (4b) */
+	u32 ec_tt;	/* error code transaction type (2b) */
+	u32 ec_ll;	/* error code cache level (2b) */
+
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_tt = EXTRACT_TT_CODE(err_code);
+	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+			"cache hierarchy error: memory transaction type(%s), "
+			"transaction type(%s), cache level(%s)\n",
+			rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
+}
+
+
+/*
+ * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
+ * ADDRESS and process.
+ */
+static void amd64_handle_ce(struct mem_ctl_info *mci,
+			    struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 SystemAddress;
+
+	/* Ensure that the Error Address is VALID */
+	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
+		amd64_mc_printk(mci, KERN_ERR,
+				"HW has no ERROR_ADDRESS available\n");
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+		return;
+	}
+
+	SystemAddress = extract_error_address(mci, info);
+
+	amd64_mc_printk(mci, KERN_ERR,
+			"CE ERROR_ADDRESS= 0x%llx\n", SystemAddress);
+
+	pvt->ops->map_sysaddr_to_csrow(mci, info, SystemAddress);
+}
+
+/* Handle any Un-correctable Errors (UEs) */
+static void amd64_handle_ue(struct mem_ctl_info *mci,
+			    struct amd64_error_info_regs *info)
+{
+	int csrow;
+	u64 SystemAddress;
+	u32 page, offset;
+	struct mem_ctl_info *log_mci, *src_mci = NULL;
+
+	log_mci = mci;
+
+	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
+		amd64_mc_printk(mci, KERN_CRIT,
+				"HW has no ERROR_ADDRESS available\n");
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		return;
+	}
+
+	SystemAddress = extract_error_address(mci, info);
+
+	/*
+	 * Find out which node the error address belongs to. This may be
+	 * different from the node that detected the error.
+	 */
+	src_mci = find_mc_by_sys_addr(mci, SystemAddress);
+	if (!src_mci) {
+		amd64_mc_printk(mci, KERN_CRIT,
+				"ERROR ADDRESS (0x%lx) value NOT mapped to a MC\n",
+				(unsigned long)SystemAddress);
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		return;
+	}
+
+	log_mci = src_mci;
+
+	csrow = sys_addr_to_csrow(log_mci, SystemAddress);
+	if (csrow < 0) {
+		amd64_mc_printk(mci, KERN_CRIT,
+				"ERROR_ADDRESS (0x%lx) value NOT mapped to 'csrow'\n",
+				(unsigned long)SystemAddress);
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+	} else {
+		error_address_to_page_and_offset(SystemAddress, &page, &offset);
+		edac_mc_handle_ue(log_mci, page, offset, csrow, EDAC_MOD_STR);
+	}
+}
+
+static void amd64_decode_bus_error(struct mem_ctl_info *mci,
+				   struct amd64_error_info_regs *info)
+{
+	u32 err_code, ext_ec;
+	u32 ec_pp;	/* error code participating processor (2p) */
+	u32 ec_to;	/* error code timed out (1b) */
+	u32 ec_rrrr;	/* error code memory transaction (4b) */
+	u32 ec_ii;	/* error code memory or I/O (2b) */
+	u32 ec_ll;	/* error code cache level (2b) */
+
+	ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_ii = EXTRACT_II_CODE(err_code);
+	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+	ec_to = EXTRACT_TO_CODE(err_code);
+	ec_pp = EXTRACT_PP_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+			"BUS ERROR:\n"
+			" time-out(%s) mem or i/o(%s)\n"
+			" participating processor(%s)\n"
+			" memory transaction type(%s)\n"
+			" cache level(%s) Error Found by: %s\n",
+			to_msgs[ec_to],
+			ii_msgs[ec_ii],
+			pp_msgs[ec_pp],
+			rrrr_msgs[ec_rrrr],
+			ll_msgs[ec_ll],
+			(info->nbsh & K8_NBSH_ERR_SCRUBER) ?
+			"Scrubber" : "Normal Operation");
+
+	/* If this was an 'observed' error, early out */
+	if (ec_pp == K8_NBSL_PP_OBS)
+		return;		/* We aren't the node involved */
+
+	/* Parse out the extended error code for ECC events */
+	switch (ext_ec) {
+	/* F10 changed to one Extended ECC error code */
+	case F10_NBSL_EXT_ERR_RES:	/* Reserved field */
+	case F10_NBSL_EXT_ERR_ECC:	/* F10 ECC ext err code */
+		break;
+
+	default:
+		amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
+				"handling for this error\n");
+		return;
+	}
+
+	if (info->nbsh & K8_NBSH_CECC)
+		amd64_handle_ce(mci, info);
+	else if (info->nbsh & K8_NBSH_UECC)
+		amd64_handle_ue(mci, info);
+
+	/*
+	 * If main error is CE then overflow must be CE. If main error is UE
+	 * then overflow is unknown. We'll call the overflow a CE - if
+	 * panic_on_ue is set then we're already panic'ed and won't arrive
+	 * here. Else, apparently someone doesn't think that UEs are
+	 * catastrophic.
+	 */
+	if (info->nbsh & K8_NBSH_OVERFLOW)
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
+					  "Error Overflow set");
+}
+
+int amd64_process_error_info(struct mem_ctl_info *mci,
+			     struct amd64_error_info_regs *info,
+			     int handle_errors)
+{
+	struct amd64_pvt *pvt;
+	struct amd64_error_info_regs *regs;
+	u32 err_code, ext_ec;
+	int gart_tlb_error = 0;
+
+	pvt = mci->pvt_info;
+
+	/* If caller doesn't want us to process the error, return */
+	if (!handle_errors)
+		return 1;
+
+	regs = info;
+
+	debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
+	debugf1("  MC node(%d) Error-Address(0x%.8x-%.8x)\n",
+		pvt->mc_node_id, regs->nbeah, regs->nbeal);
+	debugf1("  nbsh(0x%.8x) nbsl(0x%.8x)\n",
+		regs->nbsh, regs->nbsl);
+	debugf1("  Valid Error=%s Overflow=%s\n",
+		(regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
+	debugf1("  Err Uncorrected=%s MCA Error Reporting=%s\n",
+		(regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_ERR_ENABLE) ?
+			"True" : "False");
+	debugf1("  MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
+		(regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_PCC) ?
+			"True" : "False");
+	debugf1("  CECC=%s UECC=%s Found by Scrubber=%s\n",
+		(regs->nbsh & K8_NBSH_CECC) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_UECC) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
+			"True" : "False");
+	debugf1("  CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
+		(regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
+
+
+	err_code = EXTRACT_ERROR_CODE(regs->nbsl);
+
+	/* Determine which error type:
+	 *	1) GART errors - non-fatal, developmental events
+	 *	2) MEMORY errors
+	 *	3) BUS errors
+	 *	4) Unknown error
+	 */
+	if (TEST_TLB_ERROR(err_code)) {
+		/*
+		 * GART errors are intended to help graphics driver developers
+		 * to detect bad GART PTEs. It is recommended by AMD to disable
+		 * GART table walk error reporting by default[1] (currently
+		 * being disabled in mce_cpu_quirks()) and according to the
+		 * comment in mce_cpu_quirks(), such GART errors can be
+		 * incorrectly triggered. We may see these errors anyway and
+		 * unless requested by the user, they won't be reported.
+		 *
+		 * [1] section 13.10.1 of the BIOS and Kernel Developers Guide
+		 * for AMD NPT family 0Fh processors
+		 */
+		if (report_gart_errors == 0)
+			return 1;
+
+		/*
+		 * Only if GART error reporting is requested should we generate
+		 * any logs.
+		 */
+		gart_tlb_error = 1;
+
+		debugf1("GART TLB error\n");
+		amd64_decode_gart_tlb_error(mci, info);
+	} else if (TEST_MEM_ERROR(err_code)) {
+		debugf1("Memory/Cache error\n");
+		amd64_decode_mem_cache_error(mci, info);
+	} else if (TEST_BUS_ERROR(err_code)) {
+		debugf1("Bus (Link/DRAM) error\n");
+		amd64_decode_bus_error(mci, info);
+	} else {
+		/* shouldn't reach here! */
+		amd64_mc_printk(mci, KERN_WARNING,
+				"%s(): unknown MCE error 0x%x\n", __func__,
+				err_code);
+	}
+
+	ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
+	amd64_mc_printk(mci, KERN_ERR,
+			"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
+
+	if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
+	      ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
+	     (ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
+	    EXTRACT_LDT_LINK(info->nbsh)) {
+
+		amd64_mc_printk(mci, KERN_ERR,
+				"Error on hypertransport link: %s\n",
+				htlink_msgs[
+				EXTRACT_LDT_LINK(info->nbsh)]);
+	}
+
+	/*
+	 * Check the UE bit of the NB status high register; if set, generate
+	 * some logs. If NOT a GART error, then process the event as a NO-INFO
+	 * event. If it was a GART error, skip that process.
+	 */
+	if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
+		amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
+		if (!gart_tlb_error)
+			edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
+	}
+
+	if (regs->nbsh & K8_NBSH_PCC)
+		amd64_mc_printk(mci, KERN_CRIT,
+				"PCC (processor context corrupt) set\n");
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(amd64_process_error_info);
+
+
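
The double-read scheme described in the comment inside amd64_get_error_info() can be modelled outside the kernel. The sketch below is a minimal user-space illustration of that pattern only: struct err_regs, read_error_regs() and the main() harness are invented stand-ins for amd64_error_info_regs and the four pci_read_config_dword() calls, not driver code.

/*
 * Minimal user-space sketch of the "read twice, compare, report race"
 * pattern used by amd64_get_error_info().  All names here are invented
 * for illustration; nothing below is part of the EDAC driver.
 */
#include <stdio.h>
#include <string.h>

struct err_regs {			/* stand-in for amd64_error_info_regs */
	unsigned int nbsh, nbsl, nbeah, nbeal;
};

/* Pretend "hardware": returns the current contents of the NB error regs. */
static void read_error_regs(const struct err_regs *hw, struct err_regs *out)
{
	*out = *hw;			/* a real driver does four PCI config reads */
}

/* Returns 1 if a consistent snapshot was captured, 0 on a detected race. */
static int get_error_info(const struct err_regs *hw, struct err_regs *info)
{
	struct err_regs first;

	read_error_regs(hw, &first);
	read_error_regs(hw, info);	/* second read is the one reported */

	if (memcmp(&first, info, sizeof(first)) != 0) {
		fprintf(stderr, "race detected: registers changed between reads\n");
		return 0;
	}
	return 1;
}

int main(void)
{
	struct err_regs hw = { .nbsh = 0x80000000, .nbsl = 0x0100 };
	struct err_regs info;

	if (get_error_info(&hw, &info))
		printf("stable snapshot: nbsh=0x%08x nbsl=0x%08x\n",
		       info.nbsh, info.nbsl);
	return 0;
}

The driver does the same comparison field by field and additionally clears K8_NBSH_VALID_BIT between the two snapshots; the sketch keeps only the read-twice-and-compare core.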
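
The TEST_TLB_ERROR()/TEST_MEM_ERROR()/TEST_BUS_ERROR() and EXTRACT_*_CODE() macros used above are defined in amd64_edac.h and are not part of this hunk. As a rough guide to what the dispatch in amd64_process_error_info() keys on, the stand-alone sketch below classifies the 16-bit error code in the low bits of NBSL according to the generic MCA compound error-code layout (TLB: 0000 0000 0001 TTLL, memory: 0000 0001 RRRR TTLL, bus: 0000 1PPT RRRR IILL); the masks and helper names here are illustrative assumptions, not the driver's actual definitions.

/*
 * Illustrative classifier for the 16-bit MCA error code found in the low
 * bits of NBSL.  The masks follow the generic MCA compound error-code
 * layout as described above; they are not copied from amd64_edac.h.
 */
#include <stdio.h>

enum err_class { ERR_UNKNOWN, ERR_GART_TLB, ERR_MEM, ERR_BUS };

static enum err_class classify(unsigned int nbsl)
{
	unsigned int code = nbsl & 0xffff;	/* error code lives in NBSL[15:0] */

	if ((code & 0xfff0) == 0x0010)		/* 0000 0000 0001 TTLL */
		return ERR_GART_TLB;
	if ((code & 0xff00) == 0x0100)		/* 0000 0001 RRRR TTLL */
		return ERR_MEM;
	if ((code & 0xf800) == 0x0800)		/* 0000 1PPT RRRR IILL */
		return ERR_BUS;
	return ERR_UNKNOWN;
}

int main(void)
{
	static const char * const names[] = {
		"unknown", "GART TLB", "memory/cache", "bus"
	};
	unsigned int samples[] = { 0x0011, 0x0151, 0x0813, 0xf000 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("nbsl=0x%04x -> %s error\n",
		       samples[i], names[classify(samples[i])]);
	return 0;
}

Building this with any C99 compiler and running it prints which decode path (GART TLB, memory/cache, bus, or the "unknown MCE error" fallback) each sample code would take in the function above.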