aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorGavin Shan <shangw@linux.vnet.ibm.com>2013-06-20 01:21:01 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2013-06-20 03:06:04 -0400
commit5a71978e4b6ee6a01bc6aab926a3571055123029 (patch)
treed879a16b3d1d78e90b84070633375064fa4c5a0c /arch
parentc86085580d5f60d2d3cea9c60d50e284558d3de7 (diff)
powerpc/eeh: Trace time on first error for PE
A PE is not expected to be frozen more than 5 times within the last hour; if it is, the PE is removed from the system when the next EEH error arrives. This patch introduces a time stamp that records the first error on a specific PE within the last hour, and a function that updates it accordingly. In addition, the time stamp is restored on the PE hotplug path, as is already done for the freeze count. Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/powerpc/include/asm/eeh.h3
-rw-r--r--arch/powerpc/kernel/eeh_driver.c5
-rw-r--r--arch/powerpc/kernel/eeh_pe.c27
3 files changed, 35 insertions, 0 deletions
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index beec7883d93e..e1109fd87ff4 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -24,6 +24,7 @@
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/list.h> 25#include <linux/list.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/time.h>
27 28
28struct pci_dev; 29struct pci_dev;
29struct pci_bus; 30struct pci_bus;
@@ -62,6 +63,7 @@ struct eeh_pe {
62 struct pci_bus *bus; /* Top PCI bus for bus PE */ 63 struct pci_bus *bus; /* Top PCI bus for bus PE */
63 int check_count; /* Times of ignored error */ 64 int check_count; /* Times of ignored error */
64 int freeze_count; /* Times of froze up */ 65 int freeze_count; /* Times of froze up */
66 struct timeval tstamp; /* Time on first-time freeze */
65 int false_positives; /* Times of reported #ff's */ 67 int false_positives; /* Times of reported #ff's */
66 struct eeh_pe *parent; /* Parent PE */ 68 struct eeh_pe *parent; /* Parent PE */
67 struct list_head child_list; /* Link PE to the child list */ 69 struct list_head child_list; /* Link PE to the child list */
@@ -190,6 +192,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
190struct eeh_pe *eeh_pe_get(struct eeh_dev *edev); 192struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
191int eeh_add_to_parent_pe(struct eeh_dev *edev); 193int eeh_add_to_parent_pe(struct eeh_dev *edev);
192int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe); 194int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe);
195void eeh_pe_update_time_stamp(struct eeh_pe *pe);
193void *eeh_pe_dev_traverse(struct eeh_pe *root, 196void *eeh_pe_dev_traverse(struct eeh_pe *root,
194 eeh_traverse_func fn, void *flag); 197 eeh_traverse_func fn, void *flag);
195void eeh_pe_restore_bars(struct eeh_pe *pe); 198void eeh_pe_restore_bars(struct eeh_pe *pe);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index fb927af9a9ef..678bc6cddf82 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata)
349 */ 349 */
350static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) 350static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
351{ 351{
352 struct timeval tstamp;
352 int cnt, rc; 353 int cnt, rc;
353 354
354 /* pcibios will clear the counter; save the value */ 355 /* pcibios will clear the counter; save the value */
355 cnt = pe->freeze_count; 356 cnt = pe->freeze_count;
357 tstamp = pe->tstamp;
356 358
357 /* 359 /*
358 * We don't remove the corresponding PE instances because 360 * We don't remove the corresponding PE instances because
@@ -385,6 +387,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
385 ssleep(5); 387 ssleep(5);
386 pcibios_add_pci_devices(bus); 388 pcibios_add_pci_devices(bus);
387 } 389 }
390
391 pe->tstamp = tstamp;
388 pe->freeze_count = cnt; 392 pe->freeze_count = cnt;
389 393
390 return 0; 394 return 0;
@@ -425,6 +429,7 @@ void eeh_handle_event(struct eeh_pe *pe)
425 return; 429 return;
426 } 430 }
427 431
432 eeh_pe_update_time_stamp(pe);
428 pe->freeze_count++; 433 pe->freeze_count++;
429 if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) 434 if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
430 goto excess_failures; 435 goto excess_failures;
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index c96366758acf..ae75722c0583 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -482,6 +482,33 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
482} 482}
483 483
484/** 484/**
485 * eeh_pe_update_time_stamp - Update PE's frozen time stamp
486 * @pe: EEH PE
487 *
488 * Each PE has a time stamp recording when it was first frozen
489 * within the last hour. This function should be called to update
490 * the time stamp on the first error of the specific PE. On the other
491 * hand, errors older than that one-hour window need not be counted.
492 */
493void eeh_pe_update_time_stamp(struct eeh_pe *pe)
494{
495 struct timeval tstamp; /* current time; only filled in on the else path below */
496
497 if (!pe) return; /* no PE given, nothing to stamp */
498
499 if (pe->freeze_count <= 0) { /* no freeze counted yet: start a new window */
500 pe->freeze_count = 0; /* normalise a negative count to zero */
501 do_gettimeofday(&pe->tstamp); /* the window starts at the current time */
502 } else {
503 do_gettimeofday(&tstamp);
504 if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { /* recorded stamp is more than 3600 s (one hour) old */
505 pe->tstamp = tstamp; /* restart the window from now ... */
506 pe->freeze_count = 0; /* ... and drop the freezes counted in the old window */
507 }
508 }
509}
510
511/**
485 * __eeh_pe_state_mark - Mark the state for the PE 512 * __eeh_pe_state_mark - Mark the state for the PE
486 * @data: EEH PE 513 * @data: EEH PE
487 * @flag: state 514 * @flag: state