aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kernel/eeh_pe.c
diff options
context:
space:
mode:
authorGavin Shan <gwshan@linux.vnet.ibm.com>2014-04-24 04:00:19 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2014-04-28 03:34:32 -0400
commitd2b0f6f77ee525811b6efe864efa6a4eb82eea73 (patch)
tree84205706f9cc2e03425ba3a48edf2a1d527e3267 /arch/powerpc/kernel/eeh_pe.c
parent7f52a526f64c69c913f0027fbf43821ff0b3a7d7 (diff)
powerpc/eeh: No hotplug on permanently removed dev
The issue was detected in a somewhat complicated test case where we have multiple hierarchical PEs, as shown in the following figure:

        +-----------------+
        | PE#3     p2p#0  |
        |          p2p#1  |
        +-----------------+
                |
        +-----------------+
        | PE#4     pdev#0 |
        |          pdev#1 |
        +-----------------+

PE#4 (which has 2 PCI devices) is the child of PE#3, which has 2 p2p bridges. We accidentally hit a lesser-known scenario: PE#4 was removed permanently from the system because of a permanent failure (e.g. exceeding the max allowed failure times in the last hour), then we detected EEH errors on PE#3 and tried to recover it. However, the eeh_dev instances for pdev#0/1 were not detached from PE#4, which was still connected to PE#3. All of that was because we rely on the count-based pcibios_release_device(), which isn't reliable enough. When doing recovery for PE#3, we still applied hotplug on PE#4 and pdev#0/1, which are not valid any more. Eventually, we ran into a kernel crash.

The patch fixes the above issue from two aspects. For unplug, we simply skip permanently removed PEs, whose state is (EEH_PE_STATE_ISOLATED && !EEH_PE_STATE_RECOVERING) and whose frozen count should be greater than EEH_MAX_ALLOWED_FREEZES. For plug, we mark all permanently removed EEH devices with EEH_DEV_REMOVED and return 0xFF's when their PCI config space is read, so that the PCI core will omit them.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/kernel/eeh_pe.c')
-rw-r--r--arch/powerpc/kernel/eeh_pe.c47
1 files changed, 40 insertions, 7 deletions
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index f0c353fa655a..995c2a284630 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -503,13 +503,17 @@ static void *__eeh_pe_state_mark(void *data, void *flag)
503 struct eeh_dev *edev, *tmp; 503 struct eeh_dev *edev, *tmp;
504 struct pci_dev *pdev; 504 struct pci_dev *pdev;
505 505
506 /* 506 /* Keep the state of permanently removed PE intact */
507 * Mark the PE with the indicated state. Also, 507 if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
508 * the associated PCI device will be put into 508 (state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING)))
509 * I/O frozen state to avoid I/O accesses from 509 return NULL;
510 * the PCI device driver. 510
511 */
512 pe->state |= state; 511 pe->state |= state;
512
513 /* Offline PCI devices if applicable */
514 if (state != EEH_PE_ISOLATED)
515 return NULL;
516
513 eeh_pe_for_each_dev(pe, edev, tmp) { 517 eeh_pe_for_each_dev(pe, edev, tmp) {
514 pdev = eeh_dev_to_pci_dev(edev); 518 pdev = eeh_dev_to_pci_dev(edev);
515 if (pdev) 519 if (pdev)
@@ -532,6 +536,27 @@ void eeh_pe_state_mark(struct eeh_pe *pe, int state)
532 eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); 536 eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
533} 537}
534 538
539static void *__eeh_pe_dev_mode_mark(void *data, void *flag)
540{
541 struct eeh_dev *edev = data;
542 int mode = *((int *)flag);
543
544 edev->mode |= mode;
545
546 return NULL;
547}
548
549/**
550 * eeh_pe_dev_mode_mark - Mark mode for all devices under the PE
551 * @pe: EEH PE
552 *
553 * Set the given @mode bits on all child devices of the PE.
554 */
555void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
556{
557 eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode);
558}
559
535/** 560/**
536 * __eeh_pe_state_clear - Clear state for the PE 561 * __eeh_pe_state_clear - Clear state for the PE
537 * @data: EEH PE 562 * @data: EEH PE
@@ -546,8 +571,16 @@ static void *__eeh_pe_state_clear(void *data, void *flag)
546 struct eeh_pe *pe = (struct eeh_pe *)data; 571 struct eeh_pe *pe = (struct eeh_pe *)data;
547 int state = *((int *)flag); 572 int state = *((int *)flag);
548 573
574 /* Keep the state of permanently removed PE intact */
575 if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
576 (state & EEH_PE_ISOLATED))
577 return NULL;
578
549 pe->state &= ~state; 579 pe->state &= ~state;
550 pe->check_count = 0; 580
581 /* Clear check count since last isolation */
582 if (state & EEH_PE_ISOLATED)
583 pe->check_count = 0;
551 584
552 return NULL; 585 return NULL;
553} 586}