aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorZhang, Yanmin <yanmin_zhang@linux.intel.com>2009-06-16 01:35:11 -0400
committerJesse Barnes <jbarnes@virtuousgeek.org>2009-06-16 17:30:13 -0400
commit28eb27cf0839a30948335f9b2edda739f48b7a2e (patch)
tree3278825cdfc730c4b44fcf18a41a2d96180030a3
parent70298c6e6c1ba68346336b4ea54bd5c0abbf73c8 (diff)
PCI AER: support invalid error source IDs
When the bus id part of error source id is equal to 0 or nosourceid=1, make the kernel probe the AER status registers of all devices under the root port to find the initial error reporter. Reviewed-by: Andrew Patterson <andrew.patterson@hp.com> Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
-rw-r--r--Documentation/PCI/pcieaer-howto.txt4
-rw-r--r--drivers/pci/pcie/aer/aerdrv.h2
-rw-r--r--drivers/pci/pcie/aer/aerdrv_core.c176
3 files changed, 122 insertions, 60 deletions
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index f6b1ba7464dc..5408b9b39d89 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -61,6 +61,10 @@ be initiated although firmwares have no _OSC support. To enable the
61workaround, please add aerdriver.forceload=y to kernel boot parameter line 61workaround, please add aerdriver.forceload=y to kernel boot parameter line
62when booting kernel. Note that forceload=n by default. 62when booting kernel. Note that forceload=n by default.
63 63
64nosourceid, another parameter of type bool, can be used when broken
65hardware (mostly chipsets) has root ports that cannot obtain the reporting
66source ID. nosourceid=n by default.
67
642.3 AER error output 682.3 AER error output
65When a PCI-E AER error is captured, an error message will be output to 69When a PCI-E AER error is captured, an error message will be output to
66console. If it's a correctable error, it is output as a warning. 70console. If it's a correctable error, it is output as a warning.
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 3a69ddefe361..dadf492e9ce9 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -58,6 +58,8 @@ struct header_log_regs {
58}; 58};
59 59
60struct aer_err_info { 60struct aer_err_info {
61 struct pci_dev *dev;
62 u16 id;
61 int severity; /* 0:NONFATAL | 1:FATAL | 2:COR */ 63 int severity; /* 0:NONFATAL | 1:FATAL | 2:COR */
62 int flags; 64 int flags;
63 unsigned int status; /* COR/UNCOR Error Status */ 65 unsigned int status; /* COR/UNCOR Error Status */
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index a7a3919904bb..2750e7b266b4 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -26,7 +26,9 @@
26#include "aerdrv.h" 26#include "aerdrv.h"
27 27
28static int forceload; 28static int forceload;
29static int nosourceid;
29module_param(forceload, bool, 0); 30module_param(forceload, bool, 0);
31module_param(nosourceid, bool, 0);
30 32
31int pci_enable_pcie_error_reporting(struct pci_dev *dev) 33int pci_enable_pcie_error_reporting(struct pci_dev *dev)
32{ 34{
@@ -143,34 +145,87 @@ static void set_downstream_devices_error_reporting(struct pci_dev *dev,
143 pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable); 145 pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable);
144} 146}
145 147
146static int find_device_iter(struct device *device, void *data) 148static inline int compare_device_id(struct pci_dev *dev,
149 struct aer_err_info *e_info)
147{ 150{
148 struct pci_dev *dev; 151 if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) {
149 u16 id = *(unsigned long *)data; 152 /*
150 u8 secondary, subordinate, d_bus = id >> 8; 153 * Device ID match
154 */
155 return 1;
156 }
151 157
152 if (device->bus == &pci_bus_type) { 158 return 0;
153 dev = to_pci_dev(device); 159}
154 if (id == ((dev->bus->number << 8) | dev->devfn)) { 160
155 /* 161#define PCI_BUS(x) (((x) >> 8) & 0xff)
156 * Device ID match 162
157 */ 163static int find_device_iter(struct pci_dev *dev, void *data)
158 *(unsigned long*)data = (unsigned long)device; 164{
165 int pos;
166 u32 status;
167 u32 mask;
168 u16 reg16;
169 int result;
170 struct aer_err_info *e_info = (struct aer_err_info *)data;
171
172 /*
173 * When bus id is equal to 0, it might be a bad id
174 * reported by root port.
175 */
176 if (!nosourceid && (PCI_BUS(e_info->id) != 0)) {
177 result = compare_device_id(dev, e_info);
178 if (result)
179 e_info->dev = dev;
180 return result;
181 }
182
183 /*
184 * We get here when the bus id is equal to 0 or
185 * nosourceid==y. Some ports might lose the bus id
186 * part of the error source id, so we check the AER
187 * status registers to find the initial reporter.
188 */
189 if (atomic_read(&dev->enable_cnt) == 0)
190 return 0;
191 pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
192 if (!pos)
193 return 0;
194 /* Check if AER is enabled */
195 pci_read_config_word(dev, pos+PCI_EXP_DEVCTL, &reg16);
196 if (!(reg16 & (
197 PCI_EXP_DEVCTL_CERE |
198 PCI_EXP_DEVCTL_NFERE |
199 PCI_EXP_DEVCTL_FERE |
200 PCI_EXP_DEVCTL_URRE)))
201 return 0;
202 pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
203 if (!pos)
204 return 0;
205
206 status = 0;
207 mask = 0;
208 if (e_info->severity == AER_CORRECTABLE) {
209 pci_read_config_dword(dev,
210 pos + PCI_ERR_COR_STATUS,
211 &status);
212 pci_read_config_dword(dev,
213 pos + PCI_ERR_COR_MASK,
214 &mask);
215 if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) {
216 e_info->dev = dev;
159 return 1; 217 return 1;
160 } 218 }
161 219 } else {
162 /* 220 pci_read_config_dword(dev,
163 * If device is P2P, check if it is an upstream? 221 pos + PCI_ERR_UNCOR_STATUS,
164 */ 222 &status);
165 if (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE) { 223 pci_read_config_dword(dev,
166 pci_read_config_byte(dev, PCI_SECONDARY_BUS, 224 pos + PCI_ERR_UNCOR_MASK,
167 &secondary); 225 &mask);
168 pci_read_config_byte(dev, PCI_SUBORDINATE_BUS, 226 if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) {
169 &subordinate); 227 e_info->dev = dev;
170 if (d_bus >= secondary && d_bus <= subordinate) { 228 return 1;
171 *(unsigned long*)data = (unsigned long)device;
172 return 1;
173 }
174 } 229 }
175 } 230 }
176 231
@@ -180,33 +235,22 @@ static int find_device_iter(struct device *device, void *data)
180/** 235/**
181 * find_source_device - search through device hierarchy for source device 236 * find_source_device - search through device hierarchy for source device
182 * @parent: pointer to Root Port pci_dev data structure 237 * @parent: pointer to Root Port pci_dev data structure
183 * @id: device ID of agent who sends an error message to this Root Port 238 * @e_info: detailed error information, such as the error source ID
184 * 239 *
185 * Invoked when error is detected at the Root Port. 240 * Invoked when error is detected at the Root Port.
186 */ 241 */
187static struct device* find_source_device(struct pci_dev *parent, u16 id) 242static void find_source_device(struct pci_dev *parent,
243 struct aer_err_info *e_info)
188{ 244{
189 struct pci_dev *dev = parent; 245 struct pci_dev *dev = parent;
190 struct device *device; 246 int result;
191 unsigned long device_addr;
192 int status;
193 247
194 /* Is Root Port an agent that sends error message? */ 248 /* Is Root Port an agent that sends error message? */
195 if (id == ((dev->bus->number << 8) | dev->devfn)) 249 result = find_device_iter(dev, e_info);
196 return &dev->dev; 250 if (result)
197 251 return;
198 do {
199 device_addr = id;
200 if ((status = device_for_each_child(&dev->dev,
201 &device_addr, find_device_iter))) {
202 device = (struct device*)device_addr;
203 dev = to_pci_dev(device);
204 if (id == ((dev->bus->number << 8) | dev->devfn))
205 return device;
206 }
207 }while (status);
208 252
209 return NULL; 253 pci_walk_bus(parent->subordinate, find_device_iter, e_info);
210} 254}
211 255
212static int report_error_detected(struct pci_dev *dev, void *data) 256static int report_error_detected(struct pci_dev *dev, void *data)
@@ -501,12 +545,12 @@ static pci_ers_result_t do_recovery(struct pcie_device *aerdev,
501 */ 545 */
502static void handle_error_source(struct pcie_device * aerdev, 546static void handle_error_source(struct pcie_device * aerdev,
503 struct pci_dev *dev, 547 struct pci_dev *dev,
504 struct aer_err_info info) 548 struct aer_err_info *info)
505{ 549{
506 pci_ers_result_t status = 0; 550 pci_ers_result_t status = 0;
507 int pos; 551 int pos;
508 552
509 if (info.severity == AER_CORRECTABLE) { 553 if (info->severity == AER_CORRECTABLE) {
510 /* 554 /*
511 * Correctable error does not need software intervention. 555 * Correctable error does not need software intervention.
512 * No need to go through error recovery process. 556 * No need to go through error recovery process.
@@ -514,9 +558,9 @@ static void handle_error_source(struct pcie_device * aerdev,
514 pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); 558 pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
515 if (pos) 559 if (pos)
516 pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, 560 pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
517 info.status); 561 info->status);
518 } else { 562 } else {
519 status = do_recovery(aerdev, dev, info.severity); 563 status = do_recovery(aerdev, dev, info->severity);
520 if (status == PCI_ERS_RESULT_RECOVERED) { 564 if (status == PCI_ERS_RESULT_RECOVERED) {
521 dev_printk(KERN_DEBUG, &dev->dev, "AER driver " 565 dev_printk(KERN_DEBUG, &dev->dev, "AER driver "
522 "successfully recovered\n"); 566 "successfully recovered\n");
@@ -673,10 +717,16 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
673static void aer_isr_one_error(struct pcie_device *p_device, 717static void aer_isr_one_error(struct pcie_device *p_device,
674 struct aer_err_source *e_src) 718 struct aer_err_source *e_src)
675{ 719{
676 struct device *s_device; 720 struct aer_err_info *e_info;
677 struct aer_err_info e_info = {0, 0, 0,};
678 int i; 721 int i;
679 u16 id; 722
723 /* struct aer_err_info might be big, so we allocate it with slab */
724 e_info = kmalloc(sizeof(struct aer_err_info), GFP_KERNEL);
725 if (e_info == NULL) {
726 dev_printk(KERN_DEBUG, &p_device->port->dev,
727 "Can't allocate mem when processing AER errors\n");
728 return;
729 }
680 730
681 /* 731 /*
682 * There is a possibility that both correctable error and 732 * There is a possibility that both correctable error and
@@ -688,31 +738,37 @@ static void aer_isr_one_error(struct pcie_device *p_device,
688 if (!(e_src->status & i)) 738 if (!(e_src->status & i))
689 continue; 739 continue;
690 740
741 memset(e_info, 0, sizeof(struct aer_err_info));
742
691 /* Init comprehensive error information */ 743 /* Init comprehensive error information */
692 if (i & PCI_ERR_ROOT_COR_RCV) { 744 if (i & PCI_ERR_ROOT_COR_RCV) {
693 id = ERR_COR_ID(e_src->id); 745 e_info->id = ERR_COR_ID(e_src->id);
694 e_info.severity = AER_CORRECTABLE; 746 e_info->severity = AER_CORRECTABLE;
695 } else { 747 } else {
696 id = ERR_UNCOR_ID(e_src->id); 748 e_info->id = ERR_UNCOR_ID(e_src->id);
697 e_info.severity = ((e_src->status >> 6) & 1); 749 e_info->severity = ((e_src->status >> 6) & 1);
698 } 750 }
699 if (e_src->status & 751 if (e_src->status &
700 (PCI_ERR_ROOT_MULTI_COR_RCV | 752 (PCI_ERR_ROOT_MULTI_COR_RCV |
701 PCI_ERR_ROOT_MULTI_UNCOR_RCV)) 753 PCI_ERR_ROOT_MULTI_UNCOR_RCV))
702 e_info.flags |= AER_MULTI_ERROR_VALID_FLAG; 754 e_info->flags |= AER_MULTI_ERROR_VALID_FLAG;
703 if (!(s_device = find_source_device(p_device->port, id))) { 755
756 find_source_device(p_device->port, e_info);
757 if (e_info->dev == NULL) {
704 printk(KERN_DEBUG "%s->can't find device of ID%04x\n", 758 printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
705 __func__, id); 759 __func__, e_info->id);
706 continue; 760 continue;
707 } 761 }
708 if (get_device_error_info(to_pci_dev(s_device), &e_info) == 762 if (get_device_error_info(e_info->dev, e_info) ==
709 AER_SUCCESS) { 763 AER_SUCCESS) {
710 aer_print_error(to_pci_dev(s_device), &e_info); 764 aer_print_error(e_info->dev, e_info);
711 handle_error_source(p_device, 765 handle_error_source(p_device,
712 to_pci_dev(s_device), 766 e_info->dev,
713 e_info); 767 e_info);
714 } 768 }
715 } 769 }
770
771 kfree(e_info);
716} 772}
717 773
718/** 774/**