diff options
author | Zhang, Yanmin <yanmin_zhang@linux.intel.com> | 2009-06-16 01:35:11 -0400 |
---|---|---|
committer | Jesse Barnes <jbarnes@virtuousgeek.org> | 2009-06-16 17:30:13 -0400 |
commit | 28eb27cf0839a30948335f9b2edda739f48b7a2e (patch) | |
tree | 3278825cdfc730c4b44fcf18a41a2d96180030a3 | |
parent | 70298c6e6c1ba68346336b4ea54bd5c0abbf73c8 (diff) |
PCI AER: support invalid error source IDs
When the bus ID part of the error source ID is equal to 0, or when
nosourceid=1 is set, make the kernel probe the AER status registers of all
devices under the root port to find the initial error reporter.
Reviewed-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
-rw-r--r-- | Documentation/PCI/pcieaer-howto.txt | 4 | ||||
-rw-r--r-- | drivers/pci/pcie/aer/aerdrv.h | 2 | ||||
-rw-r--r-- | drivers/pci/pcie/aer/aerdrv_core.c | 176 |
3 files changed, 122 insertions, 60 deletions
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt index f6b1ba7464dc..5408b9b39d89 100644 --- a/Documentation/PCI/pcieaer-howto.txt +++ b/Documentation/PCI/pcieaer-howto.txt | |||
@@ -61,6 +61,10 @@ be initiated although firmwares have no _OSC support. To enable the | |||
61 | walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line | 61 | walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line |
62 | when booting kernel. Note that forceload=n by default. | 62 | when booting kernel. Note that forceload=n by default. |
63 | 63 | ||
64 | nosourceid, another parameter of type bool, can be used when broken | ||
65 | hardware (mostly chipsets) has root ports that cannot obtain the reporting | ||
66 | source ID. nosourceid=n by default. | ||
67 | |||
64 | 2.3 AER error output | 68 | 2.3 AER error output |
65 | When a PCI-E AER error is captured, an error message will be outputed to | 69 | When a PCI-E AER error is captured, an error message will be outputed to |
66 | console. If it's a correctable error, it is outputed as a warning. | 70 | console. If it's a correctable error, it is outputed as a warning. |
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h index 3a69ddefe361..dadf492e9ce9 100644 --- a/drivers/pci/pcie/aer/aerdrv.h +++ b/drivers/pci/pcie/aer/aerdrv.h | |||
@@ -58,6 +58,8 @@ struct header_log_regs { | |||
58 | }; | 58 | }; |
59 | 59 | ||
60 | struct aer_err_info { | 60 | struct aer_err_info { |
61 | struct pci_dev *dev; | ||
62 | u16 id; | ||
61 | int severity; /* 0:NONFATAL | 1:FATAL | 2:COR */ | 63 | int severity; /* 0:NONFATAL | 1:FATAL | 2:COR */ |
62 | int flags; | 64 | int flags; |
63 | unsigned int status; /* COR/UNCOR Error Status */ | 65 | unsigned int status; /* COR/UNCOR Error Status */ |
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index a7a3919904bb..2750e7b266b4 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c | |||
@@ -26,7 +26,9 @@ | |||
26 | #include "aerdrv.h" | 26 | #include "aerdrv.h" |
27 | 27 | ||
28 | static int forceload; | 28 | static int forceload; |
29 | static int nosourceid; | ||
29 | module_param(forceload, bool, 0); | 30 | module_param(forceload, bool, 0); |
31 | module_param(nosourceid, bool, 0); | ||
30 | 32 | ||
31 | int pci_enable_pcie_error_reporting(struct pci_dev *dev) | 33 | int pci_enable_pcie_error_reporting(struct pci_dev *dev) |
32 | { | 34 | { |
@@ -143,34 +145,87 @@ static void set_downstream_devices_error_reporting(struct pci_dev *dev, | |||
143 | pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable); | 145 | pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable); |
144 | } | 146 | } |
145 | 147 | ||
146 | static int find_device_iter(struct device *device, void *data) | 148 | static inline int compare_device_id(struct pci_dev *dev, |
149 | struct aer_err_info *e_info) | ||
147 | { | 150 | { |
148 | struct pci_dev *dev; | 151 | if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) { |
149 | u16 id = *(unsigned long *)data; | 152 | /* |
150 | u8 secondary, subordinate, d_bus = id >> 8; | 153 | * Device ID match |
154 | */ | ||
155 | return 1; | ||
156 | } | ||
151 | 157 | ||
152 | if (device->bus == &pci_bus_type) { | 158 | return 0; |
153 | dev = to_pci_dev(device); | 159 | } |
154 | if (id == ((dev->bus->number << 8) | dev->devfn)) { | 160 | |
155 | /* | 161 | #define PCI_BUS(x) (((x) >> 8) & 0xff) |
156 | * Device ID match | 162 | |
157 | */ | 163 | static int find_device_iter(struct pci_dev *dev, void *data) |
158 | *(unsigned long*)data = (unsigned long)device; | 164 | { |
165 | int pos; | ||
166 | u32 status; | ||
167 | u32 mask; | ||
168 | u16 reg16; | ||
169 | int result; | ||
170 | struct aer_err_info *e_info = (struct aer_err_info *)data; | ||
171 | |||
172 | /* | ||
173 | * When bus id is equal to 0, it might be a bad id | ||
174 | * reported by root port. | ||
175 | */ | ||
176 | if (!nosourceid && (PCI_BUS(e_info->id) != 0)) { | ||
177 | result = compare_device_id(dev, e_info); | ||
178 | if (result) | ||
179 | e_info->dev = dev; | ||
180 | return result; | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * Next is to check when bus id is equal to 0 or | ||
185 | * nosourceid==y. Some ports might lose the bus | ||
186 | * id of error source id. We check AER status | ||
187 | * registers to find the initial reporter. | ||
188 | */ | ||
189 | if (atomic_read(&dev->enable_cnt) == 0) | ||
190 | return 0; | ||
191 | pos = pci_find_capability(dev, PCI_CAP_ID_EXP); | ||
192 | if (!pos) | ||
193 | return 0; | ||
194 | /* Check if AER is enabled */ | ||
195 | pci_read_config_word(dev, pos+PCI_EXP_DEVCTL, &reg16);
196 | if (!(reg16 & ( | ||
197 | PCI_EXP_DEVCTL_CERE | | ||
198 | PCI_EXP_DEVCTL_NFERE | | ||
199 | PCI_EXP_DEVCTL_FERE | | ||
200 | PCI_EXP_DEVCTL_URRE))) | ||
201 | return 0; | ||
202 | pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); | ||
203 | if (!pos) | ||
204 | return 0; | ||
205 | |||
206 | status = 0; | ||
207 | mask = 0; | ||
208 | if (e_info->severity == AER_CORRECTABLE) { | ||
209 | pci_read_config_dword(dev, | ||
210 | pos + PCI_ERR_COR_STATUS, | ||
211 | &status); | ||
212 | pci_read_config_dword(dev, | ||
213 | pos + PCI_ERR_COR_MASK, | ||
214 | &mask); | ||
215 | if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) { | ||
216 | e_info->dev = dev; | ||
159 | return 1; | 217 | return 1; |
160 | } | 218 | } |
161 | 219 | } else { | |
162 | /* | 220 | pci_read_config_dword(dev, |
163 | * If device is P2P, check if it is an upstream? | 221 | pos + PCI_ERR_UNCOR_STATUS, |
164 | */ | 222 | &status); |
165 | if (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE) { | 223 | pci_read_config_dword(dev, |
166 | pci_read_config_byte(dev, PCI_SECONDARY_BUS, | 224 | pos + PCI_ERR_UNCOR_MASK, |
167 | &secondary); | 225 | &mask); |
168 | pci_read_config_byte(dev, PCI_SUBORDINATE_BUS, | 226 | if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) { |
169 | &subordinate); | 227 | e_info->dev = dev; |
170 | if (d_bus >= secondary && d_bus <= subordinate) { | 228 | return 1; |
171 | *(unsigned long*)data = (unsigned long)device; | ||
172 | return 1; | ||
173 | } | ||
174 | } | 229 | } |
175 | } | 230 | } |
176 | 231 | ||
@@ -180,33 +235,22 @@ static int find_device_iter(struct device *device, void *data) | |||
180 | /** | 235 | /** |
181 | * find_source_device - search through device hierarchy for source device | 236 | * find_source_device - search through device hierarchy for source device |
182 | * @parent: pointer to Root Port pci_dev data structure | 237 | * @parent: pointer to Root Port pci_dev data structure |
183 | * @id: device ID of agent who sends an error message to this Root Port | 238 | * @err_info: including detailed error information such like id |
184 | * | 239 | * |
185 | * Invoked when error is detected at the Root Port. | 240 | * Invoked when error is detected at the Root Port. |
186 | */ | 241 | */ |
187 | static struct device* find_source_device(struct pci_dev *parent, u16 id) | 242 | static void find_source_device(struct pci_dev *parent, |
243 | struct aer_err_info *e_info) | ||
188 | { | 244 | { |
189 | struct pci_dev *dev = parent; | 245 | struct pci_dev *dev = parent; |
190 | struct device *device; | 246 | int result; |
191 | unsigned long device_addr; | ||
192 | int status; | ||
193 | 247 | ||
194 | /* Is Root Port an agent that sends error message? */ | 248 | /* Is Root Port an agent that sends error message? */ |
195 | if (id == ((dev->bus->number << 8) | dev->devfn)) | 249 | result = find_device_iter(dev, e_info); |
196 | return &dev->dev; | 250 | if (result) |
197 | 251 | return; | |
198 | do { | ||
199 | device_addr = id; | ||
200 | if ((status = device_for_each_child(&dev->dev, | ||
201 | &device_addr, find_device_iter))) { | ||
202 | device = (struct device*)device_addr; | ||
203 | dev = to_pci_dev(device); | ||
204 | if (id == ((dev->bus->number << 8) | dev->devfn)) | ||
205 | return device; | ||
206 | } | ||
207 | }while (status); | ||
208 | 252 | ||
209 | return NULL; | 253 | pci_walk_bus(parent->subordinate, find_device_iter, e_info); |
210 | } | 254 | } |
211 | 255 | ||
212 | static int report_error_detected(struct pci_dev *dev, void *data) | 256 | static int report_error_detected(struct pci_dev *dev, void *data) |
@@ -501,12 +545,12 @@ static pci_ers_result_t do_recovery(struct pcie_device *aerdev, | |||
501 | */ | 545 | */ |
502 | static void handle_error_source(struct pcie_device * aerdev, | 546 | static void handle_error_source(struct pcie_device * aerdev, |
503 | struct pci_dev *dev, | 547 | struct pci_dev *dev, |
504 | struct aer_err_info info) | 548 | struct aer_err_info *info) |
505 | { | 549 | { |
506 | pci_ers_result_t status = 0; | 550 | pci_ers_result_t status = 0; |
507 | int pos; | 551 | int pos; |
508 | 552 | ||
509 | if (info.severity == AER_CORRECTABLE) { | 553 | if (info->severity == AER_CORRECTABLE) { |
510 | /* | 554 | /* |
511 | * Correctable error does not need software intevention. | 555 | * Correctable error does not need software intevention. |
512 | * No need to go through error recovery process. | 556 | * No need to go through error recovery process. |
@@ -514,9 +558,9 @@ static void handle_error_source(struct pcie_device * aerdev, | |||
514 | pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); | 558 | pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); |
515 | if (pos) | 559 | if (pos) |
516 | pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, | 560 | pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, |
517 | info.status); | 561 | info->status); |
518 | } else { | 562 | } else { |
519 | status = do_recovery(aerdev, dev, info.severity); | 563 | status = do_recovery(aerdev, dev, info->severity); |
520 | if (status == PCI_ERS_RESULT_RECOVERED) { | 564 | if (status == PCI_ERS_RESULT_RECOVERED) { |
521 | dev_printk(KERN_DEBUG, &dev->dev, "AER driver " | 565 | dev_printk(KERN_DEBUG, &dev->dev, "AER driver " |
522 | "successfully recovered\n"); | 566 | "successfully recovered\n"); |
@@ -673,10 +717,16 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) | |||
673 | static void aer_isr_one_error(struct pcie_device *p_device, | 717 | static void aer_isr_one_error(struct pcie_device *p_device, |
674 | struct aer_err_source *e_src) | 718 | struct aer_err_source *e_src) |
675 | { | 719 | { |
676 | struct device *s_device; | 720 | struct aer_err_info *e_info; |
677 | struct aer_err_info e_info = {0, 0, 0,}; | ||
678 | int i; | 721 | int i; |
679 | u16 id; | 722 | |
723 | /* struct aer_err_info might be big, so we allocate it with slab */ | ||
724 | e_info = kmalloc(sizeof(struct aer_err_info), GFP_KERNEL); | ||
725 | if (e_info == NULL) { | ||
726 | dev_printk(KERN_DEBUG, &p_device->port->dev, | ||
727 | "Can't allocate mem when processing AER errors\n"); | ||
728 | return; | ||
729 | } | ||
680 | 730 | ||
681 | /* | 731 | /* |
682 | * There is a possibility that both correctable error and | 732 | * There is a possibility that both correctable error and |
@@ -688,31 +738,37 @@ static void aer_isr_one_error(struct pcie_device *p_device, | |||
688 | if (!(e_src->status & i)) | 738 | if (!(e_src->status & i)) |
689 | continue; | 739 | continue; |
690 | 740 | ||
741 | memset(e_info, 0, sizeof(struct aer_err_info)); | ||
742 | |||
691 | /* Init comprehensive error information */ | 743 | /* Init comprehensive error information */ |
692 | if (i & PCI_ERR_ROOT_COR_RCV) { | 744 | if (i & PCI_ERR_ROOT_COR_RCV) { |
693 | id = ERR_COR_ID(e_src->id); | 745 | e_info->id = ERR_COR_ID(e_src->id); |
694 | e_info.severity = AER_CORRECTABLE; | 746 | e_info->severity = AER_CORRECTABLE; |
695 | } else { | 747 | } else { |
696 | id = ERR_UNCOR_ID(e_src->id); | 748 | e_info->id = ERR_UNCOR_ID(e_src->id); |
697 | e_info.severity = ((e_src->status >> 6) & 1); | 749 | e_info->severity = ((e_src->status >> 6) & 1); |
698 | } | 750 | } |
699 | if (e_src->status & | 751 | if (e_src->status & |
700 | (PCI_ERR_ROOT_MULTI_COR_RCV | | 752 | (PCI_ERR_ROOT_MULTI_COR_RCV | |
701 | PCI_ERR_ROOT_MULTI_UNCOR_RCV)) | 753 | PCI_ERR_ROOT_MULTI_UNCOR_RCV)) |
702 | e_info.flags |= AER_MULTI_ERROR_VALID_FLAG; | 754 | e_info->flags |= AER_MULTI_ERROR_VALID_FLAG; |
703 | if (!(s_device = find_source_device(p_device->port, id))) { | 755 | |
756 | find_source_device(p_device->port, e_info); | ||
757 | if (e_info->dev == NULL) { | ||
704 | printk(KERN_DEBUG "%s->can't find device of ID%04x\n", | 758 | printk(KERN_DEBUG "%s->can't find device of ID%04x\n", |
705 | __func__, id); | 759 | __func__, e_info->id); |
706 | continue; | 760 | continue; |
707 | } | 761 | } |
708 | if (get_device_error_info(to_pci_dev(s_device), &e_info) == | 762 | if (get_device_error_info(e_info->dev, e_info) == |
709 | AER_SUCCESS) { | 763 | AER_SUCCESS) { |
710 | aer_print_error(to_pci_dev(s_device), &e_info); | 764 | aer_print_error(e_info->dev, e_info); |
711 | handle_error_source(p_device, | 765 | handle_error_source(p_device, |
712 | to_pci_dev(s_device), | 766 | e_info->dev, |
713 | e_info); | 767 | e_info); |
714 | } | 768 | } |
715 | } | 769 | } |
770 | |||
771 | kfree(e_info); | ||
716 | } | 772 | } |
717 | 773 | ||
718 | /** | 774 | /** |