aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBjorn Helgaas <bhelgaas@google.com>2018-08-15 15:58:45 -0400
committerBjorn Helgaas <bhelgaas@google.com>2018-08-15 15:58:45 -0400
commit3c3ab37f4c03dc9c7c917ff3c1e71d6da81d3bd3 (patch)
treee5b3368d2615f937f928656387de24e4a3b86b1c
parentaf863d18a1fbedd164366d0f3d4946d9cc3edc46 (diff)
parent45687f96c112adda2f1d1f05b977661eb00d5a1c (diff)
Merge branch 'pci/aer'
- Decode AER errors with names similar to "lspci" (Tyler Baicar) - Expose AER statistics in sysfs (Rajat Jain) - Clear AER status bits selectively based on the type of recovery (Oza Pawandeep) - Honor "pcie_ports=native" even if HEST sets FIRMWARE_FIRST (Alexandru Gagniuc) - Don't clear AER status bits if we're using the "Firmware-First" strategy where firmware owns the registers (Alexandru Gagniuc) * pci/aer: PCI/AER: Don't clear AER bits if error handling is Firmware-First PCI/AER: Remove duplicate PCI_EXP_AER_FLAGS definition PCI/portdrv: Remove pcie_portdrv_err_handler.slot_reset PCI/AER: Clear device status bits during ERR_COR handling PCI/AER: Clear device status bits during ERR_FATAL and ERR_NONFATAL PCI/AER: Remove ERR_FATAL code from ERR_NONFATAL path PCI/AER: Factor out ERR_NONFATAL status bit clearing PCI/AER: Clear only ERR_NONFATAL bits during non-fatal recovery PCI/AER: Clear only ERR_FATAL status bits during fatal recovery PCI/AER: Honor "pcie_ports=native" even if HEST sets FIRMWARE_FIRST PCI/AER: Add sysfs attributes for rootport cumulative stats PCI/AER: Add sysfs attributes to provide AER stats and breakdown PCI/AER: Define aer_stats structure for AER capable devices PCI/AER: Move internal declarations to drivers/pci/pci.h PCI/AER: Adopt lspci names for AER error decoding PCI/AER: Expose internal API for obtaining AER information # Conflicts: # drivers/pci/pci.h
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats122
-rw-r--r--Documentation/PCI/pcieaer-howto.txt5
-rw-r--r--drivers/pci/pci-sysfs.c3
-rw-r--r--drivers/pci/pci.h43
-rw-r--r--drivers/pci/pcie/aer.c336
-rw-r--r--drivers/pci/pcie/err.c15
-rw-r--r--drivers/pci/pcie/portdrv_pci.c25
-rw-r--r--drivers/pci/probe.c1
-rw-r--r--include/linux/pci.h5
9 files changed, 449 insertions, 106 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
new file mode 100644
index 000000000000..4b0318c99507
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
@@ -0,0 +1,122 @@
1==========================
2PCIe Device AER statistics
3==========================
4These attributes show up under all the devices that are AER capable. These
5statistical counters indicate the errors "as seen/reported by the device".
6Note that this may mean that if an endpoint is causing problems, the AER
7counters may increment at its link partner (e.g. root port) because the
8errors may be "seen" / reported by the link partner and not the
9problematic endpoint itself (which may report all counters as 0 as it never
10saw any problems).
11
12Where: /sys/bus/pci/devices/<dev>/aer_dev_correctable
13Date: July 2018
14Kernel Version: 4.19.0
15Contact: linux-pci@vger.kernel.org, rajatja@google.com
16Description: List of correctable errors seen and reported by this
17 PCI device using ERR_COR. Note that since multiple errors may
18 be reported using a single ERR_COR message, the
19 TOTAL_ERR_COR at the end of the file may not match the actual
20 total of all the errors in the file. Sample output:
21-------------------------------------------------------------------------
22localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_correctable
23RxErr 2
24BadTLP 0
25BadDLLP 0
26Rollover 0
27Timeout 0
28NonFatalErr 0
29CorrIntErr 0
30HeaderOF 0
31TOTAL_ERR_COR 2
32-------------------------------------------------------------------------
33
34Where: /sys/bus/pci/devices/<dev>/aer_dev_fatal
35Date: July 2018
36Kernel Version: 4.19.0
37Contact: linux-pci@vger.kernel.org, rajatja@google.com
38Description: List of uncorrectable fatal errors seen and reported by this
39 PCI device using ERR_FATAL. Note that since multiple errors may
40 be reported using a single ERR_FATAL message, the
41 TOTAL_ERR_FATAL at the end of the file may not match the actual
42 total of all the errors in the file. Sample output:
43-------------------------------------------------------------------------
44localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_fatal
45Undefined 0
46DLP 0
47SDES 0
48TLP 0
49FCP 0
50CmpltTO 0
51CmpltAbrt 0
52UnxCmplt 0
53RxOF 0
54MalfTLP 0
55ECRC 0
56UnsupReq 0
57ACSViol 0
58UncorrIntErr 0
59BlockedTLP 0
60AtomicOpBlocked 0
61TLPBlockedErr 0
62TOTAL_ERR_FATAL 0
63-------------------------------------------------------------------------
64
65Where: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal
66Date: July 2018
67Kernel Version: 4.19.0
68Contact: linux-pci@vger.kernel.org, rajatja@google.com
69Description: List of uncorrectable nonfatal errors seen and reported by this
70 PCI device using ERR_NONFATAL. Note that since multiple errors
71 may be reported using a single ERR_NONFATAL message, the
72 TOTAL_ERR_NONFATAL at the end of the file may not match the
73 actual total of all the errors in the file. Sample output:
74-------------------------------------------------------------------------
75localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_nonfatal
76Undefined 0
77DLP 0
78SDES 0
79TLP 0
80FCP 0
81CmpltTO 0
82CmpltAbrt 0
83UnxCmplt 0
84RxOF 0
85MalfTLP 0
86ECRC 0
87UnsupReq 0
88ACSViol 0
89UncorrIntErr 0
90BlockedTLP 0
91AtomicOpBlocked 0
92TLPBlockedErr 0
93TOTAL_ERR_NONFATAL 0
94-------------------------------------------------------------------------
95
96============================
97PCIe Rootport AER statistics
98============================
99These attributes show up under only the rootports (or root complex event
100collectors) that are AER capable. These indicate the number of error messages as
101"reported to" the rootport. Please note that the rootports also transmit
102(internally) the ERR_* messages for errors seen by the internal rootport PCI
103device, so these counters include them and are thus cumulative of all the error
104messages on the PCI hierarchy originating at that root port.
105
106Where: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_cor
107Date: July 2018
108Kernel Version: 4.19.0
109Contact: linux-pci@vger.kernel.org, rajatja@google.com
110Description: Total number of ERR_COR messages reported to rootport.
111
112Where: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_fatal
113Date: July 2018
114Kernel Version: 4.19.0
115Contact: linux-pci@vger.kernel.org, rajatja@google.com
116Description: Total number of ERR_FATAL messages reported to rootport.
117
118Where: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_nonfatal
119Date: July 2018
120Kernel Version: 4.19.0
121Contact: linux-pci@vger.kernel.org, rajatja@google.com
122Description: Total number of ERR_NONFATAL messages reported to rootport.
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index acd0dddd6bb8..48ce7903e3c6 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -73,6 +73,11 @@ In the example, 'Requester ID' means the ID of the device who sends
73the error message to root port. Pls. refer to pci express specs for 73the error message to root port. Pls. refer to pci express specs for
74other fields. 74other fields.
75 75
762.4 AER Statistics / Counters
77
78When PCIe AER errors are captured, the counters / statistics are also exposed
79in the form of sysfs attributes which are documented at
80Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
76 81
773. Developer Guide 823. Developer Guide
78 83
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 0c4653c1d2ce..9f1cb9051d7d 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1746,6 +1746,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = {
1746#endif 1746#endif
1747 &pci_bridge_attr_group, 1747 &pci_bridge_attr_group,
1748 &pcie_dev_attr_group, 1748 &pcie_dev_attr_group,
1749#ifdef CONFIG_PCIEAER
1750 &aer_stats_attr_group,
1751#endif
1749 NULL, 1752 NULL,
1750}; 1753};
1751 1754
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 08817253c8a2..3ac0d99afe67 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -311,6 +311,34 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev)
311 return test_bit(PCI_DEV_ADDED, &dev->priv_flags); 311 return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
312} 312}
313 313
314#ifdef CONFIG_PCIEAER
315#include <linux/aer.h>
316
317#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
318
319struct aer_err_info {
320 struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
321 int error_dev_num;
322
323 unsigned int id:16;
324
325 unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
326 unsigned int __pad1:5;
327 unsigned int multi_error_valid:1;
328
329 unsigned int first_error:5;
330 unsigned int __pad2:2;
331 unsigned int tlp_header_valid:1;
332
333 unsigned int status; /* COR/UNCOR Error Status */
334 unsigned int mask; /* COR/UNCOR Error Mask */
335 struct aer_header_log_regs tlp; /* TLP Header */
336};
337
338int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
339void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
340#endif /* CONFIG_PCIEAER */
341
314#ifdef CONFIG_PCI_ATS 342#ifdef CONFIG_PCI_ATS
315void pci_restore_ats_state(struct pci_dev *dev); 343void pci_restore_ats_state(struct pci_dev *dev);
316#else 344#else
@@ -467,4 +495,19 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev,
467} 495}
468#endif 496#endif
469 497
498#ifdef CONFIG_PCIEAER
499void pci_no_aer(void);
500void pci_aer_init(struct pci_dev *dev);
501void pci_aer_exit(struct pci_dev *dev);
502extern const struct attribute_group aer_stats_attr_group;
503void pci_aer_clear_fatal_status(struct pci_dev *dev);
504void pci_aer_clear_device_status(struct pci_dev *dev);
505#else
506static inline void pci_no_aer(void) { }
507static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
508static inline void pci_aer_exit(struct pci_dev *d) { }
509static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
510static inline void pci_aer_clear_device_status(struct pci_dev *dev) { }
511#endif
512
470#endif /* DRIVERS_PCI_H */ 513#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a2e88386af28..4e823ae051a7 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -31,26 +31,9 @@
31#include "portdrv.h" 31#include "portdrv.h"
32 32
33#define AER_ERROR_SOURCES_MAX 100 33#define AER_ERROR_SOURCES_MAX 100
34#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
35 34
36struct aer_err_info { 35#define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */
37 struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; 36#define AER_MAX_TYPEOF_UNCOR_ERRS 26 /* as per PCI_ERR_UNCOR_STATUS*/
38 int error_dev_num;
39
40 unsigned int id:16;
41
42 unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
43 unsigned int __pad1:5;
44 unsigned int multi_error_valid:1;
45
46 unsigned int first_error:5;
47 unsigned int __pad2:2;
48 unsigned int tlp_header_valid:1;
49
50 unsigned int status; /* COR/UNCOR Error Status */
51 unsigned int mask; /* COR/UNCOR Error Mask */
52 struct aer_header_log_regs tlp; /* TLP Header */
53};
54 37
55struct aer_err_source { 38struct aer_err_source {
56 unsigned int status; 39 unsigned int status;
@@ -76,6 +59,42 @@ struct aer_rpc {
76 */ 59 */
77}; 60};
78 61
62/* AER stats for the device */
63struct aer_stats {
64
65 /*
66 * Fields for all AER capable devices. They indicate the errors
67 * "as seen by this device". Note that this may mean that if an
68 * end point is causing problems, the AER counters may increment
69 * at its link partner (e.g. root port) because the errors will be
70 * "seen" by the link partner and not the problematic end point
71 * itself (which may report all counters as 0 as it never saw any
72 * problems).
73 */
74 /* Counters for different type of correctable errors */
75 u64 dev_cor_errs[AER_MAX_TYPEOF_COR_ERRS];
76 /* Counters for different type of fatal uncorrectable errors */
77 u64 dev_fatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
78 /* Counters for different type of nonfatal uncorrectable errors */
79 u64 dev_nonfatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
80 /* Total number of ERR_COR sent by this device */
81 u64 dev_total_cor_errs;
82 /* Total number of ERR_FATAL sent by this device */
83 u64 dev_total_fatal_errs;
84 /* Total number of ERR_NONFATAL sent by this device */
85 u64 dev_total_nonfatal_errs;
86
87 /*
88 * Fields for Root ports & root complex event collectors only, these
89 * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL
90 * messages received by the root port / event collector, INCLUDING the
91 * ones that are generated internally (by the rootport itself)
92 */
93 u64 rootport_total_cor_errs;
94 u64 rootport_total_fatal_errs;
95 u64 rootport_total_nonfatal_errs;
96};
97
79#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ 98#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
80 PCI_ERR_UNC_ECRC| \ 99 PCI_ERR_UNC_ECRC| \
81 PCI_ERR_UNC_UNSUP| \ 100 PCI_ERR_UNC_UNSUP| \
@@ -303,12 +322,13 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev)
303 if (!pci_is_pcie(dev)) 322 if (!pci_is_pcie(dev))
304 return 0; 323 return 0;
305 324
325 if (pcie_ports_native)
326 return 0;
327
306 if (!dev->__aer_firmware_first_valid) 328 if (!dev->__aer_firmware_first_valid)
307 aer_set_firmware_first(dev); 329 aer_set_firmware_first(dev);
308 return dev->__aer_firmware_first; 330 return dev->__aer_firmware_first;
309} 331}
310#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \
311 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
312 332
313static bool aer_firmware_first; 333static bool aer_firmware_first;
314 334
@@ -323,6 +343,9 @@ bool aer_acpi_firmware_first(void)
323 .firmware_first = 0, 343 .firmware_first = 0,
324 }; 344 };
325 345
346 if (pcie_ports_native)
347 return false;
348
326 if (!parsed) { 349 if (!parsed) {
327 apei_hest_parse(aer_hest_parse, &info); 350 apei_hest_parse(aer_hest_parse, &info);
328 aer_firmware_first = info.firmware_first; 351 aer_firmware_first = info.firmware_first;
@@ -357,16 +380,30 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
357} 380}
358EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting); 381EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
359 382
383void pci_aer_clear_device_status(struct pci_dev *dev)
384{
385 u16 sta;
386
387 pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
388 pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
389}
390
360int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) 391int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
361{ 392{
362 int pos; 393 int pos;
363 u32 status; 394 u32 status, sev;
364 395
365 pos = dev->aer_cap; 396 pos = dev->aer_cap;
366 if (!pos) 397 if (!pos)
367 return -EIO; 398 return -EIO;
368 399
400 if (pcie_aer_get_firmware_first(dev))
401 return -EIO;
402
403 /* Clear status bits for ERR_NONFATAL errors only */
369 pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); 404 pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
405 pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev);
406 status &= ~sev;
370 if (status) 407 if (status)
371 pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); 408 pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
372 409
@@ -374,6 +411,26 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
374} 411}
375EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status); 412EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status);
376 413
414void pci_aer_clear_fatal_status(struct pci_dev *dev)
415{
416 int pos;
417 u32 status, sev;
418
419 pos = dev->aer_cap;
420 if (!pos)
421 return;
422
423 if (pcie_aer_get_firmware_first(dev))
424 return;
425
426 /* Clear status bits for ERR_FATAL errors only */
427 pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
428 pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev);
429 status &= sev;
430 if (status)
431 pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
432}
433
377int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) 434int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
378{ 435{
379 int pos; 436 int pos;
@@ -387,6 +444,9 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
387 if (!pos) 444 if (!pos)
388 return -EIO; 445 return -EIO;
389 446
447 if (pcie_aer_get_firmware_first(dev))
448 return -EIO;
449
390 port_type = pci_pcie_type(dev); 450 port_type = pci_pcie_type(dev);
391 if (port_type == PCI_EXP_TYPE_ROOT_PORT) { 451 if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
392 pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status); 452 pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
@@ -402,10 +462,20 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
402 return 0; 462 return 0;
403} 463}
404 464
405int pci_aer_init(struct pci_dev *dev) 465void pci_aer_init(struct pci_dev *dev)
406{ 466{
407 dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); 467 dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
408 return pci_cleanup_aer_error_status_regs(dev); 468
469 if (dev->aer_cap)
470 dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
471
472 pci_cleanup_aer_error_status_regs(dev);
473}
474
475void pci_aer_exit(struct pci_dev *dev)
476{
477 kfree(dev->aer_stats);
478 dev->aer_stats = NULL;
409} 479}
410 480
411#define AER_AGENT_RECEIVER 0 481#define AER_AGENT_RECEIVER 0
@@ -458,52 +528,52 @@ static const char *aer_error_layer[] = {
458 "Transaction Layer" 528 "Transaction Layer"
459}; 529};
460 530
461static const char *aer_correctable_error_string[] = { 531static const char *aer_correctable_error_string[AER_MAX_TYPEOF_COR_ERRS] = {
462 "Receiver Error", /* Bit Position 0 */ 532 "RxErr", /* Bit Position 0 */
463 NULL, 533 NULL,
464 NULL, 534 NULL,
465 NULL, 535 NULL,
466 NULL, 536 NULL,
467 NULL, 537 NULL,
468 "Bad TLP", /* Bit Position 6 */ 538 "BadTLP", /* Bit Position 6 */
469 "Bad DLLP", /* Bit Position 7 */ 539 "BadDLLP", /* Bit Position 7 */
470 "RELAY_NUM Rollover", /* Bit Position 8 */ 540 "Rollover", /* Bit Position 8 */
471 NULL, 541 NULL,
472 NULL, 542 NULL,
473 NULL, 543 NULL,
474 "Replay Timer Timeout", /* Bit Position 12 */ 544 "Timeout", /* Bit Position 12 */
475 "Advisory Non-Fatal", /* Bit Position 13 */ 545 "NonFatalErr", /* Bit Position 13 */
476 "Corrected Internal Error", /* Bit Position 14 */ 546 "CorrIntErr", /* Bit Position 14 */
477 "Header Log Overflow", /* Bit Position 15 */ 547 "HeaderOF", /* Bit Position 15 */
478}; 548};
479 549
480static const char *aer_uncorrectable_error_string[] = { 550static const char *aer_uncorrectable_error_string[AER_MAX_TYPEOF_UNCOR_ERRS] = {
481 "Undefined", /* Bit Position 0 */ 551 "Undefined", /* Bit Position 0 */
482 NULL, 552 NULL,
483 NULL, 553 NULL,
484 NULL, 554 NULL,
485 "Data Link Protocol", /* Bit Position 4 */ 555 "DLP", /* Bit Position 4 */
486 "Surprise Down Error", /* Bit Position 5 */ 556 "SDES", /* Bit Position 5 */
487 NULL, 557 NULL,
488 NULL, 558 NULL,
489 NULL, 559 NULL,
490 NULL, 560 NULL,
491 NULL, 561 NULL,
492 NULL, 562 NULL,
493 "Poisoned TLP", /* Bit Position 12 */ 563 "TLP", /* Bit Position 12 */
494 "Flow Control Protocol", /* Bit Position 13 */ 564 "FCP", /* Bit Position 13 */
495 "Completion Timeout", /* Bit Position 14 */ 565 "CmpltTO", /* Bit Position 14 */
496 "Completer Abort", /* Bit Position 15 */ 566 "CmpltAbrt", /* Bit Position 15 */
497 "Unexpected Completion", /* Bit Position 16 */ 567 "UnxCmplt", /* Bit Position 16 */
498 "Receiver Overflow", /* Bit Position 17 */ 568 "RxOF", /* Bit Position 17 */
499 "Malformed TLP", /* Bit Position 18 */ 569 "MalfTLP", /* Bit Position 18 */
500 "ECRC", /* Bit Position 19 */ 570 "ECRC", /* Bit Position 19 */
501 "Unsupported Request", /* Bit Position 20 */ 571 "UnsupReq", /* Bit Position 20 */
502 "ACS Violation", /* Bit Position 21 */ 572 "ACSViol", /* Bit Position 21 */
503 "Uncorrectable Internal Error", /* Bit Position 22 */ 573 "UncorrIntErr", /* Bit Position 22 */
504 "MC Blocked TLP", /* Bit Position 23 */ 574 "BlockedTLP", /* Bit Position 23 */
505 "AtomicOp Egress Blocked", /* Bit Position 24 */ 575 "AtomicOpBlocked", /* Bit Position 24 */
506 "TLP Prefix Blocked Error", /* Bit Position 25 */ 576 "TLPBlockedErr", /* Bit Position 25 */
507}; 577};
508 578
509static const char *aer_agent_string[] = { 579static const char *aer_agent_string[] = {
@@ -513,6 +583,144 @@ static const char *aer_agent_string[] = {
513 "Transmitter ID" 583 "Transmitter ID"
514}; 584};
515 585
586#define aer_stats_dev_attr(name, stats_array, strings_array, \
587 total_string, total_field) \
588 static ssize_t \
589 name##_show(struct device *dev, struct device_attribute *attr, \
590 char *buf) \
591{ \
592 unsigned int i; \
593 char *str = buf; \
594 struct pci_dev *pdev = to_pci_dev(dev); \
595 u64 *stats = pdev->aer_stats->stats_array; \
596 \
597 for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \
598 if (strings_array[i]) \
599 str += sprintf(str, "%s %llu\n", \
600 strings_array[i], stats[i]); \
601 else if (stats[i]) \
602 str += sprintf(str, #stats_array "_bit[%d] %llu\n",\
603 i, stats[i]); \
604 } \
605 str += sprintf(str, "TOTAL_%s %llu\n", total_string, \
606 pdev->aer_stats->total_field); \
607 return str-buf; \
608} \
609static DEVICE_ATTR_RO(name)
610
611aer_stats_dev_attr(aer_dev_correctable, dev_cor_errs,
612 aer_correctable_error_string, "ERR_COR",
613 dev_total_cor_errs);
614aer_stats_dev_attr(aer_dev_fatal, dev_fatal_errs,
615 aer_uncorrectable_error_string, "ERR_FATAL",
616 dev_total_fatal_errs);
617aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs,
618 aer_uncorrectable_error_string, "ERR_NONFATAL",
619 dev_total_nonfatal_errs);
620
621#define aer_stats_rootport_attr(name, field) \
622 static ssize_t \
623 name##_show(struct device *dev, struct device_attribute *attr, \
624 char *buf) \
625{ \
626 struct pci_dev *pdev = to_pci_dev(dev); \
627 return sprintf(buf, "%llu\n", pdev->aer_stats->field); \
628} \
629static DEVICE_ATTR_RO(name)
630
631aer_stats_rootport_attr(aer_rootport_total_err_cor,
632 rootport_total_cor_errs);
633aer_stats_rootport_attr(aer_rootport_total_err_fatal,
634 rootport_total_fatal_errs);
635aer_stats_rootport_attr(aer_rootport_total_err_nonfatal,
636 rootport_total_nonfatal_errs);
637
638static struct attribute *aer_stats_attrs[] __ro_after_init = {
639 &dev_attr_aer_dev_correctable.attr,
640 &dev_attr_aer_dev_fatal.attr,
641 &dev_attr_aer_dev_nonfatal.attr,
642 &dev_attr_aer_rootport_total_err_cor.attr,
643 &dev_attr_aer_rootport_total_err_fatal.attr,
644 &dev_attr_aer_rootport_total_err_nonfatal.attr,
645 NULL
646};
647
648static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
649 struct attribute *a, int n)
650{
651 struct device *dev = kobj_to_dev(kobj);
652 struct pci_dev *pdev = to_pci_dev(dev);
653
654 if (!pdev->aer_stats)
655 return 0;
656
657 if ((a == &dev_attr_aer_rootport_total_err_cor.attr ||
658 a == &dev_attr_aer_rootport_total_err_fatal.attr ||
659 a == &dev_attr_aer_rootport_total_err_nonfatal.attr) &&
660 pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT)
661 return 0;
662
663 return a->mode;
664}
665
666const struct attribute_group aer_stats_attr_group = {
667 .attrs = aer_stats_attrs,
668 .is_visible = aer_stats_attrs_are_visible,
669};
670
671static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
672 struct aer_err_info *info)
673{
674 int status, i, max = -1;
675 u64 *counter = NULL;
676 struct aer_stats *aer_stats = pdev->aer_stats;
677
678 if (!aer_stats)
679 return;
680
681 switch (info->severity) {
682 case AER_CORRECTABLE:
683 aer_stats->dev_total_cor_errs++;
684 counter = &aer_stats->dev_cor_errs[0];
685 max = AER_MAX_TYPEOF_COR_ERRS;
686 break;
687 case AER_NONFATAL:
688 aer_stats->dev_total_nonfatal_errs++;
689 counter = &aer_stats->dev_nonfatal_errs[0];
690 max = AER_MAX_TYPEOF_UNCOR_ERRS;
691 break;
692 case AER_FATAL:
693 aer_stats->dev_total_fatal_errs++;
694 counter = &aer_stats->dev_fatal_errs[0];
695 max = AER_MAX_TYPEOF_UNCOR_ERRS;
696 break;
697 }
698
699 status = (info->status & ~info->mask);
700 for (i = 0; i < max; i++)
701 if (status & (1 << i))
702 counter[i]++;
703}
704
705static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
706 struct aer_err_source *e_src)
707{
708 struct aer_stats *aer_stats = pdev->aer_stats;
709
710 if (!aer_stats)
711 return;
712
713 if (e_src->status & PCI_ERR_ROOT_COR_RCV)
714 aer_stats->rootport_total_cor_errs++;
715
716 if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
717 if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
718 aer_stats->rootport_total_fatal_errs++;
719 else
720 aer_stats->rootport_total_nonfatal_errs++;
721 }
722}
723
516static void __print_tlp_header(struct pci_dev *dev, 724static void __print_tlp_header(struct pci_dev *dev,
517 struct aer_header_log_regs *t) 725 struct aer_header_log_regs *t)
518{ 726{
@@ -545,9 +753,10 @@ static void __aer_print_error(struct pci_dev *dev,
545 pci_err(dev, " [%2d] Unknown Error Bit%s\n", 753 pci_err(dev, " [%2d] Unknown Error Bit%s\n",
546 i, info->first_error == i ? " (First)" : ""); 754 i, info->first_error == i ? " (First)" : "");
547 } 755 }
756 pci_dev_aer_stats_incr(dev, info);
548} 757}
549 758
550static void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) 759void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
551{ 760{
552 int layer, agent; 761 int layer, agent;
553 int id = ((dev->bus->number << 8) | dev->devfn); 762 int id = ((dev->bus->number << 8) | dev->devfn);
@@ -799,6 +1008,7 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
799 if (pos) 1008 if (pos)
800 pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, 1009 pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
801 info->status); 1010 info->status);
1011 pci_aer_clear_device_status(dev);
802 } else if (info->severity == AER_NONFATAL) 1012 } else if (info->severity == AER_NONFATAL)
803 pcie_do_nonfatal_recovery(dev); 1013 pcie_do_nonfatal_recovery(dev);
804 else if (info->severity == AER_FATAL) 1014 else if (info->severity == AER_FATAL)
@@ -876,7 +1086,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
876#endif 1086#endif
877 1087
878/** 1088/**
879 * get_device_error_info - read error status from dev and store it to info 1089 * aer_get_device_error_info - read error status from dev and store it to info
880 * @dev: pointer to the device expected to have a error record 1090 * @dev: pointer to the device expected to have a error record
881 * @info: pointer to structure to store the error record 1091 * @info: pointer to structure to store the error record
882 * 1092 *
@@ -884,7 +1094,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
884 * 1094 *
885 * Note that @info is reused among all error devices. Clear fields properly. 1095 * Note that @info is reused among all error devices. Clear fields properly.
886 */ 1096 */
887static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) 1097int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
888{ 1098{
889 int pos, temp; 1099 int pos, temp;
890 1100
@@ -942,11 +1152,11 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
942 1152
943 /* Report all before handle them, not to lost records by reset etc. */ 1153 /* Report all before handle them, not to lost records by reset etc. */
944 for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { 1154 for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
945 if (get_device_error_info(e_info->dev[i], e_info)) 1155 if (aer_get_device_error_info(e_info->dev[i], e_info))
946 aer_print_error(e_info->dev[i], e_info); 1156 aer_print_error(e_info->dev[i], e_info);
947 } 1157 }
948 for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { 1158 for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
949 if (get_device_error_info(e_info->dev[i], e_info)) 1159 if (aer_get_device_error_info(e_info->dev[i], e_info))
950 handle_error_source(e_info->dev[i], e_info); 1160 handle_error_source(e_info->dev[i], e_info);
951 } 1161 }
952} 1162}
@@ -962,6 +1172,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
962 struct pci_dev *pdev = rpc->rpd; 1172 struct pci_dev *pdev = rpc->rpd;
963 struct aer_err_info *e_info = &rpc->e_info; 1173 struct aer_err_info *e_info = &rpc->e_info;
964 1174
1175 pci_rootport_aer_stats_incr(pdev, e_src);
1176
965 /* 1177 /*
966 * There is a possibility that both correctable error and 1178 * There is a possibility that both correctable error and
967 * uncorrectable error being logged. Report correctable error first. 1179 * uncorrectable error being logged. Report correctable error first.
@@ -1336,20 +1548,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
1336 */ 1548 */
1337static void aer_error_resume(struct pci_dev *dev) 1549static void aer_error_resume(struct pci_dev *dev)
1338{ 1550{
1339 int pos; 1551 pci_aer_clear_device_status(dev);
1340 u32 status, mask; 1552 pci_cleanup_aer_uncorrect_error_status(dev);
1341 u16 reg16;
1342
1343 /* Clean up Root device status */
1344 pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16);
1345 pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
1346
1347 /* Clean AER Root Error Status */
1348 pos = dev->aer_cap;
1349 pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
1350 pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
1351 status &= ~mask; /* Clear corresponding nonfatal bits */
1352 pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
1353} 1553}
1354 1554
1355static struct pcie_port_service_driver aerdriver = { 1555static struct pcie_port_service_driver aerdriver = {
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f02e334beb45..674984a9277a 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
252 dev->error_state = state; 252 dev->error_state = state;
253 pci_walk_bus(dev->subordinate, cb, &result_data); 253 pci_walk_bus(dev->subordinate, cb, &result_data);
254 if (cb == report_resume) { 254 if (cb == report_resume) {
255 pci_aer_clear_device_status(dev);
255 pci_cleanup_aer_uncorrect_error_status(dev); 256 pci_cleanup_aer_uncorrect_error_status(dev);
256 dev->error_state = pci_channel_io_normal; 257 dev->error_state = pci_channel_io_normal;
257 } 258 }
@@ -259,15 +260,10 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
259 /* 260 /*
260 * If the error is reported by an end point, we think this 261 * If the error is reported by an end point, we think this
261 * error is related to the upstream link of the end point. 262 * error is related to the upstream link of the end point.
263 * The error is non fatal so the bus is ok; just invoke
264 * the callback for the function that logged the error.
262 */ 265 */
263 if (state == pci_channel_io_normal) 266 cb(dev, &result_data);
264 /*
265 * the error is non fatal so the bus is ok, just invoke
266 * the callback for the function that logged the error.
267 */
268 cb(dev, &result_data);
269 else
270 pci_walk_bus(dev->bus, cb, &result_data);
271 } 267 }
272 268
273 return result_data.result; 269 return result_data.result;
@@ -317,7 +313,8 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
317 * do error recovery on all subordinates of the bridge instead 313 * do error recovery on all subordinates of the bridge instead
318 * of the bridge and clear the error status of the bridge. 314 * of the bridge and clear the error status of the bridge.
319 */ 315 */
320 pci_cleanup_aer_uncorrect_error_status(dev); 316 pci_aer_clear_fatal_status(dev);
317 pci_aer_clear_device_status(dev);
321 } 318 }
322 319
323 if (result == PCI_ERS_RESULT_RECOVERED) { 320 if (result == PCI_ERS_RESULT_RECOVERED) {
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 973f1b80a038..b78840f54a9b 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup);
42 42
43/* global data */ 43/* global data */
44 44
45static int pcie_portdrv_restore_config(struct pci_dev *dev)
46{
47 int retval;
48
49 retval = pci_enable_device(dev);
50 if (retval)
51 return retval;
52 pci_set_master(dev);
53 return 0;
54}
55
56#ifdef CONFIG_PM 45#ifdef CONFIG_PM
57static int pcie_port_runtime_suspend(struct device *dev) 46static int pcie_port_runtime_suspend(struct device *dev)
58{ 47{
@@ -160,19 +149,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev)
160 return PCI_ERS_RESULT_RECOVERED; 149 return PCI_ERS_RESULT_RECOVERED;
161} 150}
162 151
163static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
164{
165 /* If fatal, restore cfg space for possible link reset at upstream */
166 if (dev->error_state == pci_channel_io_frozen) {
167 dev->state_saved = true;
168 pci_restore_state(dev);
169 pcie_portdrv_restore_config(dev);
170 pci_enable_pcie_error_reporting(dev);
171 }
172
173 return PCI_ERS_RESULT_RECOVERED;
174}
175
176static int resume_iter(struct device *device, void *data) 152static int resume_iter(struct device *device, void *data)
177{ 153{
178 struct pcie_device *pcie_device; 154 struct pcie_device *pcie_device;
@@ -208,7 +184,6 @@ static const struct pci_device_id port_pci_ids[] = { {
208static const struct pci_error_handlers pcie_portdrv_err_handler = { 184static const struct pci_error_handlers pcie_portdrv_err_handler = {
209 .error_detected = pcie_portdrv_error_detected, 185 .error_detected = pcie_portdrv_error_detected,
210 .mmio_enabled = pcie_portdrv_mmio_enabled, 186 .mmio_enabled = pcie_portdrv_mmio_enabled,
211 .slot_reset = pcie_portdrv_slot_reset,
212 .resume = pcie_portdrv_err_resume, 187 .resume = pcie_portdrv_err_resume,
213}; 188};
214 189
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 611adcd9c169..9472da27e202 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev)
2064 2064
2065static void pci_release_capabilities(struct pci_dev *dev) 2065static void pci_release_capabilities(struct pci_dev *dev)
2066{ 2066{
2067 pci_aer_exit(dev);
2067 pci_vpd_release(dev); 2068 pci_vpd_release(dev);
2068 pci_iov_release(dev); 2069 pci_iov_release(dev);
2069 pci_free_cap_save_buffers(dev); 2070 pci_free_cap_save_buffers(dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c133ccfa002e..d78f46f070c2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -299,6 +299,7 @@ struct pci_dev {
299 u8 hdr_type; /* PCI header type (`multi' flag masked out) */ 299 u8 hdr_type; /* PCI header type (`multi' flag masked out) */
300#ifdef CONFIG_PCIEAER 300#ifdef CONFIG_PCIEAER
301 u16 aer_cap; /* AER capability offset */ 301 u16 aer_cap; /* AER capability offset */
302 struct aer_stats *aer_stats; /* AER stats for this device */
302#endif 303#endif
303 u8 pcie_cap; /* PCIe capability offset */ 304 u8 pcie_cap; /* PCIe capability offset */
304 u8 msi_cap; /* MSI capability offset */ 305 u8 msi_cap; /* MSI capability offset */
@@ -1469,13 +1470,9 @@ static inline bool pcie_aspm_support_enabled(void) { return false; }
1469#endif 1470#endif
1470 1471
1471#ifdef CONFIG_PCIEAER 1472#ifdef CONFIG_PCIEAER
1472void pci_no_aer(void);
1473bool pci_aer_available(void); 1473bool pci_aer_available(void);
1474int pci_aer_init(struct pci_dev *dev);
1475#else 1474#else
1476static inline void pci_no_aer(void) { }
1477static inline bool pci_aer_available(void) { return false; } 1475static inline bool pci_aer_available(void) { return false; }
1478static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
1479#endif 1476#endif
1480 1477
1481#ifdef CONFIG_PCIE_ECRC 1478#ifdef CONFIG_PCIE_ECRC