aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ben Hutchings <bhutchings@solarflare.com> 2009-03-04 05:01:57 -0500
committer David S. Miller <davem@davemloft.net> 2009-03-04 20:40:25 -0500
commit 2c3c3d02f28801d7ad2da4952b2c7ca6621ef221 (patch)
tree b733d0bbcd79e59235f47f6f9bf3b981e26683f0
parent 4720bc6cfe70b606cf62a244c7a5391e59923e45 (diff)
sfc: Improve NIC internal error recovery
Make the error count a per-NIC variable. Reset the count after an hour if it has not reached the critical value. Set the critical value back to 5. Signed-off-by: Ben Hutchings <bhutchings@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/net/sfc/falcon.c 23
1 files changed, 19 insertions, 4 deletions
diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c
index 82c10f4de1b..2ae51fd6f9c 100644
--- a/drivers/net/sfc/falcon.c
+++ b/drivers/net/sfc/falcon.c
@@ -39,11 +39,16 @@
39 * @next_buffer_table: First available buffer table id 39 * @next_buffer_table: First available buffer table id
40 * @pci_dev2: The secondary PCI device if present 40 * @pci_dev2: The secondary PCI device if present
41 * @i2c_data: Operations and state for I2C bit-bashing algorithm 41 * @i2c_data: Operations and state for I2C bit-bashing algorithm
42 * @int_error_count: Number of internal errors seen recently
43 * @int_error_expire: Time at which error count will be expired
42 */ 44 */
43struct falcon_nic_data { 45struct falcon_nic_data {
44 unsigned next_buffer_table; 46 unsigned next_buffer_table;
45 struct pci_dev *pci_dev2; 47 struct pci_dev *pci_dev2;
46 struct i2c_algo_bit_data i2c_data; 48 struct i2c_algo_bit_data i2c_data;
49
50 unsigned int_error_count;
51 unsigned long int_error_expire;
47}; 52};
48 53
49/************************************************************************** 54/**************************************************************************
@@ -119,8 +124,12 @@ MODULE_PARM_DESC(rx_xon_thresh_bytes, "RX fifo XON threshold");
119#define FALCON_EVQ_SIZE 4096 124#define FALCON_EVQ_SIZE 4096
120#define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1) 125#define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1)
121 126
122/* Max number of internal errors. After this resets will not be performed */ 127/* If FALCON_MAX_INT_ERRORS internal errors occur within
123#define FALCON_MAX_INT_ERRORS 4 128 * FALCON_INT_ERROR_EXPIRE seconds, we consider the NIC broken and
129 * disable it.
130 */
131#define FALCON_INT_ERROR_EXPIRE 3600
132#define FALCON_MAX_INT_ERRORS 5
124 133
125/* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times 134/* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times
126 */ 135 */
@@ -1374,7 +1383,6 @@ static irqreturn_t falcon_fatal_interrupt(struct efx_nic *efx)
1374 efx_oword_t *int_ker = efx->irq_status.addr; 1383 efx_oword_t *int_ker = efx->irq_status.addr;
1375 efx_oword_t fatal_intr; 1384 efx_oword_t fatal_intr;
1376 int error, mem_perr; 1385 int error, mem_perr;
1377 static int n_int_errors;
1378 1386
1379 falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER); 1387 falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER);
1380 error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR); 1388 error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR);
@@ -1401,7 +1409,14 @@ static irqreturn_t falcon_fatal_interrupt(struct efx_nic *efx)
1401 pci_clear_master(nic_data->pci_dev2); 1409 pci_clear_master(nic_data->pci_dev2);
1402 falcon_disable_interrupts(efx); 1410 falcon_disable_interrupts(efx);
1403 1411
1404 if (++n_int_errors < FALCON_MAX_INT_ERRORS) { 1412 /* Count errors and reset or disable the NIC accordingly */
1413 if (nic_data->int_error_count == 0 ||
1414 time_after(jiffies, nic_data->int_error_expire)) {
1415 nic_data->int_error_count = 0;
1416 nic_data->int_error_expire =
1417 jiffies + FALCON_INT_ERROR_EXPIRE * HZ;
1418 }
1419 if (++nic_data->int_error_count < FALCON_MAX_INT_ERRORS) {
1405 EFX_ERR(efx, "SYSTEM ERROR - reset scheduled\n"); 1420 EFX_ERR(efx, "SYSTEM ERROR - reset scheduled\n");
1406 efx_schedule_reset(efx, RESET_TYPE_INT_ERROR); 1421 efx_schedule_reset(efx, RESET_TYPE_INT_ERROR);
1407 } else { 1422 } else {