diff options
author | Ben Hutchings <bhutchings@solarflare.com> | 2009-03-04 05:01:57 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-03-04 20:40:25 -0500 |
commit | 2c3c3d02f28801d7ad2da4952b2c7ca6621ef221 (patch) | |
tree | b733d0bbcd79e59235f47f6f9bf3b981e26683f0 /drivers/net/sfc | |
parent | 4720bc6cfe70b606cf62a244c7a5391e59923e45 (diff) |
sfc: Improve NIC internal error recovery
Make the error count a per-NIC variable.
Reset the count after an hour if it has not reached the critical value.
Set the critical value back to 5.
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/sfc')
-rw-r--r-- | drivers/net/sfc/falcon.c | 23 |
1 files changed, 19 insertions, 4 deletions
diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c index 82c10f4de1b8..2ae51fd6f9c1 100644 --- a/drivers/net/sfc/falcon.c +++ b/drivers/net/sfc/falcon.c | |||
@@ -39,11 +39,16 @@ | |||
39 | * @next_buffer_table: First available buffer table id | 39 | * @next_buffer_table: First available buffer table id |
40 | * @pci_dev2: The secondary PCI device if present | 40 | * @pci_dev2: The secondary PCI device if present |
41 | * @i2c_data: Operations and state for I2C bit-bashing algorithm | 41 | * @i2c_data: Operations and state for I2C bit-bashing algorithm |
42 | * @int_error_count: Number of internal errors seen recently | ||
43 | * @int_error_expire: Time at which error count will be expired | ||
42 | */ | 44 | */ |
43 | struct falcon_nic_data { | 45 | struct falcon_nic_data { |
44 | unsigned next_buffer_table; | 46 | unsigned next_buffer_table; |
45 | struct pci_dev *pci_dev2; | 47 | struct pci_dev *pci_dev2; |
46 | struct i2c_algo_bit_data i2c_data; | 48 | struct i2c_algo_bit_data i2c_data; |
49 | |||
50 | unsigned int_error_count; | ||
51 | unsigned long int_error_expire; | ||
47 | }; | 52 | }; |
48 | 53 | ||
49 | /************************************************************************** | 54 | /************************************************************************** |
@@ -119,8 +124,12 @@ MODULE_PARM_DESC(rx_xon_thresh_bytes, "RX fifo XON threshold"); | |||
119 | #define FALCON_EVQ_SIZE 4096 | 124 | #define FALCON_EVQ_SIZE 4096 |
120 | #define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1) | 125 | #define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1) |
121 | 126 | ||
122 | /* Max number of internal errors. After this resets will not be performed */ | 127 | /* If FALCON_MAX_INT_ERRORS internal errors occur within |
123 | #define FALCON_MAX_INT_ERRORS 4 | 128 | * FALCON_INT_ERROR_EXPIRE seconds, we consider the NIC broken and |
129 | * disable it. | ||
130 | */ | ||
131 | #define FALCON_INT_ERROR_EXPIRE 3600 | ||
132 | #define FALCON_MAX_INT_ERRORS 5 | ||
124 | 133 | ||
125 | /* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times | 134 | /* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times |
126 | */ | 135 | */ |
@@ -1374,7 +1383,6 @@ static irqreturn_t falcon_fatal_interrupt(struct efx_nic *efx) | |||
1374 | efx_oword_t *int_ker = efx->irq_status.addr; | 1383 | efx_oword_t *int_ker = efx->irq_status.addr; |
1375 | efx_oword_t fatal_intr; | 1384 | efx_oword_t fatal_intr; |
1376 | int error, mem_perr; | 1385 | int error, mem_perr; |
1377 | static int n_int_errors; | ||
1378 | 1386 | ||
1379 | falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER); | 1387 | falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER); |
1380 | error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR); | 1388 | error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR); |
@@ -1401,7 +1409,14 @@ static irqreturn_t falcon_fatal_interrupt(struct efx_nic *efx) | |||
1401 | pci_clear_master(nic_data->pci_dev2); | 1409 | pci_clear_master(nic_data->pci_dev2); |
1402 | falcon_disable_interrupts(efx); | 1410 | falcon_disable_interrupts(efx); |
1403 | 1411 | ||
1404 | if (++n_int_errors < FALCON_MAX_INT_ERRORS) { | 1412 | /* Count errors and reset or disable the NIC accordingly */ |
1413 | if (nic_data->int_error_count == 0 || | ||
1414 | time_after(jiffies, nic_data->int_error_expire)) { | ||
1415 | nic_data->int_error_count = 0; | ||
1416 | nic_data->int_error_expire = | ||
1417 | jiffies + FALCON_INT_ERROR_EXPIRE * HZ; | ||
1418 | } | ||
1419 | if (++nic_data->int_error_count < FALCON_MAX_INT_ERRORS) { | ||
1405 | EFX_ERR(efx, "SYSTEM ERROR - reset scheduled\n"); | 1420 | EFX_ERR(efx, "SYSTEM ERROR - reset scheduled\n"); |
1406 | efx_schedule_reset(efx, RESET_TYPE_INT_ERROR); | 1421 | efx_schedule_reset(efx, RESET_TYPE_INT_ERROR); |
1407 | } else { | 1422 | } else { |