-rw-r--r--  drivers/net/ethernet/sfc/efx.c        | 268
-rw-r--r--  drivers/net/ethernet/sfc/efx.h        |  18
-rw-r--r--  drivers/net/ethernet/sfc/enum.h       |  12
-rw-r--r--  drivers/net/ethernet/sfc/ethtool.c    |   4
-rw-r--r--  drivers/net/ethernet/sfc/falcon.c     |  17
-rw-r--r--  drivers/net/ethernet/sfc/filter.c     | 249
-rw-r--r--  drivers/net/ethernet/sfc/mcdi_pcol.h  |   1
-rw-r--r--  drivers/net/ethernet/sfc/net_driver.h |  97
-rw-r--r--  drivers/net/ethernet/sfc/nic.c        |  90
-rw-r--r--  drivers/net/ethernet/sfc/ptp.c        | 116
-rw-r--r--  drivers/net/ethernet/sfc/rx.c         | 793
-rw-r--r--  drivers/net/ethernet/sfc/siena.c      |  25
12 files changed, 1072 insertions(+), 618 deletions(-)
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 0bc00991d310..f050248e9fba 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -21,7 +21,9 @@
21#include <linux/ethtool.h> 21#include <linux/ethtool.h>
22#include <linux/topology.h> 22#include <linux/topology.h>
23#include <linux/gfp.h> 23#include <linux/gfp.h>
24#include <linux/pci.h>
24#include <linux/cpu_rmap.h> 25#include <linux/cpu_rmap.h>
26#include <linux/aer.h>
25#include "net_driver.h" 27#include "net_driver.h"
26#include "efx.h" 28#include "efx.h"
27#include "nic.h" 29#include "nic.h"
@@ -71,21 +73,21 @@ const char *const efx_loopback_mode_names[] = {
71 73
72const unsigned int efx_reset_type_max = RESET_TYPE_MAX; 74const unsigned int efx_reset_type_max = RESET_TYPE_MAX;
73const char *const efx_reset_type_names[] = { 75const char *const efx_reset_type_names[] = {
74 [RESET_TYPE_INVISIBLE] = "INVISIBLE", 76 [RESET_TYPE_INVISIBLE] = "INVISIBLE",
75 [RESET_TYPE_ALL] = "ALL", 77 [RESET_TYPE_ALL] = "ALL",
76 [RESET_TYPE_WORLD] = "WORLD", 78 [RESET_TYPE_RECOVER_OR_ALL] = "RECOVER_OR_ALL",
77 [RESET_TYPE_DISABLE] = "DISABLE", 79 [RESET_TYPE_WORLD] = "WORLD",
78 [RESET_TYPE_TX_WATCHDOG] = "TX_WATCHDOG", 80 [RESET_TYPE_RECOVER_OR_DISABLE] = "RECOVER_OR_DISABLE",
79 [RESET_TYPE_INT_ERROR] = "INT_ERROR", 81 [RESET_TYPE_DISABLE] = "DISABLE",
80 [RESET_TYPE_RX_RECOVERY] = "RX_RECOVERY", 82 [RESET_TYPE_TX_WATCHDOG] = "TX_WATCHDOG",
81 [RESET_TYPE_RX_DESC_FETCH] = "RX_DESC_FETCH", 83 [RESET_TYPE_INT_ERROR] = "INT_ERROR",
82 [RESET_TYPE_TX_DESC_FETCH] = "TX_DESC_FETCH", 84 [RESET_TYPE_RX_RECOVERY] = "RX_RECOVERY",
83 [RESET_TYPE_TX_SKIP] = "TX_SKIP", 85 [RESET_TYPE_RX_DESC_FETCH] = "RX_DESC_FETCH",
84 [RESET_TYPE_MC_FAILURE] = "MC_FAILURE", 86 [RESET_TYPE_TX_DESC_FETCH] = "TX_DESC_FETCH",
87 [RESET_TYPE_TX_SKIP] = "TX_SKIP",
88 [RESET_TYPE_MC_FAILURE] = "MC_FAILURE",
85}; 89};
86 90
87#define EFX_MAX_MTU (9 * 1024)
88
89/* Reset workqueue. If any NIC has a hardware failure then a reset will be 91/* Reset workqueue. If any NIC has a hardware failure then a reset will be
90 * queued onto this work queue. This is not a per-nic work queue, because 92 * queued onto this work queue. This is not a per-nic work queue, because
91 * efx_reset_work() acquires the rtnl lock, so resets are naturally serialised. 93 * efx_reset_work() acquires the rtnl lock, so resets are naturally serialised.
@@ -117,9 +119,12 @@ MODULE_PARM_DESC(separate_tx_channels,
117static int napi_weight = 64; 119static int napi_weight = 64;
118 120
119/* This is the time (in jiffies) between invocations of the hardware 121/* This is the time (in jiffies) between invocations of the hardware
120 * monitor. On Falcon-based NICs, this will: 122 * monitor.
123 * On Falcon-based NICs, this will:
121 * - Check the on-board hardware monitor; 124 * - Check the on-board hardware monitor;
122 * - Poll the link state and reconfigure the hardware as necessary. 125 * - Poll the link state and reconfigure the hardware as necessary.
126 * On Siena-based NICs for power systems with EEH support, this will give EEH a
127 * chance to start.
123 */ 128 */
124static unsigned int efx_monitor_interval = 1 * HZ; 129static unsigned int efx_monitor_interval = 1 * HZ;
125 130
@@ -203,13 +208,14 @@ static void efx_stop_all(struct efx_nic *efx);
203#define EFX_ASSERT_RESET_SERIALISED(efx) \ 208#define EFX_ASSERT_RESET_SERIALISED(efx) \
204 do { \ 209 do { \
205 if ((efx->state == STATE_READY) || \ 210 if ((efx->state == STATE_READY) || \
211 (efx->state == STATE_RECOVERY) || \
206 (efx->state == STATE_DISABLED)) \ 212 (efx->state == STATE_DISABLED)) \
207 ASSERT_RTNL(); \ 213 ASSERT_RTNL(); \
208 } while (0) 214 } while (0)
209 215
210static int efx_check_disabled(struct efx_nic *efx) 216static int efx_check_disabled(struct efx_nic *efx)
211{ 217{
212 if (efx->state == STATE_DISABLED) { 218 if (efx->state == STATE_DISABLED || efx->state == STATE_RECOVERY) {
213 netif_err(efx, drv, efx->net_dev, 219 netif_err(efx, drv, efx->net_dev,
214 "device is disabled due to earlier errors\n"); 220 "device is disabled due to earlier errors\n");
215 return -EIO; 221 return -EIO;
@@ -242,15 +248,9 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
242 struct efx_rx_queue *rx_queue = 248 struct efx_rx_queue *rx_queue =
243 efx_channel_get_rx_queue(channel); 249 efx_channel_get_rx_queue(channel);
244 250
245 /* Deliver last RX packet. */ 251 efx_rx_flush_packet(channel);
246 if (channel->rx_pkt) { 252 if (rx_queue->enabled)
247 __efx_rx_packet(channel, channel->rx_pkt);
248 channel->rx_pkt = NULL;
249 }
250 if (rx_queue->enabled) {
251 efx_rx_strategy(channel);
252 efx_fast_push_rx_descriptors(rx_queue); 253 efx_fast_push_rx_descriptors(rx_queue);
253 }
254 } 254 }
255 255
256 return spent; 256 return spent;
@@ -625,20 +625,51 @@ fail:
625 */ 625 */
626static void efx_start_datapath(struct efx_nic *efx) 626static void efx_start_datapath(struct efx_nic *efx)
627{ 627{
628 bool old_rx_scatter = efx->rx_scatter;
628 struct efx_tx_queue *tx_queue; 629 struct efx_tx_queue *tx_queue;
629 struct efx_rx_queue *rx_queue; 630 struct efx_rx_queue *rx_queue;
630 struct efx_channel *channel; 631 struct efx_channel *channel;
632 size_t rx_buf_len;
631 633
632 /* Calculate the rx buffer allocation parameters required to 634 /* Calculate the rx buffer allocation parameters required to
633 * support the current MTU, including padding for header 635 * support the current MTU, including padding for header
634 * alignment and overruns. 636 * alignment and overruns.
635 */ 637 */
636 efx->rx_buffer_len = (max(EFX_PAGE_IP_ALIGN, NET_IP_ALIGN) + 638 efx->rx_dma_len = (efx->type->rx_buffer_hash_size +
637 EFX_MAX_FRAME_LEN(efx->net_dev->mtu) + 639 EFX_MAX_FRAME_LEN(efx->net_dev->mtu) +
638 efx->type->rx_buffer_hash_size + 640 efx->type->rx_buffer_padding);
639 efx->type->rx_buffer_padding); 641 rx_buf_len = (sizeof(struct efx_rx_page_state) +
640 efx->rx_buffer_order = get_order(efx->rx_buffer_len + 642 EFX_PAGE_IP_ALIGN + efx->rx_dma_len);
641 sizeof(struct efx_rx_page_state)); 643 if (rx_buf_len <= PAGE_SIZE) {
644 efx->rx_scatter = false;
645 efx->rx_buffer_order = 0;
646 } else if (efx->type->can_rx_scatter) {
647 BUILD_BUG_ON(sizeof(struct efx_rx_page_state) +
648 EFX_PAGE_IP_ALIGN + EFX_RX_USR_BUF_SIZE >
649 PAGE_SIZE / 2);
650 efx->rx_scatter = true;
651 efx->rx_dma_len = EFX_RX_USR_BUF_SIZE;
652 efx->rx_buffer_order = 0;
653 } else {
654 efx->rx_scatter = false;
655 efx->rx_buffer_order = get_order(rx_buf_len);
656 }
657
658 efx_rx_config_page_split(efx);
659 if (efx->rx_buffer_order)
660 netif_dbg(efx, drv, efx->net_dev,
661 "RX buf len=%u; page order=%u batch=%u\n",
662 efx->rx_dma_len, efx->rx_buffer_order,
663 efx->rx_pages_per_batch);
664 else
665 netif_dbg(efx, drv, efx->net_dev,
666 "RX buf len=%u step=%u bpp=%u; page batch=%u\n",
667 efx->rx_dma_len, efx->rx_page_buf_step,
668 efx->rx_bufs_per_page, efx->rx_pages_per_batch);
669
670 /* RX filters also have scatter-enabled flags */
671 if (efx->rx_scatter != old_rx_scatter)
672 efx_filter_update_rx_scatter(efx);
642 673
643 /* We must keep at least one descriptor in a TX ring empty. 674 /* We must keep at least one descriptor in a TX ring empty.
644 * We could avoid this when the queue size does not exactly 675 * We could avoid this when the queue size does not exactly
@@ -655,16 +686,12 @@ static void efx_start_datapath(struct efx_nic *efx)
655 efx_for_each_channel_tx_queue(tx_queue, channel) 686 efx_for_each_channel_tx_queue(tx_queue, channel)
656 efx_init_tx_queue(tx_queue); 687 efx_init_tx_queue(tx_queue);
657 688
658 /* The rx buffer allocation strategy is MTU dependent */
659 efx_rx_strategy(channel);
660
661 efx_for_each_channel_rx_queue(rx_queue, channel) { 689 efx_for_each_channel_rx_queue(rx_queue, channel) {
662 efx_init_rx_queue(rx_queue); 690 efx_init_rx_queue(rx_queue);
663 efx_nic_generate_fill_event(rx_queue); 691 efx_nic_generate_fill_event(rx_queue);
664 } 692 }
665 693
666 WARN_ON(channel->rx_pkt != NULL); 694 WARN_ON(channel->rx_pkt_n_frags);
667 efx_rx_strategy(channel);
668 } 695 }
669 696
670 if (netif_device_present(efx->net_dev)) 697 if (netif_device_present(efx->net_dev))
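For reference, the headroom guaranteed by the BUILD_BUG_ON in the scatter branch above, with the usual 4KiB page size assumed for the worked figures (the real check uses PAGE_SIZE):

/* EFX_RX_USR_BUF_SIZE is 1824 bytes per scatter buffer (defined in
 * net_driver.h below).  The check requires
 *     sizeof(struct efx_rx_page_state) + EFX_PAGE_IP_ALIGN + 1824 <= 4096 / 2
 * i.e. the page-state header plus alignment padding must fit in
 * 2048 - 1824 = 224 bytes, so two DMA buffers can always share one page.
 */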
@@ -683,7 +710,7 @@ static void efx_stop_datapath(struct efx_nic *efx)
683 BUG_ON(efx->port_enabled); 710 BUG_ON(efx->port_enabled);
684 711
685 /* Only perform flush if dma is enabled */ 712 /* Only perform flush if dma is enabled */
686 if (dev->is_busmaster) { 713 if (dev->is_busmaster && efx->state != STATE_RECOVERY) {
687 rc = efx_nic_flush_queues(efx); 714 rc = efx_nic_flush_queues(efx);
688 715
689 if (rc && EFX_WORKAROUND_7803(efx)) { 716 if (rc && EFX_WORKAROUND_7803(efx)) {
@@ -1596,13 +1623,15 @@ static void efx_start_all(struct efx_nic *efx)
1596 efx_start_port(efx); 1623 efx_start_port(efx);
1597 efx_start_datapath(efx); 1624 efx_start_datapath(efx);
1598 1625
1599 /* Start the hardware monitor if there is one. Otherwise (we're link 1626 /* Start the hardware monitor if there is one */
1600 * event driven), we have to poll the PHY because after an event queue 1627 if (efx->type->monitor != NULL)
1601 * flush, we could have a missed a link state change */
1602 if (efx->type->monitor != NULL) {
1603 queue_delayed_work(efx->workqueue, &efx->monitor_work, 1628 queue_delayed_work(efx->workqueue, &efx->monitor_work,
1604 efx_monitor_interval); 1629 efx_monitor_interval);
1605 } else { 1630
1631 /* If link state detection is normally event-driven, we have
1632 * to poll now because we could have missed a change
1633 */
1634 if (efx_nic_rev(efx) >= EFX_REV_SIENA_A0) {
1606 mutex_lock(&efx->mac_lock); 1635 mutex_lock(&efx->mac_lock);
1607 if (efx->phy_op->poll(efx)) 1636 if (efx->phy_op->poll(efx))
1608 efx_link_status_changed(efx); 1637 efx_link_status_changed(efx);
@@ -2309,7 +2338,9 @@ int efx_reset(struct efx_nic *efx, enum reset_type method)
2309 2338
2310out: 2339out:
2311 /* Leave device stopped if necessary */ 2340 /* Leave device stopped if necessary */
2312 disabled = rc || method == RESET_TYPE_DISABLE; 2341 disabled = rc ||
2342 method == RESET_TYPE_DISABLE ||
2343 method == RESET_TYPE_RECOVER_OR_DISABLE;
2313 rc2 = efx_reset_up(efx, method, !disabled); 2344 rc2 = efx_reset_up(efx, method, !disabled);
2314 if (rc2) { 2345 if (rc2) {
2315 disabled = true; 2346 disabled = true;
@@ -2328,13 +2359,48 @@ out:
2328 return rc; 2359 return rc;
2329} 2360}
2330 2361
2362/* Try recovery mechanisms.
2363 * For now only EEH is supported.
2364 * Returns 0 if the recovery mechanisms are unsuccessful.
2365 * Returns a non-zero value otherwise.
2366 */
2367static int efx_try_recovery(struct efx_nic *efx)
2368{
2369#ifdef CONFIG_EEH
2370 /* A PCI error can occur and not be seen by EEH because nothing
2371 * happens on the PCI bus. In this case the driver may fail and
2372 * schedule a 'recover or reset', leading to this recovery handler.
2373 * Manually call the eeh failure check function.
2374 */
2375 struct eeh_dev *eehdev =
2376 of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev));
2377
2378 if (eeh_dev_check_failure(eehdev)) {
2379 /* The EEH mechanisms will handle the error and reset the
2380 * device if necessary.
2381 */
2382 return 1;
2383 }
2384#endif
2385 return 0;
2386}
2387
2331/* The worker thread exists so that code that cannot sleep can 2388/* The worker thread exists so that code that cannot sleep can
2332 * schedule a reset for later. 2389 * schedule a reset for later.
2333 */ 2390 */
2334static void efx_reset_work(struct work_struct *data) 2391static void efx_reset_work(struct work_struct *data)
2335{ 2392{
2336 struct efx_nic *efx = container_of(data, struct efx_nic, reset_work); 2393 struct efx_nic *efx = container_of(data, struct efx_nic, reset_work);
2337 unsigned long pending = ACCESS_ONCE(efx->reset_pending); 2394 unsigned long pending;
2395 enum reset_type method;
2396
2397 pending = ACCESS_ONCE(efx->reset_pending);
2398 method = fls(pending) - 1;
2399
2400 if ((method == RESET_TYPE_RECOVER_OR_DISABLE ||
2401 method == RESET_TYPE_RECOVER_OR_ALL) &&
2402 efx_try_recovery(efx))
2403 return;
2338 2404
2339 if (!pending) 2405 if (!pending)
2340 return; 2406 return;
@@ -2346,7 +2412,7 @@ static void efx_reset_work(struct work_struct *data)
2346 * it cannot change again. 2412 * it cannot change again.
2347 */ 2413 */
2348 if (efx->state == STATE_READY) 2414 if (efx->state == STATE_READY)
2349 (void)efx_reset(efx, fls(pending) - 1); 2415 (void)efx_reset(efx, method);
2350 2416
2351 rtnl_unlock(); 2417 rtnl_unlock();
2352} 2418}
@@ -2355,11 +2421,20 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type)
2355{ 2421{
2356 enum reset_type method; 2422 enum reset_type method;
2357 2423
2424 if (efx->state == STATE_RECOVERY) {
2425 netif_dbg(efx, drv, efx->net_dev,
2426 "recovering: skip scheduling %s reset\n",
2427 RESET_TYPE(type));
2428 return;
2429 }
2430
2358 switch (type) { 2431 switch (type) {
2359 case RESET_TYPE_INVISIBLE: 2432 case RESET_TYPE_INVISIBLE:
2360 case RESET_TYPE_ALL: 2433 case RESET_TYPE_ALL:
2434 case RESET_TYPE_RECOVER_OR_ALL:
2361 case RESET_TYPE_WORLD: 2435 case RESET_TYPE_WORLD:
2362 case RESET_TYPE_DISABLE: 2436 case RESET_TYPE_DISABLE:
2437 case RESET_TYPE_RECOVER_OR_DISABLE:
2363 method = type; 2438 method = type;
2364 netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n", 2439 netif_dbg(efx, drv, efx->net_dev, "scheduling %s reset\n",
2365 RESET_TYPE(method)); 2440 RESET_TYPE(method));
@@ -2569,6 +2644,8 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
2569 efx_fini_struct(efx); 2644 efx_fini_struct(efx);
2570 pci_set_drvdata(pci_dev, NULL); 2645 pci_set_drvdata(pci_dev, NULL);
2571 free_netdev(efx->net_dev); 2646 free_netdev(efx->net_dev);
2647
2648 pci_disable_pcie_error_reporting(pci_dev);
2572}; 2649};
2573 2650
2574/* NIC VPD information 2651/* NIC VPD information
@@ -2741,6 +2818,11 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
2741 netif_warn(efx, probe, efx->net_dev, 2818 netif_warn(efx, probe, efx->net_dev,
2742 "failed to create MTDs (%d)\n", rc); 2819 "failed to create MTDs (%d)\n", rc);
2743 2820
2821 rc = pci_enable_pcie_error_reporting(pci_dev);
2822 if (rc && rc != -EINVAL)
2823 netif_warn(efx, probe, efx->net_dev,
2824 "pci_enable_pcie_error_reporting failed (%d)\n", rc);
2825
2744 return 0; 2826 return 0;
2745 2827
2746 fail4: 2828 fail4:
@@ -2865,12 +2947,112 @@ static const struct dev_pm_ops efx_pm_ops = {
2865 .restore = efx_pm_resume, 2947 .restore = efx_pm_resume,
2866}; 2948};
2867 2949
2950/* A PCI error affecting this device was detected.
2951 * At this point MMIO and DMA may be disabled.
2952 * Stop the software path and request a slot reset.
2953 */
2954pci_ers_result_t efx_io_error_detected(struct pci_dev *pdev,
2955 enum pci_channel_state state)
2956{
2957 pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
2958 struct efx_nic *efx = pci_get_drvdata(pdev);
2959
2960 if (state == pci_channel_io_perm_failure)
2961 return PCI_ERS_RESULT_DISCONNECT;
2962
2963 rtnl_lock();
2964
2965 if (efx->state != STATE_DISABLED) {
2966 efx->state = STATE_RECOVERY;
2967 efx->reset_pending = 0;
2968
2969 efx_device_detach_sync(efx);
2970
2971 efx_stop_all(efx);
2972 efx_stop_interrupts(efx, false);
2973
2974 status = PCI_ERS_RESULT_NEED_RESET;
2975 } else {
2976 /* If the interface is disabled we don't want to do anything
2977 * with it.
2978 */
2979 status = PCI_ERS_RESULT_RECOVERED;
2980 }
2981
2982 rtnl_unlock();
2983
2984 pci_disable_device(pdev);
2985
2986 return status;
2987}
2988
 2989/* Fake a successful reset, which will be performed later in efx_io_resume. */
2990pci_ers_result_t efx_io_slot_reset(struct pci_dev *pdev)
2991{
2992 struct efx_nic *efx = pci_get_drvdata(pdev);
2993 pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
2994 int rc;
2995
2996 if (pci_enable_device(pdev)) {
2997 netif_err(efx, hw, efx->net_dev,
2998 "Cannot re-enable PCI device after reset.\n");
2999 status = PCI_ERS_RESULT_DISCONNECT;
3000 }
3001
3002 rc = pci_cleanup_aer_uncorrect_error_status(pdev);
3003 if (rc) {
3004 netif_err(efx, hw, efx->net_dev,
3005 "pci_cleanup_aer_uncorrect_error_status failed (%d)\n", rc);
3006 /* Non-fatal error. Continue. */
3007 }
3008
3009 return status;
3010}
3011
3012/* Perform the actual reset and resume I/O operations. */
3013static void efx_io_resume(struct pci_dev *pdev)
3014{
3015 struct efx_nic *efx = pci_get_drvdata(pdev);
3016 int rc;
3017
3018 rtnl_lock();
3019
3020 if (efx->state == STATE_DISABLED)
3021 goto out;
3022
3023 rc = efx_reset(efx, RESET_TYPE_ALL);
3024 if (rc) {
3025 netif_err(efx, hw, efx->net_dev,
3026 "efx_reset failed after PCI error (%d)\n", rc);
3027 } else {
3028 efx->state = STATE_READY;
3029 netif_dbg(efx, hw, efx->net_dev,
3030 "Done resetting and resuming IO after PCI error.\n");
3031 }
3032
3033out:
3034 rtnl_unlock();
3035}
3036
3037/* For simplicity and reliability, we always require a slot reset and try to
3038 * reset the hardware when a pci error affecting the device is detected.
3039 * We leave both the link_reset and mmio_enabled callback unimplemented:
3040 * with our request for slot reset the mmio_enabled callback will never be
3041 * called, and the link_reset callback is not used by AER or EEH mechanisms.
3042 */
3043static struct pci_error_handlers efx_err_handlers = {
3044 .error_detected = efx_io_error_detected,
3045 .slot_reset = efx_io_slot_reset,
3046 .resume = efx_io_resume,
3047};
3048
2868static struct pci_driver efx_pci_driver = { 3049static struct pci_driver efx_pci_driver = {
2869 .name = KBUILD_MODNAME, 3050 .name = KBUILD_MODNAME,
2870 .id_table = efx_pci_table, 3051 .id_table = efx_pci_table,
2871 .probe = efx_pci_probe, 3052 .probe = efx_pci_probe,
2872 .remove = efx_pci_remove, 3053 .remove = efx_pci_remove,
2873 .driver.pm = &efx_pm_ops, 3054 .driver.pm = &efx_pm_ops,
3055 .err_handler = &efx_err_handlers,
2874}; 3056};
2875 3057
2876/************************************************************************** 3058/**************************************************************************
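For orientation, the order in which the PCI error-recovery core invokes the handlers added above (a schematic summary only; mmio_enabled and link_reset are deliberately left unimplemented, as the comment above the struct explains):

/* 1. efx_io_error_detected() - detach and stop the datapath, enter
 *                              STATE_RECOVERY, return PCI_ERS_RESULT_NEED_RESET
 * 2. efx_io_slot_reset()     - after the slot reset, re-enable the PCI device
 *                              and clear the AER uncorrectable-error status
 * 3. efx_io_resume()         - perform RESET_TYPE_ALL and return to STATE_READY
 */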
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 50247dfe8f57..8372da239b43 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -33,17 +33,22 @@ extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
33extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx); 33extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
34 34
35/* RX */ 35/* RX */
36extern void efx_rx_config_page_split(struct efx_nic *efx);
36extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue); 37extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
37extern void efx_remove_rx_queue(struct efx_rx_queue *rx_queue); 38extern void efx_remove_rx_queue(struct efx_rx_queue *rx_queue);
38extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue); 39extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue);
39extern void efx_fini_rx_queue(struct efx_rx_queue *rx_queue); 40extern void efx_fini_rx_queue(struct efx_rx_queue *rx_queue);
40extern void efx_rx_strategy(struct efx_channel *channel);
41extern void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue); 41extern void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue);
42extern void efx_rx_slow_fill(unsigned long context); 42extern void efx_rx_slow_fill(unsigned long context);
43extern void __efx_rx_packet(struct efx_channel *channel, 43extern void __efx_rx_packet(struct efx_channel *channel);
44 struct efx_rx_buffer *rx_buf); 44extern void efx_rx_packet(struct efx_rx_queue *rx_queue,
45extern void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, 45 unsigned int index, unsigned int n_frags,
46 unsigned int len, u16 flags); 46 unsigned int len, u16 flags);
47static inline void efx_rx_flush_packet(struct efx_channel *channel)
48{
49 if (channel->rx_pkt_n_frags)
50 __efx_rx_packet(channel);
51}
47extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue); 52extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
48 53
49#define EFX_MAX_DMAQ_SIZE 4096UL 54#define EFX_MAX_DMAQ_SIZE 4096UL
@@ -67,6 +72,7 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
67extern int efx_probe_filters(struct efx_nic *efx); 72extern int efx_probe_filters(struct efx_nic *efx);
68extern void efx_restore_filters(struct efx_nic *efx); 73extern void efx_restore_filters(struct efx_nic *efx);
69extern void efx_remove_filters(struct efx_nic *efx); 74extern void efx_remove_filters(struct efx_nic *efx);
75extern void efx_filter_update_rx_scatter(struct efx_nic *efx);
70extern s32 efx_filter_insert_filter(struct efx_nic *efx, 76extern s32 efx_filter_insert_filter(struct efx_nic *efx,
71 struct efx_filter_spec *spec, 77 struct efx_filter_spec *spec,
72 bool replace); 78 bool replace);
@@ -171,9 +177,9 @@ static inline void efx_device_detach_sync(struct efx_nic *efx)
171 * TX scheduler is stopped when we're done and before 177 * TX scheduler is stopped when we're done and before
172 * netif_device_present() becomes false. 178 * netif_device_present() becomes false.
173 */ 179 */
174 netif_tx_lock(dev); 180 netif_tx_lock_bh(dev);
175 netif_device_detach(dev); 181 netif_device_detach(dev);
176 netif_tx_unlock(dev); 182 netif_tx_unlock_bh(dev);
177} 183}
178 184
179#endif /* EFX_EFX_H */ 185#endif /* EFX_EFX_H */
diff --git a/drivers/net/ethernet/sfc/enum.h b/drivers/net/ethernet/sfc/enum.h
index 182dbe2cc6e4..ab8fb5889e55 100644
--- a/drivers/net/ethernet/sfc/enum.h
+++ b/drivers/net/ethernet/sfc/enum.h
@@ -137,8 +137,12 @@ enum efx_loopback_mode {
137 * Reset methods are numbered in order of increasing scope. 137 * Reset methods are numbered in order of increasing scope.
138 * 138 *
139 * @RESET_TYPE_INVISIBLE: Reset datapath and MAC (Falcon only) 139 * @RESET_TYPE_INVISIBLE: Reset datapath and MAC (Falcon only)
140 * @RESET_TYPE_RECOVER_OR_ALL: Try to recover. Apply RESET_TYPE_ALL
141 * if unsuccessful.
140 * @RESET_TYPE_ALL: Reset datapath, MAC and PHY 142 * @RESET_TYPE_ALL: Reset datapath, MAC and PHY
141 * @RESET_TYPE_WORLD: Reset as much as possible 143 * @RESET_TYPE_WORLD: Reset as much as possible
144 * @RESET_TYPE_RECOVER_OR_DISABLE: Try to recover. Apply RESET_TYPE_DISABLE if
145 * unsuccessful.
142 * @RESET_TYPE_DISABLE: Reset datapath, MAC and PHY; leave NIC disabled 146 * @RESET_TYPE_DISABLE: Reset datapath, MAC and PHY; leave NIC disabled
143 * @RESET_TYPE_TX_WATCHDOG: reset due to TX watchdog 147 * @RESET_TYPE_TX_WATCHDOG: reset due to TX watchdog
144 * @RESET_TYPE_INT_ERROR: reset due to internal error 148 * @RESET_TYPE_INT_ERROR: reset due to internal error
@@ -150,9 +154,11 @@ enum efx_loopback_mode {
150 */ 154 */
151enum reset_type { 155enum reset_type {
152 RESET_TYPE_INVISIBLE = 0, 156 RESET_TYPE_INVISIBLE = 0,
153 RESET_TYPE_ALL = 1, 157 RESET_TYPE_RECOVER_OR_ALL = 1,
154 RESET_TYPE_WORLD = 2, 158 RESET_TYPE_ALL = 2,
155 RESET_TYPE_DISABLE = 3, 159 RESET_TYPE_WORLD = 3,
160 RESET_TYPE_RECOVER_OR_DISABLE = 4,
161 RESET_TYPE_DISABLE = 5,
156 RESET_TYPE_MAX_METHOD, 162 RESET_TYPE_MAX_METHOD,
157 RESET_TYPE_TX_WATCHDOG, 163 RESET_TYPE_TX_WATCHDOG,
158 RESET_TYPE_INT_ERROR, 164 RESET_TYPE_INT_ERROR,
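Because reset methods are numbered in order of increasing scope, a set of pending requests collapses to the widest one with a single fls(), as efx_reset_work() does above. A minimal sketch using the renumbered values (the helper name is made up for illustration):

#include <linux/bitops.h>

static enum reset_type widest_pending_reset(unsigned long pending)
{
        /* Example: RESET_TYPE_ALL (2) and RESET_TYPE_RECOVER_OR_DISABLE (4)
         * both pending gives pending = 0x14; fls(0x14) = 5, so the result
         * is 4, i.e. RESET_TYPE_RECOVER_OR_DISABLE, the larger-scope request.
         * Callers must ensure at least one bit is set.
         */
        return (enum reset_type)(fls(pending) - 1);
}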
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 8e61cd06f66a..6e768175e7e0 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -154,6 +154,7 @@ static const struct efx_ethtool_stat efx_ethtool_stats[] = {
154 EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tcp_udp_chksum_err), 154 EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tcp_udp_chksum_err),
155 EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_mcast_mismatch), 155 EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_mcast_mismatch),
156 EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc), 156 EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_frm_trunc),
157 EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_nodesc_trunc),
157}; 158};
158 159
159/* Number of ethtool statistics */ 160/* Number of ethtool statistics */
@@ -978,7 +979,8 @@ static int efx_ethtool_set_class_rule(struct efx_nic *efx,
978 rule->m_ext.data[1])) 979 rule->m_ext.data[1]))
979 return -EINVAL; 980 return -EINVAL;
980 981
981 efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL, 0, 982 efx_filter_init_rx(&spec, EFX_FILTER_PRI_MANUAL,
983 efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
982 (rule->ring_cookie == RX_CLS_FLOW_DISC) ? 984 (rule->ring_cookie == RX_CLS_FLOW_DISC) ?
983 0xfff : rule->ring_cookie); 985 0xfff : rule->ring_cookie);
984 986
diff --git a/drivers/net/ethernet/sfc/falcon.c b/drivers/net/ethernet/sfc/falcon.c
index 49bcd196e10d..4486102fa9b3 100644
--- a/drivers/net/ethernet/sfc/falcon.c
+++ b/drivers/net/ethernet/sfc/falcon.c
@@ -1546,10 +1546,6 @@ static int falcon_probe_nic(struct efx_nic *efx)
1546 1546
1547static void falcon_init_rx_cfg(struct efx_nic *efx) 1547static void falcon_init_rx_cfg(struct efx_nic *efx)
1548{ 1548{
1549 /* Prior to Siena the RX DMA engine will split each frame at
1550 * intervals of RX_USR_BUF_SIZE (32-byte units). We set it to
1551 * be so large that that never happens. */
1552 const unsigned huge_buf_size = (3 * 4096) >> 5;
1553 /* RX control FIFO thresholds (32 entries) */ 1549 /* RX control FIFO thresholds (32 entries) */
1554 const unsigned ctrl_xon_thr = 20; 1550 const unsigned ctrl_xon_thr = 20;
1555 const unsigned ctrl_xoff_thr = 25; 1551 const unsigned ctrl_xoff_thr = 25;
@@ -1557,10 +1553,15 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)
1557 1553
1558 efx_reado(efx, &reg, FR_AZ_RX_CFG); 1554 efx_reado(efx, &reg, FR_AZ_RX_CFG);
1559 if (efx_nic_rev(efx) <= EFX_REV_FALCON_A1) { 1555 if (efx_nic_rev(efx) <= EFX_REV_FALCON_A1) {
1560 /* Data FIFO size is 5.5K */ 1556 /* Data FIFO size is 5.5K. The RX DMA engine only
1557 * supports scattering for user-mode queues, but will
1558 * split DMA writes at intervals of RX_USR_BUF_SIZE
1559 * (32-byte units) even for kernel-mode queues. We
1560 * set it to be so large that that never happens.
1561 */
1561 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_DESC_PUSH_EN, 0); 1562 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_DESC_PUSH_EN, 0);
1562 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_USR_BUF_SIZE, 1563 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_USR_BUF_SIZE,
1563 huge_buf_size); 1564 (3 * 4096) >> 5);
1564 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_MAC_TH, 512 >> 8); 1565 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_MAC_TH, 512 >> 8);
1565 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XOFF_MAC_TH, 2048 >> 8); 1566 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XOFF_MAC_TH, 2048 >> 8);
1566 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_TX_TH, ctrl_xon_thr); 1567 EFX_SET_OWORD_FIELD(reg, FRF_AA_RX_XON_TX_TH, ctrl_xon_thr);
@@ -1569,7 +1570,7 @@ static void falcon_init_rx_cfg(struct efx_nic *efx)
1569 /* Data FIFO size is 80K; register fields moved */ 1570 /* Data FIFO size is 80K; register fields moved */
1570 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_DESC_PUSH_EN, 0); 1571 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_DESC_PUSH_EN, 0);
1571 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_USR_BUF_SIZE, 1572 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_USR_BUF_SIZE,
1572 huge_buf_size); 1573 EFX_RX_USR_BUF_SIZE >> 5);
1573 /* Send XON and XOFF at ~3 * max MTU away from empty/full */ 1574 /* Send XON and XOFF at ~3 * max MTU away from empty/full */
1574 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XON_MAC_TH, 27648 >> 8); 1575 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XON_MAC_TH, 27648 >> 8);
1575 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XOFF_MAC_TH, 54272 >> 8); 1576 EFX_SET_OWORD_FIELD(reg, FRF_BZ_RX_XOFF_MAC_TH, 54272 >> 8);
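The FRF_*_RX_USR_BUF_SIZE fields are programmed in 32-byte units, hence the >> 5 shifts; a quick check of the values written above:

/* Falcon A1 keeps the old "never split" setting: (3 * 4096) >> 5 = 384 units,
 * i.e. 12288 bytes, larger than any frame.  Falcon B0 and later now program
 * EFX_RX_USR_BUF_SIZE >> 5 = 1824 / 32 = 57 units, so the DMA engine splits
 * scattered packets at 1824-byte buffer boundaries.
 */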
@@ -1815,6 +1816,7 @@ const struct efx_nic_type falcon_a1_nic_type = {
1815 .evq_rptr_tbl_base = FR_AA_EVQ_RPTR_KER, 1816 .evq_rptr_tbl_base = FR_AA_EVQ_RPTR_KER,
1816 .max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH), 1817 .max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
1817 .rx_buffer_padding = 0x24, 1818 .rx_buffer_padding = 0x24,
1819 .can_rx_scatter = false,
1818 .max_interrupt_mode = EFX_INT_MODE_MSI, 1820 .max_interrupt_mode = EFX_INT_MODE_MSI,
1819 .phys_addr_channels = 4, 1821 .phys_addr_channels = 4,
1820 .timer_period_max = 1 << FRF_AB_TC_TIMER_VAL_WIDTH, 1822 .timer_period_max = 1 << FRF_AB_TC_TIMER_VAL_WIDTH,
@@ -1865,6 +1867,7 @@ const struct efx_nic_type falcon_b0_nic_type = {
1865 .max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH), 1867 .max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
1866 .rx_buffer_hash_size = 0x10, 1868 .rx_buffer_hash_size = 0x10,
1867 .rx_buffer_padding = 0, 1869 .rx_buffer_padding = 0,
1870 .can_rx_scatter = true,
1868 .max_interrupt_mode = EFX_INT_MODE_MSIX, 1871 .max_interrupt_mode = EFX_INT_MODE_MSIX,
1869 .phys_addr_channels = 32, /* Hardware limit is 64, but the legacy 1872 .phys_addr_channels = 32, /* Hardware limit is 64, but the legacy
1870 * interrupt handler only supports 32 1873 * interrupt handler only supports 32
diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c
index 8af42cd1feda..2397f0e8d3eb 100644
--- a/drivers/net/ethernet/sfc/filter.c
+++ b/drivers/net/ethernet/sfc/filter.c
@@ -66,6 +66,10 @@ struct efx_filter_state {
66#endif 66#endif
67}; 67};
68 68
69static void efx_filter_table_clear_entry(struct efx_nic *efx,
70 struct efx_filter_table *table,
71 unsigned int filter_idx);
72
69/* The filter hash function is LFSR polynomial x^16 + x^3 + 1 of a 32-bit 73/* The filter hash function is LFSR polynomial x^16 + x^3 + 1 of a 32-bit
70 * key derived from the n-tuple. The initial LFSR state is 0xffff. */ 74 * key derived from the n-tuple. The initial LFSR state is 0xffff. */
71static u16 efx_filter_hash(u32 key) 75static u16 efx_filter_hash(u32 key)
@@ -168,6 +172,25 @@ static void efx_filter_push_rx_config(struct efx_nic *efx)
168 filter_ctl, FRF_CZ_MULTICAST_NOMATCH_RSS_ENABLED, 172 filter_ctl, FRF_CZ_MULTICAST_NOMATCH_RSS_ENABLED,
169 !!(table->spec[EFX_FILTER_INDEX_MC_DEF].flags & 173 !!(table->spec[EFX_FILTER_INDEX_MC_DEF].flags &
170 EFX_FILTER_FLAG_RX_RSS)); 174 EFX_FILTER_FLAG_RX_RSS));
175
176 /* There is a single bit to enable RX scatter for all
177 * unmatched packets. Only set it if scatter is
178 * enabled in both filter specs.
179 */
180 EFX_SET_OWORD_FIELD(
181 filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q,
182 !!(table->spec[EFX_FILTER_INDEX_UC_DEF].flags &
183 table->spec[EFX_FILTER_INDEX_MC_DEF].flags &
184 EFX_FILTER_FLAG_RX_SCATTER));
185 } else if (efx_nic_rev(efx) >= EFX_REV_FALCON_B0) {
186 /* We don't expose 'default' filters because unmatched
187 * packets always go to the queue number found in the
188 * RSS table. But we still need to set the RX scatter
189 * bit here.
190 */
191 EFX_SET_OWORD_FIELD(
192 filter_ctl, FRF_BZ_SCATTER_ENBL_NO_MATCH_Q,
193 efx->rx_scatter);
171 } 194 }
172 195
173 efx_writeo(efx, &filter_ctl, FR_BZ_RX_FILTER_CTL); 196 efx_writeo(efx, &filter_ctl, FR_BZ_RX_FILTER_CTL);
@@ -409,9 +432,18 @@ static void efx_filter_reset_rx_def(struct efx_nic *efx, unsigned filter_idx)
409 struct efx_filter_state *state = efx->filter_state; 432 struct efx_filter_state *state = efx->filter_state;
410 struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_DEF]; 433 struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_DEF];
411 struct efx_filter_spec *spec = &table->spec[filter_idx]; 434 struct efx_filter_spec *spec = &table->spec[filter_idx];
435 enum efx_filter_flags flags = 0;
436
437 /* If there's only one channel then disable RSS for non VF
438 * traffic, thereby allowing VFs to use RSS when the PF can't.
439 */
440 if (efx->n_rx_channels > 1)
441 flags |= EFX_FILTER_FLAG_RX_RSS;
412 442
413 efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL, 443 if (efx->rx_scatter)
414 EFX_FILTER_FLAG_RX_RSS, 0); 444 flags |= EFX_FILTER_FLAG_RX_SCATTER;
445
446 efx_filter_init_rx(spec, EFX_FILTER_PRI_MANUAL, flags, 0);
415 spec->type = EFX_FILTER_UC_DEF + filter_idx; 447 spec->type = EFX_FILTER_UC_DEF + filter_idx;
416 table->used_bitmap[0] |= 1 << filter_idx; 448 table->used_bitmap[0] |= 1 << filter_idx;
417} 449}
@@ -463,13 +495,6 @@ static u32 efx_filter_build(efx_oword_t *filter, struct efx_filter_spec *spec)
463 break; 495 break;
464 } 496 }
465 497
466 case EFX_FILTER_TABLE_RX_DEF:
467 /* One filter spec per type */
468 BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0);
469 BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF !=
470 EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF);
471 return spec->type - EFX_FILTER_UC_DEF;
472
473 case EFX_FILTER_TABLE_RX_MAC: { 498 case EFX_FILTER_TABLE_RX_MAC: {
474 bool is_wild = spec->type == EFX_FILTER_MAC_WILD; 499 bool is_wild = spec->type == EFX_FILTER_MAC_WILD;
475 EFX_POPULATE_OWORD_7( 500 EFX_POPULATE_OWORD_7(
@@ -520,42 +545,6 @@ static bool efx_filter_equal(const struct efx_filter_spec *left,
520 return true; 545 return true;
521} 546}
522 547
523static int efx_filter_search(struct efx_filter_table *table,
524 struct efx_filter_spec *spec, u32 key,
525 bool for_insert, unsigned int *depth_required)
526{
527 unsigned hash, incr, filter_idx, depth, depth_max;
528
529 hash = efx_filter_hash(key);
530 incr = efx_filter_increment(key);
531
532 filter_idx = hash & (table->size - 1);
533 depth = 1;
534 depth_max = (for_insert ?
535 (spec->priority <= EFX_FILTER_PRI_HINT ?
536 FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX) :
537 table->search_depth[spec->type]);
538
539 for (;;) {
540 /* Return success if entry is used and matches this spec
541 * or entry is unused and we are trying to insert.
542 */
543 if (test_bit(filter_idx, table->used_bitmap) ?
544 efx_filter_equal(spec, &table->spec[filter_idx]) :
545 for_insert) {
546 *depth_required = depth;
547 return filter_idx;
548 }
549
550 /* Return failure if we reached the maximum search depth */
551 if (depth == depth_max)
552 return for_insert ? -EBUSY : -ENOENT;
553
554 filter_idx = (filter_idx + incr) & (table->size - 1);
555 ++depth;
556 }
557}
558
559/* 548/*
560 * Construct/deconstruct external filter IDs. At least the RX filter 549 * Construct/deconstruct external filter IDs. At least the RX filter
561 * IDs must be ordered by matching priority, for RX NFC semantics. 550 * IDs must be ordered by matching priority, for RX NFC semantics.
@@ -650,44 +639,111 @@ u32 efx_filter_get_rx_id_limit(struct efx_nic *efx)
650 * efx_filter_insert_filter - add or replace a filter 639 * efx_filter_insert_filter - add or replace a filter
651 * @efx: NIC in which to insert the filter 640 * @efx: NIC in which to insert the filter
652 * @spec: Specification for the filter 641 * @spec: Specification for the filter
653 * @replace: Flag for whether the specified filter may replace a filter 642 * @replace_equal: Flag for whether the specified filter may replace an
654 * with an identical match expression and equal or lower priority 643 * existing filter with equal priority
655 * 644 *
656 * On success, return the filter ID. 645 * On success, return the filter ID.
657 * On failure, return a negative error code. 646 * On failure, return a negative error code.
647 *
648 * If an existing filter has equal match values to the new filter
649 * spec, then the new filter might replace it, depending on the
650 * relative priorities. If the existing filter has lower priority, or
651 * if @replace_equal is set and it has equal priority, then it is
652 * replaced. Otherwise the function fails, returning -%EPERM if
653 * the existing filter has higher priority or -%EEXIST if it has
654 * equal priority.
658 */ 655 */
659s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec, 656s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
660 bool replace) 657 bool replace_equal)
661{ 658{
662 struct efx_filter_state *state = efx->filter_state; 659 struct efx_filter_state *state = efx->filter_state;
663 struct efx_filter_table *table = efx_filter_spec_table(state, spec); 660 struct efx_filter_table *table = efx_filter_spec_table(state, spec);
664 struct efx_filter_spec *saved_spec;
665 efx_oword_t filter; 661 efx_oword_t filter;
666 unsigned int filter_idx, depth = 0; 662 int rep_index, ins_index;
667 u32 key; 663 unsigned int depth = 0;
668 int rc; 664 int rc;
669 665
670 if (!table || table->size == 0) 666 if (!table || table->size == 0)
671 return -EINVAL; 667 return -EINVAL;
672 668
673 key = efx_filter_build(&filter, spec);
674
675 netif_vdbg(efx, hw, efx->net_dev, 669 netif_vdbg(efx, hw, efx->net_dev,
676 "%s: type %d search_depth=%d", __func__, spec->type, 670 "%s: type %d search_depth=%d", __func__, spec->type,
677 table->search_depth[spec->type]); 671 table->search_depth[spec->type]);
678 672
679 spin_lock_bh(&state->lock); 673 if (table->id == EFX_FILTER_TABLE_RX_DEF) {
674 /* One filter spec per type */
675 BUILD_BUG_ON(EFX_FILTER_INDEX_UC_DEF != 0);
676 BUILD_BUG_ON(EFX_FILTER_INDEX_MC_DEF !=
677 EFX_FILTER_MC_DEF - EFX_FILTER_UC_DEF);
678 rep_index = spec->type - EFX_FILTER_INDEX_UC_DEF;
679 ins_index = rep_index;
680 680
681 rc = efx_filter_search(table, spec, key, true, &depth); 681 spin_lock_bh(&state->lock);
682 if (rc < 0) 682 } else {
683 goto out; 683 /* Search concurrently for
684 filter_idx = rc; 684 * (1) a filter to be replaced (rep_index): any filter
685 BUG_ON(filter_idx >= table->size); 685 * with the same match values, up to the current
686 saved_spec = &table->spec[filter_idx]; 686 * search depth for this type, and
687 687 * (2) the insertion point (ins_index): (1) or any
688 if (test_bit(filter_idx, table->used_bitmap)) { 688 * free slot before it or up to the maximum search
689 /* Should we replace the existing filter? */ 689 * depth for this priority
690 if (!replace) { 690 * We fail if we cannot find (2).
691 *
692 * We can stop once either
693 * (a) we find (1), in which case we have definitely
694 * found (2) as well; or
695 * (b) we have searched exhaustively for (1), and have
696 * either found (2) or searched exhaustively for it
697 */
698 u32 key = efx_filter_build(&filter, spec);
699 unsigned int hash = efx_filter_hash(key);
700 unsigned int incr = efx_filter_increment(key);
701 unsigned int max_rep_depth = table->search_depth[spec->type];
702 unsigned int max_ins_depth =
703 spec->priority <= EFX_FILTER_PRI_HINT ?
704 FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX;
705 unsigned int i = hash & (table->size - 1);
706
707 ins_index = -1;
708 depth = 1;
709
710 spin_lock_bh(&state->lock);
711
712 for (;;) {
713 if (!test_bit(i, table->used_bitmap)) {
714 if (ins_index < 0)
715 ins_index = i;
716 } else if (efx_filter_equal(spec, &table->spec[i])) {
717 /* Case (a) */
718 if (ins_index < 0)
719 ins_index = i;
720 rep_index = i;
721 break;
722 }
723
724 if (depth >= max_rep_depth &&
725 (ins_index >= 0 || depth >= max_ins_depth)) {
726 /* Case (b) */
727 if (ins_index < 0) {
728 rc = -EBUSY;
729 goto out;
730 }
731 rep_index = -1;
732 break;
733 }
734
735 i = (i + incr) & (table->size - 1);
736 ++depth;
737 }
738 }
739
740 /* If we found a filter to be replaced, check whether we
741 * should do so
742 */
743 if (rep_index >= 0) {
744 struct efx_filter_spec *saved_spec = &table->spec[rep_index];
745
746 if (spec->priority == saved_spec->priority && !replace_equal) {
691 rc = -EEXIST; 747 rc = -EEXIST;
692 goto out; 748 goto out;
693 } 749 }
@@ -695,11 +751,14 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
695 rc = -EPERM; 751 rc = -EPERM;
696 goto out; 752 goto out;
697 } 753 }
698 } else { 754 }
699 __set_bit(filter_idx, table->used_bitmap); 755
756 /* Insert the filter */
757 if (ins_index != rep_index) {
758 __set_bit(ins_index, table->used_bitmap);
700 ++table->used; 759 ++table->used;
701 } 760 }
702 *saved_spec = *spec; 761 table->spec[ins_index] = *spec;
703 762
704 if (table->id == EFX_FILTER_TABLE_RX_DEF) { 763 if (table->id == EFX_FILTER_TABLE_RX_DEF) {
705 efx_filter_push_rx_config(efx); 764 efx_filter_push_rx_config(efx);
@@ -713,13 +772,19 @@ s32 efx_filter_insert_filter(struct efx_nic *efx, struct efx_filter_spec *spec,
713 } 772 }
714 773
715 efx_writeo(efx, &filter, 774 efx_writeo(efx, &filter,
716 table->offset + table->step * filter_idx); 775 table->offset + table->step * ins_index);
776
777 /* If we were able to replace a filter by inserting
778 * at a lower depth, clear the replaced filter
779 */
780 if (ins_index != rep_index && rep_index >= 0)
781 efx_filter_table_clear_entry(efx, table, rep_index);
717 } 782 }
718 783
719 netif_vdbg(efx, hw, efx->net_dev, 784 netif_vdbg(efx, hw, efx->net_dev,
720 "%s: filter type %d index %d rxq %u set", 785 "%s: filter type %d index %d rxq %u set",
721 __func__, spec->type, filter_idx, spec->dmaq_id); 786 __func__, spec->type, ins_index, spec->dmaq_id);
722 rc = efx_filter_make_id(spec, filter_idx); 787 rc = efx_filter_make_id(spec, ins_index);
723 788
724out: 789out:
725 spin_unlock_bh(&state->lock); 790 spin_unlock_bh(&state->lock);
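The search above is open addressing over a power-of-two table, stepping by a per-key increment and bounded by a maximum depth. Stripped of the replace/insert bookkeeping, the probe loop alone looks roughly like this (a simplified sketch, not the driver's code; it only looks for a free slot):

#include <linux/bitops.h>
#include <linux/errno.h>

static int probe_free_slot(const unsigned long *used_bitmap,
                           unsigned int table_size, /* power of two */
                           unsigned int hash, unsigned int incr,
                           unsigned int depth_max)
{
        unsigned int i = hash & (table_size - 1);
        unsigned int depth;

        for (depth = 1; depth <= depth_max; depth++) {
                if (!test_bit(i, used_bitmap))
                        return i;                       /* free slot found */
                i = (i + incr) & (table_size - 1);      /* next probe */
        }
        return -EBUSY;          /* no free slot within the search depth */
}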
@@ -1060,6 +1125,50 @@ void efx_remove_filters(struct efx_nic *efx)
1060 kfree(state); 1125 kfree(state);
1061} 1126}
1062 1127
1128/* Update scatter enable flags for filters pointing to our own RX queues */
1129void efx_filter_update_rx_scatter(struct efx_nic *efx)
1130{
1131 struct efx_filter_state *state = efx->filter_state;
1132 enum efx_filter_table_id table_id;
1133 struct efx_filter_table *table;
1134 efx_oword_t filter;
1135 unsigned int filter_idx;
1136
1137 spin_lock_bh(&state->lock);
1138
1139 for (table_id = EFX_FILTER_TABLE_RX_IP;
1140 table_id <= EFX_FILTER_TABLE_RX_DEF;
1141 table_id++) {
1142 table = &state->table[table_id];
1143
1144 for (filter_idx = 0; filter_idx < table->size; filter_idx++) {
1145 if (!test_bit(filter_idx, table->used_bitmap) ||
1146 table->spec[filter_idx].dmaq_id >=
1147 efx->n_rx_channels)
1148 continue;
1149
1150 if (efx->rx_scatter)
1151 table->spec[filter_idx].flags |=
1152 EFX_FILTER_FLAG_RX_SCATTER;
1153 else
1154 table->spec[filter_idx].flags &=
1155 ~EFX_FILTER_FLAG_RX_SCATTER;
1156
1157 if (table_id == EFX_FILTER_TABLE_RX_DEF)
1158 /* Pushed by efx_filter_push_rx_config() */
1159 continue;
1160
1161 efx_filter_build(&filter, &table->spec[filter_idx]);
1162 efx_writeo(efx, &filter,
1163 table->offset + table->step * filter_idx);
1164 }
1165 }
1166
1167 efx_filter_push_rx_config(efx);
1168
1169 spin_unlock_bh(&state->lock);
1170}
1171
1063#ifdef CONFIG_RFS_ACCEL 1172#ifdef CONFIG_RFS_ACCEL
1064 1173
1065int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, 1174int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h
index 9d426d0457bd..c5c9747861ba 100644
--- a/drivers/net/ethernet/sfc/mcdi_pcol.h
+++ b/drivers/net/ethernet/sfc/mcdi_pcol.h
@@ -553,6 +553,7 @@
553#define MC_CMD_PTP_MODE_V1_VLAN 0x1 /* enum */ 553#define MC_CMD_PTP_MODE_V1_VLAN 0x1 /* enum */
554#define MC_CMD_PTP_MODE_V2 0x2 /* enum */ 554#define MC_CMD_PTP_MODE_V2 0x2 /* enum */
555#define MC_CMD_PTP_MODE_V2_VLAN 0x3 /* enum */ 555#define MC_CMD_PTP_MODE_V2_VLAN 0x3 /* enum */
556#define MC_CMD_PTP_MODE_V2_ENHANCED 0x4 /* enum */
556 557
557/* MC_CMD_PTP_IN_DISABLE msgrequest */ 558/* MC_CMD_PTP_IN_DISABLE msgrequest */
558#define MC_CMD_PTP_IN_DISABLE_LEN 8 559#define MC_CMD_PTP_IN_DISABLE_LEN 8
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 0a90abd2421b..9bd433a095c5 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -69,6 +69,12 @@
69#define EFX_TXQ_TYPES 4 69#define EFX_TXQ_TYPES 4
70#define EFX_MAX_TX_QUEUES (EFX_TXQ_TYPES * EFX_MAX_CHANNELS) 70#define EFX_MAX_TX_QUEUES (EFX_TXQ_TYPES * EFX_MAX_CHANNELS)
71 71
72/* Maximum possible MTU the driver supports */
73#define EFX_MAX_MTU (9 * 1024)
74
75/* Size of an RX scatter buffer. Small enough to pack 2 into a 4K page. */
76#define EFX_RX_USR_BUF_SIZE 1824
77
72/* Forward declare Precision Time Protocol (PTP) support structure. */ 78/* Forward declare Precision Time Protocol (PTP) support structure. */
73struct efx_ptp_data; 79struct efx_ptp_data;
74 80
@@ -206,25 +212,23 @@ struct efx_tx_queue {
206/** 212/**
207 * struct efx_rx_buffer - An Efx RX data buffer 213 * struct efx_rx_buffer - An Efx RX data buffer
208 * @dma_addr: DMA base address of the buffer 214 * @dma_addr: DMA base address of the buffer
209 * @skb: The associated socket buffer. Valid iff !(@flags & %EFX_RX_BUF_PAGE). 215 * @page: The associated page buffer.
210 * Will be %NULL if the buffer slot is currently free.
211 * @page: The associated page buffer. Valif iff @flags & %EFX_RX_BUF_PAGE.
212 * Will be %NULL if the buffer slot is currently free. 216 * Will be %NULL if the buffer slot is currently free.
213 * @page_offset: Offset within page. Valid iff @flags & %EFX_RX_BUF_PAGE. 217 * @page_offset: If pending: offset in @page of DMA base address.
214 * @len: Buffer length, in bytes. 218 * If completed: offset in @page of Ethernet header.
215 * @flags: Flags for buffer and packet state. 219 * @len: If pending: length for DMA descriptor.
220 * If completed: received length, excluding hash prefix.
221 * @flags: Flags for buffer and packet state. These are only set on the
222 * first buffer of a scattered packet.
216 */ 223 */
217struct efx_rx_buffer { 224struct efx_rx_buffer {
218 dma_addr_t dma_addr; 225 dma_addr_t dma_addr;
219 union { 226 struct page *page;
220 struct sk_buff *skb;
221 struct page *page;
222 } u;
223 u16 page_offset; 227 u16 page_offset;
224 u16 len; 228 u16 len;
225 u16 flags; 229 u16 flags;
226}; 230};
227#define EFX_RX_BUF_PAGE 0x0001 231#define EFX_RX_BUF_LAST_IN_PAGE 0x0001
228#define EFX_RX_PKT_CSUMMED 0x0002 232#define EFX_RX_PKT_CSUMMED 0x0002
229#define EFX_RX_PKT_DISCARD 0x0004 233#define EFX_RX_PKT_DISCARD 0x0004
230 234
@@ -260,14 +264,23 @@ struct efx_rx_page_state {
260 * @added_count: Number of buffers added to the receive queue. 264 * @added_count: Number of buffers added to the receive queue.
261 * @notified_count: Number of buffers given to NIC (<= @added_count). 265 * @notified_count: Number of buffers given to NIC (<= @added_count).
262 * @removed_count: Number of buffers removed from the receive queue. 266 * @removed_count: Number of buffers removed from the receive queue.
267 * @scatter_n: Number of buffers used by current packet
268 * @page_ring: The ring to store DMA mapped pages for reuse.
269 * @page_add: Counter to calculate the write pointer for the recycle ring.
270 * @page_remove: Counter to calculate the read pointer for the recycle ring.
271 * @page_recycle_count: The number of pages that have been recycled.
272 * @page_recycle_failed: The number of pages that couldn't be recycled because
273 * the kernel still held a reference to them.
274 * @page_recycle_full: The number of pages that were released because the
275 * recycle ring was full.
276 * @page_ptr_mask: The number of pages in the RX recycle ring minus 1.
263 * @max_fill: RX descriptor maximum fill level (<= ring size) 277 * @max_fill: RX descriptor maximum fill level (<= ring size)
264 * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill 278 * @fast_fill_trigger: RX descriptor fill level that will trigger a fast fill
265 * (<= @max_fill) 279 * (<= @max_fill)
266 * @min_fill: RX descriptor minimum non-zero fill level. 280 * @min_fill: RX descriptor minimum non-zero fill level.
267 * This records the minimum fill level observed when a ring 281 * This records the minimum fill level observed when a ring
268 * refill was triggered. 282 * refill was triggered.
269 * @alloc_page_count: RX allocation strategy counter. 283 * @recycle_count: RX buffer recycle counter.
270 * @alloc_skb_count: RX allocation strategy counter.
271 * @slow_fill: Timer used to defer efx_nic_generate_fill_event(). 284 * @slow_fill: Timer used to defer efx_nic_generate_fill_event().
272 */ 285 */
273struct efx_rx_queue { 286struct efx_rx_queue {
@@ -279,15 +292,22 @@ struct efx_rx_queue {
279 bool enabled; 292 bool enabled;
280 bool flush_pending; 293 bool flush_pending;
281 294
282 int added_count; 295 unsigned int added_count;
283 int notified_count; 296 unsigned int notified_count;
284 int removed_count; 297 unsigned int removed_count;
298 unsigned int scatter_n;
299 struct page **page_ring;
300 unsigned int page_add;
301 unsigned int page_remove;
302 unsigned int page_recycle_count;
303 unsigned int page_recycle_failed;
304 unsigned int page_recycle_full;
305 unsigned int page_ptr_mask;
285 unsigned int max_fill; 306 unsigned int max_fill;
286 unsigned int fast_fill_trigger; 307 unsigned int fast_fill_trigger;
287 unsigned int min_fill; 308 unsigned int min_fill;
288 unsigned int min_overfill; 309 unsigned int min_overfill;
289 unsigned int alloc_page_count; 310 unsigned int recycle_count;
290 unsigned int alloc_skb_count;
291 struct timer_list slow_fill; 311 struct timer_list slow_fill;
292 unsigned int slow_fill_count; 312 unsigned int slow_fill_count;
293}; 313};
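The recycle-ring fields follow the driver's usual free-running-counter convention: @page_ptr_mask is a power-of-two ring size minus one, and @page_add/@page_remove only ever advance. A plausible sketch of how the ring would be indexed under that assumption (the helper is hypothetical; the real code lives in rx.c, which is not shown in these hunks):

static unsigned int recycle_ring_index(const struct efx_rx_queue *rx_queue,
                                       unsigned int counter)
{
        /* counter is page_add on the write side or page_remove on the read
         * side; masking wraps it onto the page_ring array.
         */
        return counter & rx_queue->page_ptr_mask;
}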
@@ -336,10 +356,6 @@ enum efx_rx_alloc_method {
336 * @event_test_cpu: Last CPU to handle interrupt or test event for this channel 356 * @event_test_cpu: Last CPU to handle interrupt or test event for this channel
337 * @irq_count: Number of IRQs since last adaptive moderation decision 357 * @irq_count: Number of IRQs since last adaptive moderation decision
338 * @irq_mod_score: IRQ moderation score 358 * @irq_mod_score: IRQ moderation score
339 * @rx_alloc_level: Watermark based heuristic counter for pushing descriptors
340 * and diagnostic counters
341 * @rx_alloc_push_pages: RX allocation method currently in use for pushing
342 * descriptors
343 * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors 359 * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors
344 * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors 360 * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors
345 * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors 361 * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors
@@ -347,6 +363,12 @@ enum efx_rx_alloc_method {
347 * @n_rx_frm_trunc: Count of RX_FRM_TRUNC errors 363 * @n_rx_frm_trunc: Count of RX_FRM_TRUNC errors
348 * @n_rx_overlength: Count of RX_OVERLENGTH errors 364 * @n_rx_overlength: Count of RX_OVERLENGTH errors
349 * @n_skbuff_leaks: Count of skbuffs leaked due to RX overrun 365 * @n_skbuff_leaks: Count of skbuffs leaked due to RX overrun
366 * @n_rx_nodesc_trunc: Number of RX packets truncated and then dropped due to
367 * lack of descriptors
368 * @rx_pkt_n_frags: Number of fragments in next packet to be delivered by
369 * __efx_rx_packet(), or zero if there is none
370 * @rx_pkt_index: Ring index of first buffer for next packet to be delivered
371 * by __efx_rx_packet(), if @rx_pkt_n_frags != 0
350 * @rx_queue: RX queue for this channel 372 * @rx_queue: RX queue for this channel
351 * @tx_queue: TX queues for this channel 373 * @tx_queue: TX queues for this channel
352 */ 374 */
@@ -371,9 +393,6 @@ struct efx_channel {
371 unsigned int rfs_filters_added; 393 unsigned int rfs_filters_added;
372#endif 394#endif
373 395
374 int rx_alloc_level;
375 int rx_alloc_push_pages;
376
377 unsigned n_rx_tobe_disc; 396 unsigned n_rx_tobe_disc;
378 unsigned n_rx_ip_hdr_chksum_err; 397 unsigned n_rx_ip_hdr_chksum_err;
379 unsigned n_rx_tcp_udp_chksum_err; 398 unsigned n_rx_tcp_udp_chksum_err;
@@ -381,11 +400,10 @@ struct efx_channel {
381 unsigned n_rx_frm_trunc; 400 unsigned n_rx_frm_trunc;
382 unsigned n_rx_overlength; 401 unsigned n_rx_overlength;
383 unsigned n_skbuff_leaks; 402 unsigned n_skbuff_leaks;
403 unsigned int n_rx_nodesc_trunc;
384 404
385 /* Used to pipeline received packets in order to optimise memory 405 unsigned int rx_pkt_n_frags;
386 * access with prefetches. 406 unsigned int rx_pkt_index;
387 */
388 struct efx_rx_buffer *rx_pkt;
389 407
390 struct efx_rx_queue rx_queue; 408 struct efx_rx_queue rx_queue;
391 struct efx_tx_queue tx_queue[EFX_TXQ_TYPES]; 409 struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
@@ -410,7 +428,7 @@ struct efx_channel_type {
410 void (*post_remove)(struct efx_channel *); 428 void (*post_remove)(struct efx_channel *);
411 void (*get_name)(struct efx_channel *, char *buf, size_t len); 429 void (*get_name)(struct efx_channel *, char *buf, size_t len);
412 struct efx_channel *(*copy)(const struct efx_channel *); 430 struct efx_channel *(*copy)(const struct efx_channel *);
413 void (*receive_skb)(struct efx_channel *, struct sk_buff *); 431 bool (*receive_skb)(struct efx_channel *, struct sk_buff *);
414 bool keep_eventq; 432 bool keep_eventq;
415}; 433};
416 434
@@ -446,6 +464,7 @@ enum nic_state {
446 STATE_UNINIT = 0, /* device being probed/removed or is frozen */ 464 STATE_UNINIT = 0, /* device being probed/removed or is frozen */
447 STATE_READY = 1, /* hardware ready and netdev registered */ 465 STATE_READY = 1, /* hardware ready and netdev registered */
448 STATE_DISABLED = 2, /* device disabled due to hardware errors */ 466 STATE_DISABLED = 2, /* device disabled due to hardware errors */
467 STATE_RECOVERY = 3, /* device recovering from PCI error */
449}; 468};
450 469
451/* 470/*
@@ -684,10 +703,13 @@ struct vfdi_status;
684 * @n_channels: Number of channels in use 703 * @n_channels: Number of channels in use
685 * @n_rx_channels: Number of channels used for RX (= number of RX queues) 704 * @n_rx_channels: Number of channels used for RX (= number of RX queues)
686 * @n_tx_channels: Number of channels used for TX 705 * @n_tx_channels: Number of channels used for TX
687 * @rx_buffer_len: RX buffer length 706 * @rx_dma_len: Current maximum RX DMA length
688 * @rx_buffer_order: Order (log2) of number of pages for each RX buffer 707 * @rx_buffer_order: Order (log2) of number of pages for each RX buffer
708 * @rx_buffer_truesize: Amortised allocation size of an RX buffer,
709 * for use in sk_buff::truesize
689 * @rx_hash_key: Toeplitz hash key for RSS 710 * @rx_hash_key: Toeplitz hash key for RSS
690 * @rx_indir_table: Indirection table for RSS 711 * @rx_indir_table: Indirection table for RSS
712 * @rx_scatter: Scatter mode enabled for receives
691 * @int_error_count: Number of internal errors seen recently 713 * @int_error_count: Number of internal errors seen recently
692 * @int_error_expire: Time at which error count will be expired 714 * @int_error_expire: Time at which error count will be expired
693 * @irq_status: Interrupt status buffer 715 * @irq_status: Interrupt status buffer
@@ -800,10 +822,15 @@ struct efx_nic {
800 unsigned rss_spread; 822 unsigned rss_spread;
801 unsigned tx_channel_offset; 823 unsigned tx_channel_offset;
802 unsigned n_tx_channels; 824 unsigned n_tx_channels;
803 unsigned int rx_buffer_len; 825 unsigned int rx_dma_len;
804 unsigned int rx_buffer_order; 826 unsigned int rx_buffer_order;
827 unsigned int rx_buffer_truesize;
828 unsigned int rx_page_buf_step;
829 unsigned int rx_bufs_per_page;
830 unsigned int rx_pages_per_batch;
805 u8 rx_hash_key[40]; 831 u8 rx_hash_key[40];
806 u32 rx_indir_table[128]; 832 u32 rx_indir_table[128];
833 bool rx_scatter;
807 834
808 unsigned int_error_count; 835 unsigned int_error_count;
809 unsigned long int_error_expire; 836 unsigned long int_error_expire;
@@ -934,8 +961,9 @@ static inline unsigned int efx_port_num(struct efx_nic *efx)
934 * @evq_ptr_tbl_base: Event queue pointer table base address 961 * @evq_ptr_tbl_base: Event queue pointer table base address
935 * @evq_rptr_tbl_base: Event queue read-pointer table base address 962 * @evq_rptr_tbl_base: Event queue read-pointer table base address
936 * @max_dma_mask: Maximum possible DMA mask 963 * @max_dma_mask: Maximum possible DMA mask
937 * @rx_buffer_hash_size: Size of hash at start of RX buffer 964 * @rx_buffer_hash_size: Size of hash at start of RX packet
938 * @rx_buffer_padding: Size of padding at end of RX buffer 965 * @rx_buffer_padding: Size of padding at end of RX packet
966 * @can_rx_scatter: NIC is able to scatter packet to multiple buffers
939 * @max_interrupt_mode: Highest capability interrupt mode supported 967 * @max_interrupt_mode: Highest capability interrupt mode supported
940 * from &enum efx_init_mode. 968 * from &enum efx_init_mode.
941 * @phys_addr_channels: Number of channels with physically addressed 969 * @phys_addr_channels: Number of channels with physically addressed
@@ -983,6 +1011,7 @@ struct efx_nic_type {
983 u64 max_dma_mask; 1011 u64 max_dma_mask;
984 unsigned int rx_buffer_hash_size; 1012 unsigned int rx_buffer_hash_size;
985 unsigned int rx_buffer_padding; 1013 unsigned int rx_buffer_padding;
1014 bool can_rx_scatter;
986 unsigned int max_interrupt_mode; 1015 unsigned int max_interrupt_mode;
987 unsigned int phys_addr_channels; 1016 unsigned int phys_addr_channels;
988 unsigned int timer_period_max; 1017 unsigned int timer_period_max;
diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c
index 0ad790cc473c..f9f5df8b51fe 100644
--- a/drivers/net/ethernet/sfc/nic.c
+++ b/drivers/net/ethernet/sfc/nic.c
@@ -591,12 +591,22 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
591 struct efx_nic *efx = rx_queue->efx; 591 struct efx_nic *efx = rx_queue->efx;
592 bool is_b0 = efx_nic_rev(efx) >= EFX_REV_FALCON_B0; 592 bool is_b0 = efx_nic_rev(efx) >= EFX_REV_FALCON_B0;
593 bool iscsi_digest_en = is_b0; 593 bool iscsi_digest_en = is_b0;
594 bool jumbo_en;
595
596 /* For kernel-mode queues in Falcon A1, the JUMBO flag enables
597 * DMA to continue after a PCIe page boundary (and scattering
598 * is not possible). In Falcon B0 and Siena, it enables
599 * scatter.
600 */
601 jumbo_en = !is_b0 || efx->rx_scatter;
594 602
595 netif_dbg(efx, hw, efx->net_dev, 603 netif_dbg(efx, hw, efx->net_dev,
596 "RX queue %d ring in special buffers %d-%d\n", 604 "RX queue %d ring in special buffers %d-%d\n",
597 efx_rx_queue_index(rx_queue), rx_queue->rxd.index, 605 efx_rx_queue_index(rx_queue), rx_queue->rxd.index,
598 rx_queue->rxd.index + rx_queue->rxd.entries - 1); 606 rx_queue->rxd.index + rx_queue->rxd.entries - 1);
599 607
608 rx_queue->scatter_n = 0;
609
600 /* Pin RX descriptor ring */ 610 /* Pin RX descriptor ring */
601 efx_init_special_buffer(efx, &rx_queue->rxd); 611 efx_init_special_buffer(efx, &rx_queue->rxd);
602 612
@@ -613,8 +623,7 @@ void efx_nic_init_rx(struct efx_rx_queue *rx_queue)
613 FRF_AZ_RX_DESCQ_SIZE, 623 FRF_AZ_RX_DESCQ_SIZE,
614 __ffs(rx_queue->rxd.entries), 624 __ffs(rx_queue->rxd.entries),
615 FRF_AZ_RX_DESCQ_TYPE, 0 /* kernel queue */ , 625 FRF_AZ_RX_DESCQ_TYPE, 0 /* kernel queue */ ,
616 /* For >=B0 this is scatter so disable */ 626 FRF_AZ_RX_DESCQ_JUMBO, jumbo_en,
617 FRF_AZ_RX_DESCQ_JUMBO, !is_b0,
618 FRF_AZ_RX_DESCQ_EN, 1); 627 FRF_AZ_RX_DESCQ_EN, 1);
619 efx_writeo_table(efx, &rx_desc_ptr, efx->type->rxd_ptr_tbl_base, 628 efx_writeo_table(efx, &rx_desc_ptr, efx->type->rxd_ptr_tbl_base,
620 efx_rx_queue_index(rx_queue)); 629 efx_rx_queue_index(rx_queue));
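
The hunks above replace the hard-coded "!is_b0" JUMBO setting with a per-queue decision: on Falcon A1 the flag only lets DMA continue across a PCIe page boundary, while on Falcon B0 and Siena it enables RX scatter. A minimal stand-alone sketch of that decision (the names is_b0 and rx_scatter mirror the diff; the cases printed are illustrative only, not an exhaustive hardware matrix):

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the jumbo_en decision added to efx_nic_init_rx():
 * - Falcon A1 (!is_b0): JUMBO must always be set for kernel-mode
 *   queues so DMA can cross a PCIe page boundary (no scatter).
 * - Falcon B0 / Siena (is_b0): JUMBO enables RX scatter, so it is
 *   set only when the driver has enabled scatter mode.
 */
static bool jumbo_flag(bool is_b0, bool rx_scatter)
{
        return !is_b0 || rx_scatter;
}

int main(void)
{
        printf("A1, no scatter -> %d\n", jumbo_flag(false, false)); /* 1 */
        printf("B0, no scatter -> %d\n", jumbo_flag(true, false));  /* 0 */
        printf("B0, scatter on -> %d\n", jumbo_flag(true, true));   /* 1 */
        return 0;
}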
@@ -968,13 +977,24 @@ static u16 efx_handle_rx_not_ok(struct efx_rx_queue *rx_queue,
968 EFX_RX_PKT_DISCARD : 0; 977 EFX_RX_PKT_DISCARD : 0;
969} 978}
970 979
971/* Handle receive events that are not in-order. */ 980/* Handle receive events that are not in-order. Return true if this
972static void 981 * can be handled as a partial packet discard, false if it's more
982 * serious.
983 */
984static bool
973efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index) 985efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)
974{ 986{
987 struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
975 struct efx_nic *efx = rx_queue->efx; 988 struct efx_nic *efx = rx_queue->efx;
976 unsigned expected, dropped; 989 unsigned expected, dropped;
977 990
991 if (rx_queue->scatter_n &&
992 index == ((rx_queue->removed_count + rx_queue->scatter_n - 1) &
993 rx_queue->ptr_mask)) {
994 ++channel->n_rx_nodesc_trunc;
995 return true;
996 }
997
978 expected = rx_queue->removed_count & rx_queue->ptr_mask; 998 expected = rx_queue->removed_count & rx_queue->ptr_mask;
979 dropped = (index - expected) & rx_queue->ptr_mask; 999 dropped = (index - expected) & rx_queue->ptr_mask;
980 netif_info(efx, rx_err, efx->net_dev, 1000 netif_info(efx, rx_err, efx->net_dev,
@@ -983,6 +1003,7 @@ efx_handle_rx_bad_index(struct efx_rx_queue *rx_queue, unsigned index)
983 1003
984 efx_schedule_reset(efx, EFX_WORKAROUND_5676(efx) ? 1004 efx_schedule_reset(efx, EFX_WORKAROUND_5676(efx) ?
985 RESET_TYPE_RX_RECOVERY : RESET_TYPE_DISABLE); 1005 RESET_TYPE_RX_RECOVERY : RESET_TYPE_DISABLE);
1006 return false;
986} 1007}
987 1008
988/* Handle a packet received event 1009/* Handle a packet received event
@@ -998,7 +1019,7 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
998 unsigned int rx_ev_desc_ptr, rx_ev_byte_cnt; 1019 unsigned int rx_ev_desc_ptr, rx_ev_byte_cnt;
999 unsigned int rx_ev_hdr_type, rx_ev_mcast_pkt; 1020 unsigned int rx_ev_hdr_type, rx_ev_mcast_pkt;
1000 unsigned expected_ptr; 1021 unsigned expected_ptr;
1001 bool rx_ev_pkt_ok; 1022 bool rx_ev_pkt_ok, rx_ev_sop, rx_ev_cont;
1002 u16 flags; 1023 u16 flags;
1003 struct efx_rx_queue *rx_queue; 1024 struct efx_rx_queue *rx_queue;
1004 struct efx_nic *efx = channel->efx; 1025 struct efx_nic *efx = channel->efx;
@@ -1006,21 +1027,56 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
1006 if (unlikely(ACCESS_ONCE(efx->reset_pending))) 1027 if (unlikely(ACCESS_ONCE(efx->reset_pending)))
1007 return; 1028 return;
1008 1029
1009 /* Basic packet information */ 1030 rx_ev_cont = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT);
1010 rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT); 1031 rx_ev_sop = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP);
1011 rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK);
1012 rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE);
1013 WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT));
1014 WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_SOP) != 1);
1015 WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_Q_LABEL) != 1032 WARN_ON(EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_Q_LABEL) !=
1016 channel->channel); 1033 channel->channel);
1017 1034
1018 rx_queue = efx_channel_get_rx_queue(channel); 1035 rx_queue = efx_channel_get_rx_queue(channel);
1019 1036
1020 rx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_DESC_PTR); 1037 rx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_DESC_PTR);
1021 expected_ptr = rx_queue->removed_count & rx_queue->ptr_mask; 1038 expected_ptr = ((rx_queue->removed_count + rx_queue->scatter_n) &
1022 if (unlikely(rx_ev_desc_ptr != expected_ptr)) 1039 rx_queue->ptr_mask);
1023 efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr); 1040
1041 /* Check for partial drops and other errors */
1042 if (unlikely(rx_ev_desc_ptr != expected_ptr) ||
1043 unlikely(rx_ev_sop != (rx_queue->scatter_n == 0))) {
1044 if (rx_ev_desc_ptr != expected_ptr &&
1045 !efx_handle_rx_bad_index(rx_queue, rx_ev_desc_ptr))
1046 return;
1047
1048 /* Discard all pending fragments */
1049 if (rx_queue->scatter_n) {
1050 efx_rx_packet(
1051 rx_queue,
1052 rx_queue->removed_count & rx_queue->ptr_mask,
1053 rx_queue->scatter_n, 0, EFX_RX_PKT_DISCARD);
1054 rx_queue->removed_count += rx_queue->scatter_n;
1055 rx_queue->scatter_n = 0;
1056 }
1057
1058 /* Return if there is no new fragment */
1059 if (rx_ev_desc_ptr != expected_ptr)
1060 return;
1061
1062 /* Discard new fragment if not SOP */
1063 if (!rx_ev_sop) {
1064 efx_rx_packet(
1065 rx_queue,
1066 rx_queue->removed_count & rx_queue->ptr_mask,
1067 1, 0, EFX_RX_PKT_DISCARD);
1068 ++rx_queue->removed_count;
1069 return;
1070 }
1071 }
1072
1073 ++rx_queue->scatter_n;
1074 if (rx_ev_cont)
1075 return;
1076
1077 rx_ev_byte_cnt = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_BYTE_CNT);
1078 rx_ev_pkt_ok = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_PKT_OK);
1079 rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE);
1024 1080
1025 if (likely(rx_ev_pkt_ok)) { 1081 if (likely(rx_ev_pkt_ok)) {
1026 /* If packet is marked as OK and packet type is TCP/IP or 1082 /* If packet is marked as OK and packet type is TCP/IP or
@@ -1048,7 +1104,11 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
1048 channel->irq_mod_score += 2; 1104 channel->irq_mod_score += 2;
1049 1105
1050 /* Handle received packet */ 1106 /* Handle received packet */
1051 efx_rx_packet(rx_queue, rx_ev_desc_ptr, rx_ev_byte_cnt, flags); 1107 efx_rx_packet(rx_queue,
1108 rx_queue->removed_count & rx_queue->ptr_mask,
1109 rx_queue->scatter_n, rx_ev_byte_cnt, flags);
1110 rx_queue->removed_count += rx_queue->scatter_n;
1111 rx_queue->scatter_n = 0;
1052} 1112}
1053 1113
1054/* If this flush done event corresponds to a &struct efx_tx_queue, then 1114/* If this flush done event corresponds to a &struct efx_tx_queue, then
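
Taken together, the efx_handle_rx_event() changes above turn the per-descriptor handler into a small accumulator: expected_ptr now includes the fragments already collected in scatter_n, a mismatched SOP flag or descriptor pointer discards the partial packet (and efx_handle_rx_bad_index() additionally treats an event pointing at the last collected descriptor as a no-descriptor truncation), and the packet is only delivered once an event arrives with JUMBO_CONT clear. A simplified user-space sketch of that accumulation, with the reset/bad-index path omitted and efx_rx_packet() reduced to a print; the ring mask and the event sequence in main() are illustrative only:

#include <stdbool.h>
#include <stdio.h>

#define PTR_MASK 0xff                    /* illustrative ring mask */

struct rxq {
        unsigned removed_count;
        unsigned scatter_n;
};

/* Stand-in for efx_rx_packet(): just report what would be delivered. */
static void deliver(struct rxq *q, unsigned n_frags, unsigned len, bool discard)
{
        printf("deliver idx=%u frags=%u len=%u%s\n",
               q->removed_count & PTR_MASK, n_frags, len,
               discard ? " [DISCARD]" : "");
        q->removed_count += n_frags;
}

/* Simplified version of the SOP/CONT handling added above. */
static void handle_rx_event(struct rxq *q, bool sop, bool cont,
                            unsigned desc_ptr, unsigned byte_cnt)
{
        unsigned expected = (q->removed_count + q->scatter_n) & PTR_MASK;

        if (desc_ptr != expected || sop != (q->scatter_n == 0)) {
                if (q->scatter_n) {              /* drop the partial packet */
                        deliver(q, q->scatter_n, 0, true);
                        q->scatter_n = 0;
                }
                if (desc_ptr != expected)
                        return;                  /* real code may reset here */
                if (!sop) {                      /* stray continuation */
                        deliver(q, 1, 0, true);
                        return;
                }
        }

        ++q->scatter_n;
        if (cont)                                /* more fragments to come */
                return;

        deliver(q, q->scatter_n, byte_cnt, false);
        q->scatter_n = 0;
}

int main(void)
{
        struct rxq q = { 0, 0 };

        /* A three-fragment packet followed by a single-fragment packet. */
        handle_rx_event(&q, true,  true,  0, 0);
        handle_rx_event(&q, false, true,  1, 0);
        handle_rx_event(&q, false, false, 2, 9000);
        handle_rx_event(&q, true,  false, 3, 60);
        return 0;
}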
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 3f93624fc273..07f6baa15c0c 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -99,6 +99,9 @@
99#define PTP_V2_VERSION_LENGTH 1 99#define PTP_V2_VERSION_LENGTH 1
100#define PTP_V2_VERSION_OFFSET 29 100#define PTP_V2_VERSION_OFFSET 29
101 101
102#define PTP_V2_UUID_LENGTH 8
103#define PTP_V2_UUID_OFFSET 48
104
102/* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2), 105/* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2),
103 * the MC only captures the last six bytes of the clock identity. These values 106 * the MC only captures the last six bytes of the clock identity. These values
104 * reflect those, not the ones used in the standard. The standard permits 107 * reflect those, not the ones used in the standard. The standard permits
@@ -429,13 +432,10 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
429 unsigned number_readings = (response_length / 432 unsigned number_readings = (response_length /
430 MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN); 433 MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN);
431 unsigned i; 434 unsigned i;
432 unsigned min;
433 unsigned min_set = 0;
434 unsigned total; 435 unsigned total;
435 unsigned ngood = 0; 436 unsigned ngood = 0;
436 unsigned last_good = 0; 437 unsigned last_good = 0;
437 struct efx_ptp_data *ptp = efx->ptp_data; 438 struct efx_ptp_data *ptp = efx->ptp_data;
438 bool min_valid = false;
439 u32 last_sec; 439 u32 last_sec;
440 u32 start_sec; 440 u32 start_sec;
441 struct timespec delta; 441 struct timespec delta;
@@ -443,35 +443,17 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
443 if (number_readings == 0) 443 if (number_readings == 0)
444 return -EAGAIN; 444 return -EAGAIN;
445 445
446 /* Find minimum value in this set of results, discarding clearly 446 /* Read the set of results and increment stats for any results that
447 * erroneous results. 447 * appear to be erroneous.
448 */ 448 */
449 for (i = 0; i < number_readings; i++) { 449 for (i = 0; i < number_readings; i++) {
450 efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]); 450 efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]);
451 synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN; 451 synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN;
452 if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) {
453 if (min_valid) {
454 if (ptp->timeset[i].window < min_set)
455 min_set = ptp->timeset[i].window;
456 } else {
457 min_valid = true;
458 min_set = ptp->timeset[i].window;
459 }
460 }
461 }
462
463 if (min_valid) {
464 if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns))
465 min = ptp->base_sync_ns;
466 else
467 min = min_set;
468 } else {
469 min = SYNCHRONISATION_GRANULARITY_NS;
470 } 452 }
471 453
472 /* Discard excessively long synchronise durations. The MC times 454 /* Find the last good host-MC synchronization result. The MC times
473 * when it finishes reading the host time so the corrected window 455 * when it finishes reading the host time so the corrected window time
474 * time should be fairly constant for a given platform. 456 * should be fairly constant for a given platform.
475 */ 457 */
476 total = 0; 458 total = 0;
477 for (i = 0; i < number_readings; i++) 459 for (i = 0; i < number_readings; i++)
@@ -489,8 +471,8 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
489 471
490 if (ngood == 0) { 472 if (ngood == 0) {
491 netif_warn(efx, drv, efx->net_dev, 473 netif_warn(efx, drv, efx->net_dev,
492 "PTP no suitable synchronisations %dns %dns\n", 474 "PTP no suitable synchronisations %dns\n",
493 ptp->base_sync_ns, min_set); 475 ptp->base_sync_ns);
494 return -EAGAIN; 476 return -EAGAIN;
495 } 477 }
496 478
@@ -1006,43 +988,53 @@ bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
1006 * the receive timestamp from the MC - this will probably occur after the 988 * the receive timestamp from the MC - this will probably occur after the
1007 * packet arrival because of the processing in the MC. 989 * packet arrival because of the processing in the MC.
1008 */ 990 */
1009static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb) 991static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
1010{ 992{
1011 struct efx_nic *efx = channel->efx; 993 struct efx_nic *efx = channel->efx;
1012 struct efx_ptp_data *ptp = efx->ptp_data; 994 struct efx_ptp_data *ptp = efx->ptp_data;
1013 struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb; 995 struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
1014 u8 *data; 996 u8 *match_data_012, *match_data_345;
1015 unsigned int version; 997 unsigned int version;
1016 998
1017 match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS); 999 match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
1018 1000
1019 /* Correct version? */ 1001 /* Correct version? */
1020 if (ptp->mode == MC_CMD_PTP_MODE_V1) { 1002 if (ptp->mode == MC_CMD_PTP_MODE_V1) {
1021 if (skb->len < PTP_V1_MIN_LENGTH) { 1003 if (!pskb_may_pull(skb, PTP_V1_MIN_LENGTH)) {
1022 netif_receive_skb(skb); 1004 return false;
1023 return;
1024 } 1005 }
1025 version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]); 1006 version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
1026 if (version != PTP_VERSION_V1) { 1007 if (version != PTP_VERSION_V1) {
1027 netif_receive_skb(skb); 1008 return false;
1028 return;
1029 } 1009 }
1010
1011 /* PTP V1 uses all six bytes of the UUID to match the packet
1012 * to the timestamp
1013 */
1014 match_data_012 = skb->data + PTP_V1_UUID_OFFSET;
1015 match_data_345 = skb->data + PTP_V1_UUID_OFFSET + 3;
1030 } else { 1016 } else {
1031 if (skb->len < PTP_V2_MIN_LENGTH) { 1017 if (!pskb_may_pull(skb, PTP_V2_MIN_LENGTH)) {
1032 netif_receive_skb(skb); 1018 return false;
1033 return;
1034 } 1019 }
1035 version = skb->data[PTP_V2_VERSION_OFFSET]; 1020 version = skb->data[PTP_V2_VERSION_OFFSET];
1036
1037 BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2);
1038 BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET);
1039 BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH);
1040 BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
1041 BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
1042
1043 if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) { 1021 if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
1044 netif_receive_skb(skb); 1022 return false;
1045 return; 1023 }
1024
1025 /* The original V2 implementation uses bytes 2-7 of
1026 * the UUID to match the packet to the timestamp. This
1027 * discards two of the bytes of the MAC address used
1028 * to create the UUID (SF bug 33070). The PTP V2
1029 * enhanced mode fixes this issue and uses bytes 0-2
1030 * and bytes 5-7 of the UUID.
1031 */
1032 match_data_345 = skb->data + PTP_V2_UUID_OFFSET + 5;
1033 if (ptp->mode == MC_CMD_PTP_MODE_V2) {
1034 match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 2;
1035 } else {
1036 match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 0;
1037 BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2_ENHANCED);
1046 } 1038 }
1047 } 1039 }
1048 1040
@@ -1056,14 +1048,19 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
1056 timestamps = skb_hwtstamps(skb); 1048 timestamps = skb_hwtstamps(skb);
1057 memset(timestamps, 0, sizeof(*timestamps)); 1049 memset(timestamps, 0, sizeof(*timestamps));
1058 1050
1051 /* We expect the sequence number to be in the same position in
1052 * the packet for PTP V1 and V2
1053 */
1054 BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
1055 BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
1056
1059 /* Extract UUID/Sequence information */ 1057 /* Extract UUID/Sequence information */
1060 data = skb->data + PTP_V1_UUID_OFFSET; 1058 match->words[0] = (match_data_012[0] |
1061 match->words[0] = (data[0] | 1059 (match_data_012[1] << 8) |
1062 (data[1] << 8) | 1060 (match_data_012[2] << 16) |
1063 (data[2] << 16) | 1061 (match_data_345[0] << 24));
1064 (data[3] << 24)); 1062 match->words[1] = (match_data_345[1] |
1065 match->words[1] = (data[4] | 1063 (match_data_345[2] << 8) |
1066 (data[5] << 8) |
1067 (skb->data[PTP_V1_SEQUENCE_OFFSET + 1064 (skb->data[PTP_V1_SEQUENCE_OFFSET +
1068 PTP_V1_SEQUENCE_LENGTH - 1] << 1065 PTP_V1_SEQUENCE_LENGTH - 1] <<
1069 16)); 1066 16));
@@ -1073,6 +1070,8 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
1073 1070
1074 skb_queue_tail(&ptp->rxq, skb); 1071 skb_queue_tail(&ptp->rxq, skb);
1075 queue_work(ptp->workwq, &ptp->work); 1072 queue_work(ptp->workwq, &ptp->work);
1073
1074 return true;
1076} 1075}
1077 1076
1078/* Transmit a PTP packet. This has to be transmitted by the MC 1077/* Transmit a PTP packet. This has to be transmitted by the MC
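
The replacement code above packs the two three-byte UUID groups plus the final sequence-number byte into the two 32-bit match words kept in the skb control buffer, so the worker thread can match a packet to a timestamp with a pair of integer compares. A stand-alone sketch of that packing (the byte values in main() are arbitrary examples):

#include <stdint.h>
#include <stdio.h>

/* Pack six UUID bytes (two groups of three, mirroring match_data_012 and
 * match_data_345) and the last sequence byte into two 32-bit match words. */
static void pack_match_words(const uint8_t uuid_012[3],
                             const uint8_t uuid_345[3],
                             uint8_t seq_last_byte, uint32_t words[2])
{
        words[0] = uuid_012[0] |
                   (uuid_012[1] << 8) |
                   (uuid_012[2] << 16) |
                   ((uint32_t)uuid_345[0] << 24);
        words[1] = uuid_345[1] |
                   (uuid_345[2] << 8) |
                   ((uint32_t)seq_last_byte << 16);
}

int main(void)
{
        const uint8_t a[3] = { 0x00, 0x0f, 0x53 };  /* example UUID bytes */
        const uint8_t b[3] = { 0x12, 0x34, 0x56 };
        uint32_t w[2];

        pack_match_words(a, b, 0x7b, w);
        printf("words = %08x %08x\n", w[0], w[1]);  /* 12530f00 007b5634 */
        return 0;
}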
@@ -1167,7 +1166,7 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
1167 * timestamped 1166 * timestamped
1168 */ 1167 */
1169 init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT; 1168 init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
1170 new_mode = MC_CMD_PTP_MODE_V2; 1169 new_mode = MC_CMD_PTP_MODE_V2_ENHANCED;
1171 enable_wanted = true; 1170 enable_wanted = true;
1172 break; 1171 break;
1173 case HWTSTAMP_FILTER_PTP_V2_EVENT: 1172 case HWTSTAMP_FILTER_PTP_V2_EVENT:
@@ -1186,7 +1185,14 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
1186 if (init->tx_type != HWTSTAMP_TX_OFF) 1185 if (init->tx_type != HWTSTAMP_TX_OFF)
1187 enable_wanted = true; 1186 enable_wanted = true;
1188 1187
1188 /* Old versions of the firmware do not support the improved
1189 * UUID filtering option (SF bug 33070). If the firmware does
1190 * not accept the enhanced mode, fall back to the standard PTP
1191 * v2 UUID filtering.
1192 */
1189 rc = efx_ptp_change_mode(efx, enable_wanted, new_mode); 1193 rc = efx_ptp_change_mode(efx, enable_wanted, new_mode);
1194 if ((rc != 0) && (new_mode == MC_CMD_PTP_MODE_V2_ENHANCED))
1195 rc = efx_ptp_change_mode(efx, enable_wanted, MC_CMD_PTP_MODE_V2);
1190 if (rc != 0) 1196 if (rc != 0)
1191 return rc; 1197 return rc;
1192 1198
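
The hunk above requests MC_CMD_PTP_MODE_V2_ENHANCED first and quietly retries with the plain V2 mode when older firmware rejects it. A minimal sketch of that negotiate-then-fall-back pattern, with the MCDI call replaced by a stub that pretends the firmware lacks enhanced-mode support (the error code and mode values are assumptions for the example):

#include <stdbool.h>
#include <stdio.h>
#include <errno.h>

enum ptp_mode { MODE_V1, MODE_V2, MODE_V2_ENHANCED };

/* Stub standing in for efx_ptp_change_mode(): pretend old firmware
 * rejects the enhanced UUID-filtering mode. */
static int change_mode(bool enable, enum ptp_mode mode)
{
        bool fw_supports_enhanced = false;  /* assumption for this example */

        if (mode == MODE_V2_ENHANCED && !fw_supports_enhanced)
                return -ERANGE;
        printf("mode %d %s\n", mode, enable ? "enabled" : "disabled");
        return 0;
}

static int negotiate(bool enable, enum ptp_mode wanted)
{
        int rc = change_mode(enable, wanted);

        /* Fall back to standard PTP v2 filtering if the firmware does
         * not accept the enhanced mode (mirrors the hunk above). */
        if (rc != 0 && wanted == MODE_V2_ENHANCED)
                rc = change_mode(enable, MODE_V2);
        return rc;
}

int main(void)
{
        return negotiate(true, MODE_V2_ENHANCED) ? 1 : 0;
}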
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 879ff5849bbd..a948b36c1910 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -16,6 +16,7 @@
16#include <linux/udp.h> 16#include <linux/udp.h>
17#include <linux/prefetch.h> 17#include <linux/prefetch.h>
18#include <linux/moduleparam.h> 18#include <linux/moduleparam.h>
19#include <linux/iommu.h>
19#include <net/ip.h> 20#include <net/ip.h>
20#include <net/checksum.h> 21#include <net/checksum.h>
21#include "net_driver.h" 22#include "net_driver.h"
@@ -24,85 +25,39 @@
24#include "selftest.h" 25#include "selftest.h"
25#include "workarounds.h" 26#include "workarounds.h"
26 27
27/* Number of RX descriptors pushed at once. */ 28/* Preferred number of descriptors to fill at once */
28#define EFX_RX_BATCH 8 29#define EFX_RX_PREFERRED_BATCH 8U
29 30
30/* Maximum size of a buffer sharing a page */ 31/* Number of RX buffers to recycle pages for. When creating the RX page recycle
31#define EFX_RX_HALF_PAGE ((PAGE_SIZE >> 1) - sizeof(struct efx_rx_page_state)) 32 * ring, this number is divided by the number of buffers per page to calculate
33 * the number of pages to store in the RX page recycle ring.
34 */
35#define EFX_RECYCLE_RING_SIZE_IOMMU 4096
36#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH)
32 37
33/* Size of buffer allocated for skb header area. */ 38/* Size of buffer allocated for skb header area. */
34#define EFX_SKB_HEADERS 64u 39#define EFX_SKB_HEADERS 64u
35 40
36/*
37 * rx_alloc_method - RX buffer allocation method
38 *
39 * This driver supports two methods for allocating and using RX buffers:
40 * each RX buffer may be backed by an skb or by an order-n page.
41 *
42 * When GRO is in use then the second method has a lower overhead,
43 * since we don't have to allocate then free skbs on reassembled frames.
44 *
45 * Values:
46 * - RX_ALLOC_METHOD_AUTO = 0
47 * - RX_ALLOC_METHOD_SKB = 1
48 * - RX_ALLOC_METHOD_PAGE = 2
49 *
50 * The heuristic for %RX_ALLOC_METHOD_AUTO is a simple hysteresis count
51 * controlled by the parameters below.
52 *
53 * - Since pushing and popping descriptors are separated by the rx_queue
54 * size, so the watermarks should be ~rxd_size.
55 * - The performance win by using page-based allocation for GRO is less
56 * than the performance hit of using page-based allocation of non-GRO,
57 * so the watermarks should reflect this.
58 *
59 * Per channel we maintain a single variable, updated by each channel:
60 *
61 * rx_alloc_level += (gro_performed ? RX_ALLOC_FACTOR_GRO :
62 * RX_ALLOC_FACTOR_SKB)
63 * Per NAPI poll interval, we constrain rx_alloc_level to 0..MAX (which
64 * limits the hysteresis), and update the allocation strategy:
65 *
66 * rx_alloc_method = (rx_alloc_level > RX_ALLOC_LEVEL_GRO ?
67 * RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB)
68 */
69static int rx_alloc_method = RX_ALLOC_METHOD_AUTO;
70
71#define RX_ALLOC_LEVEL_GRO 0x2000
72#define RX_ALLOC_LEVEL_MAX 0x3000
73#define RX_ALLOC_FACTOR_GRO 1
74#define RX_ALLOC_FACTOR_SKB (-2)
75
76/* This is the percentage fill level below which new RX descriptors 41/* This is the percentage fill level below which new RX descriptors
77 * will be added to the RX descriptor ring. 42 * will be added to the RX descriptor ring.
78 */ 43 */
79static unsigned int rx_refill_threshold; 44static unsigned int rx_refill_threshold;
80 45
46/* Each packet can consume up to ceil(max_frame_len / buffer_size) buffers */
47#define EFX_RX_MAX_FRAGS DIV_ROUND_UP(EFX_MAX_FRAME_LEN(EFX_MAX_MTU), \
48 EFX_RX_USR_BUF_SIZE)
49
81/* 50/*
82 * RX maximum head room required. 51 * RX maximum head room required.
83 * 52 *
84 * This must be at least 1 to prevent overflow and at least 2 to allow 53 * This must be at least 1 to prevent overflow, plus one packet-worth
85 * pipelined receives. 54 * to allow pipelined receives.
86 */ 55 */
87#define EFX_RXD_HEAD_ROOM 2 56#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS)
88 57
89/* Offset of ethernet header within page */ 58static inline u8 *efx_rx_buf_va(struct efx_rx_buffer *buf)
90static inline unsigned int efx_rx_buf_offset(struct efx_nic *efx,
91 struct efx_rx_buffer *buf)
92{ 59{
93 return buf->page_offset + efx->type->rx_buffer_hash_size; 60 return page_address(buf->page) + buf->page_offset;
94}
95static inline unsigned int efx_rx_buf_size(struct efx_nic *efx)
96{
97 return PAGE_SIZE << efx->rx_buffer_order;
98}
99
100static u8 *efx_rx_buf_eh(struct efx_nic *efx, struct efx_rx_buffer *buf)
101{
102 if (buf->flags & EFX_RX_BUF_PAGE)
103 return page_address(buf->u.page) + efx_rx_buf_offset(efx, buf);
104 else
105 return (u8 *)buf->u.skb->data + efx->type->rx_buffer_hash_size;
106} 61}
107 62
108static inline u32 efx_rx_buf_hash(const u8 *eh) 63static inline u32 efx_rx_buf_hash(const u8 *eh)
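
The new EFX_RX_MAX_FRAGS and EFX_RXD_HEAD_ROOM definitions above bound how many descriptors a single scattered packet may consume, and therefore how much slack the refill path must leave. A quick arithmetic check of those bounds; the frame and buffer sizes below are assumptions chosen for illustration, since EFX_MAX_FRAME_LEN() and EFX_RX_USR_BUF_SIZE are defined elsewhere in the driver:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* Assumed values for illustration only. */
#define EXAMPLE_MAX_FRAME_LEN   9234    /* ~9k MTU plus headers/padding */
#define EXAMPLE_USR_BUF_SIZE    1824    /* usable DMA length per buffer */

int main(void)
{
        unsigned max_frags = DIV_ROUND_UP(EXAMPLE_MAX_FRAME_LEN,
                                          EXAMPLE_USR_BUF_SIZE);
        unsigned head_room = 1 + max_frags;

        /* One descriptor of slack against overflow, plus a whole
         * packet's worth so a pipelined receive always fits. */
        printf("max frags = %u, head room = %u\n", max_frags, head_room);
        return 0;
}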
@@ -119,66 +74,81 @@ static inline u32 efx_rx_buf_hash(const u8 *eh)
119#endif 74#endif
120} 75}
121 76
122/** 77static inline struct efx_rx_buffer *
123 * efx_init_rx_buffers_skb - create EFX_RX_BATCH skb-based RX buffers 78efx_rx_buf_next(struct efx_rx_queue *rx_queue, struct efx_rx_buffer *rx_buf)
124 * 79{
125 * @rx_queue: Efx RX queue 80 if (unlikely(rx_buf == efx_rx_buffer(rx_queue, rx_queue->ptr_mask)))
126 * 81 return efx_rx_buffer(rx_queue, 0);
127 * This allocates EFX_RX_BATCH skbs, maps them for DMA, and populates a 82 else
128 * struct efx_rx_buffer for each one. Return a negative error code or 0 83 return rx_buf + 1;
129 * on success. May fail having only inserted fewer than EFX_RX_BATCH 84}
130 * buffers. 85
131 */ 86static inline void efx_sync_rx_buffer(struct efx_nic *efx,
132static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue) 87 struct efx_rx_buffer *rx_buf,
88 unsigned int len)
89{
90 dma_sync_single_for_cpu(&efx->pci_dev->dev, rx_buf->dma_addr, len,
91 DMA_FROM_DEVICE);
92}
93
94void efx_rx_config_page_split(struct efx_nic *efx)
95{
96 efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + EFX_PAGE_IP_ALIGN,
97 L1_CACHE_BYTES);
98 efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 :
99 ((PAGE_SIZE - sizeof(struct efx_rx_page_state)) /
100 efx->rx_page_buf_step);
101 efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) /
102 efx->rx_bufs_per_page;
103 efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH,
104 efx->rx_bufs_per_page);
105}
106
107/* Check the RX page recycle ring for a page that can be reused. */
108static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
133{ 109{
134 struct efx_nic *efx = rx_queue->efx; 110 struct efx_nic *efx = rx_queue->efx;
135 struct net_device *net_dev = efx->net_dev; 111 struct page *page;
136 struct efx_rx_buffer *rx_buf; 112 struct efx_rx_page_state *state;
137 struct sk_buff *skb; 113 unsigned index;
138 int skb_len = efx->rx_buffer_len;
139 unsigned index, count;
140 114
141 for (count = 0; count < EFX_RX_BATCH; ++count) { 115 index = rx_queue->page_remove & rx_queue->page_ptr_mask;
142 index = rx_queue->added_count & rx_queue->ptr_mask; 116 page = rx_queue->page_ring[index];
143 rx_buf = efx_rx_buffer(rx_queue, index); 117 if (page == NULL)
144 118 return NULL;
145 rx_buf->u.skb = skb = netdev_alloc_skb(net_dev, skb_len); 119
146 if (unlikely(!skb)) 120 rx_queue->page_ring[index] = NULL;
147 return -ENOMEM; 121 /* page_remove cannot exceed page_add. */
148 122 if (rx_queue->page_remove != rx_queue->page_add)
149 /* Adjust the SKB for padding */ 123 ++rx_queue->page_remove;
150 skb_reserve(skb, NET_IP_ALIGN);
151 rx_buf->len = skb_len - NET_IP_ALIGN;
152 rx_buf->flags = 0;
153
154 rx_buf->dma_addr = dma_map_single(&efx->pci_dev->dev,
155 skb->data, rx_buf->len,
156 DMA_FROM_DEVICE);
157 if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
158 rx_buf->dma_addr))) {
159 dev_kfree_skb_any(skb);
160 rx_buf->u.skb = NULL;
161 return -EIO;
162 }
163 124
164 ++rx_queue->added_count; 125 /* If page_count is 1 then we hold the only reference to this page. */
165 ++rx_queue->alloc_skb_count; 126 if (page_count(page) == 1) {
127 ++rx_queue->page_recycle_count;
128 return page;
129 } else {
130 state = page_address(page);
131 dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
132 PAGE_SIZE << efx->rx_buffer_order,
133 DMA_FROM_DEVICE);
134 put_page(page);
135 ++rx_queue->page_recycle_failed;
166 } 136 }
167 137
168 return 0; 138 return NULL;
169} 139}
170 140
171/** 141/**
172 * efx_init_rx_buffers_page - create EFX_RX_BATCH page-based RX buffers 142 * efx_init_rx_buffers - create a batch of page-based RX buffers
173 * 143 *
174 * @rx_queue: Efx RX queue 144 * @rx_queue: Efx RX queue
175 * 145 *
176 * This allocates memory for EFX_RX_BATCH receive buffers, maps them for DMA, 146 * This allocates a batch of pages, maps them for DMA, and populates
177 * and populates struct efx_rx_buffers for each one. Return a negative error 147 * struct efx_rx_buffers for each one. Return a negative error code or
178 * code or 0 on success. If a single page can be split between two buffers, 148 * 0 on success. If a single page can be used for multiple buffers,
179 * then the page will either be inserted fully, or not at at all. 149 * then the page will either be inserted fully, or not at all.
180 */ 150 */
181static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue) 151static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
182{ 152{
183 struct efx_nic *efx = rx_queue->efx; 153 struct efx_nic *efx = rx_queue->efx;
184 struct efx_rx_buffer *rx_buf; 154 struct efx_rx_buffer *rx_buf;
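
efx_rx_config_page_split() above derives how many receive buffers share one page, the truesize charged per buffer, and how many pages one refill batch needs. The same arithmetic restated as a stand-alone program; the page size, cache-line size, IP-alignment pad and page-state size are assumptions chosen for the example, not values taken from a running kernel:

#include <stdio.h>

#define ALIGN_UP(x, a)          (((x) + (a) - 1) & ~((a) - 1))
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

#define PAGE_SIZE               4096u   /* assumption for this example */
#define L1_CACHE_BYTES          64u     /* assumption for this example */
#define EFX_PAGE_IP_ALIGN       2u      /* keeps the IP header aligned */
#define PAGE_STATE_SIZE         16u     /* approx. sizeof(struct efx_rx_page_state) */
#define EFX_RX_PREFERRED_BATCH  8u

int main(void)
{
        unsigned rx_dma_len = 1824;     /* example per-buffer DMA length */
        unsigned rx_buffer_order = 0;   /* order-0 pages */

        unsigned step = ALIGN_UP(rx_dma_len + EFX_PAGE_IP_ALIGN, L1_CACHE_BYTES);
        unsigned bufs_per_page = rx_buffer_order ? 1 :
                (PAGE_SIZE - PAGE_STATE_SIZE) / step;
        unsigned truesize = (PAGE_SIZE << rx_buffer_order) / bufs_per_page;
        unsigned pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH,
                                                bufs_per_page);

        printf("step=%u bufs/page=%u truesize=%u pages/batch=%u\n",
               step, bufs_per_page, truesize, pages_per_batch);
        return 0;
}

With these example numbers the split gives two buffers per page, so truesize amortises to half a page and a preferred batch of eight buffers costs four page allocations.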
@@ -188,150 +158,140 @@ static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
188 dma_addr_t dma_addr; 158 dma_addr_t dma_addr;
189 unsigned index, count; 159 unsigned index, count;
190 160
191 /* We can split a page between two buffers */ 161 count = 0;
192 BUILD_BUG_ON(EFX_RX_BATCH & 1); 162 do {
193 163 page = efx_reuse_page(rx_queue);
194 for (count = 0; count < EFX_RX_BATCH; ++count) { 164 if (page == NULL) {
195 page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC, 165 page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
196 efx->rx_buffer_order); 166 efx->rx_buffer_order);
197 if (unlikely(page == NULL)) 167 if (unlikely(page == NULL))
198 return -ENOMEM; 168 return -ENOMEM;
199 dma_addr = dma_map_page(&efx->pci_dev->dev, page, 0, 169 dma_addr =
200 efx_rx_buf_size(efx), 170 dma_map_page(&efx->pci_dev->dev, page, 0,
201 DMA_FROM_DEVICE); 171 PAGE_SIZE << efx->rx_buffer_order,
202 if (unlikely(dma_mapping_error(&efx->pci_dev->dev, dma_addr))) { 172 DMA_FROM_DEVICE);
203 __free_pages(page, efx->rx_buffer_order); 173 if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
204 return -EIO; 174 dma_addr))) {
175 __free_pages(page, efx->rx_buffer_order);
176 return -EIO;
177 }
178 state = page_address(page);
179 state->dma_addr = dma_addr;
180 } else {
181 state = page_address(page);
182 dma_addr = state->dma_addr;
205 } 183 }
206 state = page_address(page);
207 state->refcnt = 0;
208 state->dma_addr = dma_addr;
209 184
210 dma_addr += sizeof(struct efx_rx_page_state); 185 dma_addr += sizeof(struct efx_rx_page_state);
211 page_offset = sizeof(struct efx_rx_page_state); 186 page_offset = sizeof(struct efx_rx_page_state);
212 187
213 split: 188 do {
214 index = rx_queue->added_count & rx_queue->ptr_mask; 189 index = rx_queue->added_count & rx_queue->ptr_mask;
215 rx_buf = efx_rx_buffer(rx_queue, index); 190 rx_buf = efx_rx_buffer(rx_queue, index);
216 rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN; 191 rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
217 rx_buf->u.page = page; 192 rx_buf->page = page;
218 rx_buf->page_offset = page_offset; 193 rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
219 rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN; 194 rx_buf->len = efx->rx_dma_len;
220 rx_buf->flags = EFX_RX_BUF_PAGE; 195 rx_buf->flags = 0;
221 ++rx_queue->added_count; 196 ++rx_queue->added_count;
222 ++rx_queue->alloc_page_count;
223 ++state->refcnt;
224
225 if ((~count & 1) && (efx->rx_buffer_len <= EFX_RX_HALF_PAGE)) {
226 /* Use the second half of the page */
227 get_page(page); 197 get_page(page);
228 dma_addr += (PAGE_SIZE >> 1); 198 dma_addr += efx->rx_page_buf_step;
229 page_offset += (PAGE_SIZE >> 1); 199 page_offset += efx->rx_page_buf_step;
230 ++count; 200 } while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE);
231 goto split; 201
232 } 202 rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE;
233 } 203 } while (++count < efx->rx_pages_per_batch);
234 204
235 return 0; 205 return 0;
236} 206}
237 207
208/* Unmap a DMA-mapped page. This function is only called for the final RX
209 * buffer in a page.
210 */
238static void efx_unmap_rx_buffer(struct efx_nic *efx, 211static void efx_unmap_rx_buffer(struct efx_nic *efx,
239 struct efx_rx_buffer *rx_buf, 212 struct efx_rx_buffer *rx_buf)
240 unsigned int used_len)
241{ 213{
242 if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) { 214 struct page *page = rx_buf->page;
243 struct efx_rx_page_state *state; 215
244 216 if (page) {
245 state = page_address(rx_buf->u.page); 217 struct efx_rx_page_state *state = page_address(page);
246 if (--state->refcnt == 0) { 218 dma_unmap_page(&efx->pci_dev->dev,
247 dma_unmap_page(&efx->pci_dev->dev, 219 state->dma_addr,
248 state->dma_addr, 220 PAGE_SIZE << efx->rx_buffer_order,
249 efx_rx_buf_size(efx), 221 DMA_FROM_DEVICE);
250 DMA_FROM_DEVICE);
251 } else if (used_len) {
252 dma_sync_single_for_cpu(&efx->pci_dev->dev,
253 rx_buf->dma_addr, used_len,
254 DMA_FROM_DEVICE);
255 }
256 } else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
257 dma_unmap_single(&efx->pci_dev->dev, rx_buf->dma_addr,
258 rx_buf->len, DMA_FROM_DEVICE);
259 } 222 }
260} 223}
261 224
262static void efx_free_rx_buffer(struct efx_nic *efx, 225static void efx_free_rx_buffer(struct efx_rx_buffer *rx_buf)
263 struct efx_rx_buffer *rx_buf)
264{ 226{
265 if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) { 227 if (rx_buf->page) {
266 __free_pages(rx_buf->u.page, efx->rx_buffer_order); 228 put_page(rx_buf->page);
267 rx_buf->u.page = NULL; 229 rx_buf->page = NULL;
268 } else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
269 dev_kfree_skb_any(rx_buf->u.skb);
270 rx_buf->u.skb = NULL;
271 } 230 }
272} 231}
273 232
274static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue, 233/* Attempt to recycle the page if there is an RX recycle ring; the page can
275 struct efx_rx_buffer *rx_buf) 234 * only be added if this is the final RX buffer, to prevent pages being used in
235 * the descriptor ring and appearing in the recycle ring simultaneously.
236 */
237static void efx_recycle_rx_page(struct efx_channel *channel,
238 struct efx_rx_buffer *rx_buf)
276{ 239{
277 efx_unmap_rx_buffer(rx_queue->efx, rx_buf, 0); 240 struct page *page = rx_buf->page;
278 efx_free_rx_buffer(rx_queue->efx, rx_buf); 241 struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
279} 242 struct efx_nic *efx = rx_queue->efx;
243 unsigned index;
280 244
281/* Attempt to resurrect the other receive buffer that used to share this page, 245 /* Only recycle the page after processing the final buffer. */
282 * which had previously been passed up to the kernel and freed. */ 246 if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE))
283static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
284 struct efx_rx_buffer *rx_buf)
285{
286 struct efx_rx_page_state *state = page_address(rx_buf->u.page);
287 struct efx_rx_buffer *new_buf;
288 unsigned fill_level, index;
289
290 /* +1 because efx_rx_packet() incremented removed_count. +1 because
291 * we'd like to insert an additional descriptor whilst leaving
292 * EFX_RXD_HEAD_ROOM for the non-recycle path */
293 fill_level = (rx_queue->added_count - rx_queue->removed_count + 2);
294 if (unlikely(fill_level > rx_queue->max_fill)) {
295 /* We could place "state" on a list, and drain the list in
296 * efx_fast_push_rx_descriptors(). For now, this will do. */
297 return; 247 return;
298 }
299 248
300 ++state->refcnt; 249 index = rx_queue->page_add & rx_queue->page_ptr_mask;
301 get_page(rx_buf->u.page); 250 if (rx_queue->page_ring[index] == NULL) {
251 unsigned read_index = rx_queue->page_remove &
252 rx_queue->page_ptr_mask;
302 253
303 index = rx_queue->added_count & rx_queue->ptr_mask; 254 /* The next slot in the recycle ring is available, but
304 new_buf = efx_rx_buffer(rx_queue, index); 255 * increment page_remove if the read pointer currently
305 new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1); 256 * points here.
306 new_buf->u.page = rx_buf->u.page; 257 */
307 new_buf->len = rx_buf->len; 258 if (read_index == index)
308 new_buf->flags = EFX_RX_BUF_PAGE; 259 ++rx_queue->page_remove;
309 ++rx_queue->added_count; 260 rx_queue->page_ring[index] = page;
261 ++rx_queue->page_add;
262 return;
263 }
264 ++rx_queue->page_recycle_full;
265 efx_unmap_rx_buffer(efx, rx_buf);
266 put_page(rx_buf->page);
310} 267}
311 268
312/* Recycle the given rx buffer directly back into the rx_queue. There is 269static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
313 * always room to add this buffer, because we've just popped a buffer. */ 270 struct efx_rx_buffer *rx_buf)
314static void efx_recycle_rx_buffer(struct efx_channel *channel,
315 struct efx_rx_buffer *rx_buf)
316{ 271{
317 struct efx_nic *efx = channel->efx; 272 /* Release the page reference we hold for the buffer. */
318 struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel); 273 if (rx_buf->page)
319 struct efx_rx_buffer *new_buf; 274 put_page(rx_buf->page);
320 unsigned index; 275
321 276 /* If this is the last buffer in a page, unmap and free it. */
322 rx_buf->flags &= EFX_RX_BUF_PAGE; 277 if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) {
323 278 efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
324 if ((rx_buf->flags & EFX_RX_BUF_PAGE) && 279 efx_free_rx_buffer(rx_buf);
325 efx->rx_buffer_len <= EFX_RX_HALF_PAGE && 280 }
326 page_count(rx_buf->u.page) == 1) 281 rx_buf->page = NULL;
327 efx_resurrect_rx_buffer(rx_queue, rx_buf); 282}
328 283
329 index = rx_queue->added_count & rx_queue->ptr_mask; 284/* Recycle the pages that are used by buffers that have just been received. */
330 new_buf = efx_rx_buffer(rx_queue, index); 285static void efx_recycle_rx_buffers(struct efx_channel *channel,
286 struct efx_rx_buffer *rx_buf,
287 unsigned int n_frags)
288{
289 struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
331 290
332 memcpy(new_buf, rx_buf, sizeof(*new_buf)); 291 do {
333 rx_buf->u.page = NULL; 292 efx_recycle_rx_page(channel, rx_buf);
334 ++rx_queue->added_count; 293 rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
294 } while (--n_frags);
335} 295}
336 296
337/** 297/**
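
efx_reuse_page() and efx_recycle_rx_page() above keep a small FIFO of still-mapped pages, indexed by page_remove/page_add under page_ptr_mask, and a page is only handed back once page_count() shows the stack has dropped its references. A user-space sketch of just the ring bookkeeping, with DMA mapping and struct page refcounting stubbed out (the ring size and the malloc-backed "pages" are illustrative):

#include <stdio.h>

#define RING_SIZE 8u                     /* illustrative; must be a power of two */
#define RING_MASK (RING_SIZE - 1)

struct recycle_ring {
        void *pages[RING_SIZE];
        unsigned add;                    /* next slot to fill */
        unsigned remove;                 /* next slot to drain */
};

/* Mirror of efx_reuse_page(): take the oldest page, if any. */
static void *ring_take(struct recycle_ring *r)
{
        unsigned i = r->remove & RING_MASK;
        void *page = r->pages[i];

        if (!page)
                return NULL;
        r->pages[i] = NULL;
        if (r->remove != r->add)
                ++r->remove;
        return page;
}

/* Mirror of efx_recycle_rx_page(): store the page if the slot is free. */
static int ring_put(struct recycle_ring *r, void *page)
{
        unsigned i = r->add & RING_MASK;

        if (r->pages[i])
                return -1;               /* ring full: caller frees the page */
        if ((r->remove & RING_MASK) == i)
                ++r->remove;
        r->pages[i] = page;
        ++r->add;
        return 0;
}

int main(void)
{
        struct recycle_ring r = { .add = RING_SIZE };  /* as in efx_init_rx_queue() */
        char pages[RING_SIZE + 1][64];
        unsigned i;

        /* The ring acts as a delay line: a page only comes back out of
         * ring_take() once further pages have been pushed in behind it,
         * giving the stack time to drop its references. */
        for (i = 0; i < RING_SIZE + 1; i++)
                if (ring_put(&r, pages[i]) < 0)
                        printf("slot full, page %u freed instead\n", i);
        printf("reused %p (expect %p)\n", ring_take(&r), (void *)pages[0]);
        return 0;
}

The design point is that recycling avoids repeated dma_map_page()/dma_unmap_page() calls on the hot path; a page that is still referenced elsewhere simply falls out of the ring and is unmapped normally.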
@@ -348,8 +308,8 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,
348 */ 308 */
349void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue) 309void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
350{ 310{
351 struct efx_channel *channel = efx_rx_queue_channel(rx_queue); 311 struct efx_nic *efx = rx_queue->efx;
352 unsigned fill_level; 312 unsigned int fill_level, batch_size;
353 int space, rc = 0; 313 int space, rc = 0;
354 314
355 /* Calculate current fill level, and exit if we don't need to fill */ 315 /* Calculate current fill level, and exit if we don't need to fill */
@@ -364,28 +324,26 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
364 rx_queue->min_fill = fill_level; 324 rx_queue->min_fill = fill_level;
365 } 325 }
366 326
327 batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page;
367 space = rx_queue->max_fill - fill_level; 328 space = rx_queue->max_fill - fill_level;
368 EFX_BUG_ON_PARANOID(space < EFX_RX_BATCH); 329 EFX_BUG_ON_PARANOID(space < batch_size);
369 330
370 netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, 331 netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
371 "RX queue %d fast-filling descriptor ring from" 332 "RX queue %d fast-filling descriptor ring from"
372 " level %d to level %d using %s allocation\n", 333 " level %d to level %d\n",
373 efx_rx_queue_index(rx_queue), fill_level, 334 efx_rx_queue_index(rx_queue), fill_level,
374 rx_queue->max_fill, 335 rx_queue->max_fill);
375 channel->rx_alloc_push_pages ? "page" : "skb"); 336
376 337
377 do { 338 do {
378 if (channel->rx_alloc_push_pages) 339 rc = efx_init_rx_buffers(rx_queue);
379 rc = efx_init_rx_buffers_page(rx_queue);
380 else
381 rc = efx_init_rx_buffers_skb(rx_queue);
382 if (unlikely(rc)) { 340 if (unlikely(rc)) {
383 /* Ensure that we don't leave the rx queue empty */ 341 /* Ensure that we don't leave the rx queue empty */
384 if (rx_queue->added_count == rx_queue->removed_count) 342 if (rx_queue->added_count == rx_queue->removed_count)
385 efx_schedule_slow_fill(rx_queue); 343 efx_schedule_slow_fill(rx_queue);
386 goto out; 344 goto out;
387 } 345 }
388 } while ((space -= EFX_RX_BATCH) >= EFX_RX_BATCH); 346 } while ((space -= batch_size) >= batch_size);
389 347
390 netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev, 348 netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
391 "RX queue %d fast-filled descriptor ring " 349 "RX queue %d fast-filled descriptor ring "
@@ -408,7 +366,7 @@ void efx_rx_slow_fill(unsigned long context)
408 366
409static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue, 367static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
410 struct efx_rx_buffer *rx_buf, 368 struct efx_rx_buffer *rx_buf,
411 int len, bool *leak_packet) 369 int len)
412{ 370{
413 struct efx_nic *efx = rx_queue->efx; 371 struct efx_nic *efx = rx_queue->efx;
414 unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding; 372 unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding;
@@ -428,11 +386,6 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
428 "RX event (0x%x > 0x%x+0x%x). Leaking\n", 386 "RX event (0x%x > 0x%x+0x%x). Leaking\n",
429 efx_rx_queue_index(rx_queue), len, max_len, 387 efx_rx_queue_index(rx_queue), len, max_len,
430 efx->type->rx_buffer_padding); 388 efx->type->rx_buffer_padding);
431 /* If this buffer was skb-allocated, then the meta
432 * data at the end of the skb will be trashed. So
433 * we have no choice but to leak the fragment.
434 */
435 *leak_packet = !(rx_buf->flags & EFX_RX_BUF_PAGE);
436 efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY); 389 efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY);
437 } else { 390 } else {
438 if (net_ratelimit()) 391 if (net_ratelimit())
@@ -448,212 +401,238 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
448/* Pass a received packet up through GRO. GRO can handle pages 401/* Pass a received packet up through GRO. GRO can handle pages
449 * regardless of checksum state and skbs with a good checksum. 402 * regardless of checksum state and skbs with a good checksum.
450 */ 403 */
451static void efx_rx_packet_gro(struct efx_channel *channel, 404static void
452 struct efx_rx_buffer *rx_buf, 405efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
453 const u8 *eh) 406 unsigned int n_frags, u8 *eh)
454{ 407{
455 struct napi_struct *napi = &channel->napi_str; 408 struct napi_struct *napi = &channel->napi_str;
456 gro_result_t gro_result; 409 gro_result_t gro_result;
410 struct efx_nic *efx = channel->efx;
411 struct sk_buff *skb;
457 412
458 if (rx_buf->flags & EFX_RX_BUF_PAGE) { 413 skb = napi_get_frags(napi);
459 struct efx_nic *efx = channel->efx; 414 if (unlikely(!skb)) {
460 struct page *page = rx_buf->u.page; 415 while (n_frags--) {
461 struct sk_buff *skb; 416 put_page(rx_buf->page);
417 rx_buf->page = NULL;
418 rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
419 }
420 return;
421 }
462 422
463 rx_buf->u.page = NULL; 423 if (efx->net_dev->features & NETIF_F_RXHASH)
424 skb->rxhash = efx_rx_buf_hash(eh);
425 skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
426 CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
427
428 for (;;) {
429 skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
430 rx_buf->page, rx_buf->page_offset,
431 rx_buf->len);
432 rx_buf->page = NULL;
433 skb->len += rx_buf->len;
434 if (skb_shinfo(skb)->nr_frags == n_frags)
435 break;
436
437 rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
438 }
464 439
465 skb = napi_get_frags(napi); 440 skb->data_len = skb->len;
466 if (!skb) { 441 skb->truesize += n_frags * efx->rx_buffer_truesize;
467 put_page(page); 442
468 return; 443 skb_record_rx_queue(skb, channel->rx_queue.core_index);
469 } 444
445 gro_result = napi_gro_frags(napi);
446 if (gro_result != GRO_DROP)
447 channel->irq_mod_score += 2;
448}
470 449
471 if (efx->net_dev->features & NETIF_F_RXHASH) 450/* Allocate and construct an SKB around page fragments */
472 skb->rxhash = efx_rx_buf_hash(eh); 451static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
452 struct efx_rx_buffer *rx_buf,
453 unsigned int n_frags,
454 u8 *eh, int hdr_len)
455{
456 struct efx_nic *efx = channel->efx;
457 struct sk_buff *skb;
473 458
474 skb_fill_page_desc(skb, 0, page, 459 /* Allocate an SKB to store the headers */
475 efx_rx_buf_offset(efx, rx_buf), rx_buf->len); 460 skb = netdev_alloc_skb(efx->net_dev, hdr_len + EFX_PAGE_SKB_ALIGN);
461 if (unlikely(skb == NULL))
462 return NULL;
476 463
477 skb->len = rx_buf->len; 464 EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len);
478 skb->data_len = rx_buf->len;
479 skb->truesize += rx_buf->len;
480 skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
481 CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
482 465
483 skb_record_rx_queue(skb, channel->rx_queue.core_index); 466 skb_reserve(skb, EFX_PAGE_SKB_ALIGN);
467 memcpy(__skb_put(skb, hdr_len), eh, hdr_len);
484 468
485 gro_result = napi_gro_frags(napi); 469 /* Append the remaining page(s) onto the frag list */
486 } else { 470 if (rx_buf->len > hdr_len) {
487 struct sk_buff *skb = rx_buf->u.skb; 471 rx_buf->page_offset += hdr_len;
472 rx_buf->len -= hdr_len;
488 473
489 EFX_BUG_ON_PARANOID(!(rx_buf->flags & EFX_RX_PKT_CSUMMED)); 474 for (;;) {
490 rx_buf->u.skb = NULL; 475 skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
491 skb->ip_summed = CHECKSUM_UNNECESSARY; 476 rx_buf->page, rx_buf->page_offset,
477 rx_buf->len);
478 rx_buf->page = NULL;
479 skb->len += rx_buf->len;
480 skb->data_len += rx_buf->len;
481 if (skb_shinfo(skb)->nr_frags == n_frags)
482 break;
492 483
493 gro_result = napi_gro_receive(napi, skb); 484 rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
485 }
486 } else {
487 __free_pages(rx_buf->page, efx->rx_buffer_order);
488 rx_buf->page = NULL;
489 n_frags = 0;
494 } 490 }
495 491
496 if (gro_result == GRO_NORMAL) { 492 skb->truesize += n_frags * efx->rx_buffer_truesize;
497 channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB; 493
498 } else if (gro_result != GRO_DROP) { 494 /* Move past the ethernet header */
499 channel->rx_alloc_level += RX_ALLOC_FACTOR_GRO; 495 skb->protocol = eth_type_trans(skb, efx->net_dev);
500 channel->irq_mod_score += 2; 496
501 } 497 return skb;
502} 498}
503 499
504void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index, 500void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
505 unsigned int len, u16 flags) 501 unsigned int n_frags, unsigned int len, u16 flags)
506{ 502{
507 struct efx_nic *efx = rx_queue->efx; 503 struct efx_nic *efx = rx_queue->efx;
508 struct efx_channel *channel = efx_rx_queue_channel(rx_queue); 504 struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
509 struct efx_rx_buffer *rx_buf; 505 struct efx_rx_buffer *rx_buf;
510 bool leak_packet = false;
511 506
512 rx_buf = efx_rx_buffer(rx_queue, index); 507 rx_buf = efx_rx_buffer(rx_queue, index);
513 rx_buf->flags |= flags; 508 rx_buf->flags |= flags;
514 509
515 /* This allows the refill path to post another buffer. 510 /* Validate the number of fragments and completed length */
516 * EFX_RXD_HEAD_ROOM ensures that the slot we are using 511 if (n_frags == 1) {
517 * isn't overwritten yet. 512 efx_rx_packet__check_len(rx_queue, rx_buf, len);
518 */ 513 } else if (unlikely(n_frags > EFX_RX_MAX_FRAGS) ||
519 rx_queue->removed_count++; 514 unlikely(len <= (n_frags - 1) * EFX_RX_USR_BUF_SIZE) ||
520 515 unlikely(len > n_frags * EFX_RX_USR_BUF_SIZE) ||
521 /* Validate the length encoded in the event vs the descriptor pushed */ 516 unlikely(!efx->rx_scatter)) {
522 efx_rx_packet__check_len(rx_queue, rx_buf, len, &leak_packet); 517 /* If this isn't an explicit discard request, either
518 * the hardware or the driver is broken.
519 */
520 WARN_ON(!(len == 0 && rx_buf->flags & EFX_RX_PKT_DISCARD));
521 rx_buf->flags |= EFX_RX_PKT_DISCARD;
522 }
523 523
524 netif_vdbg(efx, rx_status, efx->net_dev, 524 netif_vdbg(efx, rx_status, efx->net_dev,
525 "RX queue %d received id %x at %llx+%x %s%s\n", 525 "RX queue %d received ids %x-%x len %d %s%s\n",
526 efx_rx_queue_index(rx_queue), index, 526 efx_rx_queue_index(rx_queue), index,
527 (unsigned long long)rx_buf->dma_addr, len, 527 (index + n_frags - 1) & rx_queue->ptr_mask, len,
528 (rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "", 528 (rx_buf->flags & EFX_RX_PKT_CSUMMED) ? " [SUMMED]" : "",
529 (rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : ""); 529 (rx_buf->flags & EFX_RX_PKT_DISCARD) ? " [DISCARD]" : "");
530 530
531 /* Discard packet, if instructed to do so */ 531 /* Discard packet, if instructed to do so. Process the
532 * previous receive first.
533 */
532 if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) { 534 if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
533 if (unlikely(leak_packet)) 535 efx_rx_flush_packet(channel);
534 channel->n_skbuff_leaks++; 536 put_page(rx_buf->page);
535 else 537 efx_recycle_rx_buffers(channel, rx_buf, n_frags);
536 efx_recycle_rx_buffer(channel, rx_buf); 538 return;
537
538 /* Don't hold off the previous receive */
539 rx_buf = NULL;
540 goto out;
541 } 539 }
542 540
543 /* Release and/or sync DMA mapping - assumes all RX buffers 541 if (n_frags == 1)
544 * consumed in-order per RX queue 542 rx_buf->len = len;
543
544 /* Release and/or sync the DMA mapping - assumes all RX buffers
545 * consumed in-order per RX queue.
545 */ 546 */
546 efx_unmap_rx_buffer(efx, rx_buf, len); 547 efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
547 548
548 /* Prefetch nice and early so data will (hopefully) be in cache by 549 /* Prefetch nice and early so data will (hopefully) be in cache by
549 * the time we look at it. 550 * the time we look at it.
550 */ 551 */
551 prefetch(efx_rx_buf_eh(efx, rx_buf)); 552 prefetch(efx_rx_buf_va(rx_buf));
553
554 rx_buf->page_offset += efx->type->rx_buffer_hash_size;
555 rx_buf->len -= efx->type->rx_buffer_hash_size;
556
557 if (n_frags > 1) {
558 /* Release/sync DMA mapping for additional fragments.
559 * Fix length for last fragment.
560 */
561 unsigned int tail_frags = n_frags - 1;
562
563 for (;;) {
564 rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
565 if (--tail_frags == 0)
566 break;
567 efx_sync_rx_buffer(efx, rx_buf, EFX_RX_USR_BUF_SIZE);
568 }
569 rx_buf->len = len - (n_frags - 1) * EFX_RX_USR_BUF_SIZE;
570 efx_sync_rx_buffer(efx, rx_buf, rx_buf->len);
571 }
572
573 /* All fragments have been DMA-synced, so recycle buffers and pages. */
574 rx_buf = efx_rx_buffer(rx_queue, index);
575 efx_recycle_rx_buffers(channel, rx_buf, n_frags);
552 576
553 /* Pipeline receives so that we give time for packet headers to be 577 /* Pipeline receives so that we give time for packet headers to be
554 * prefetched into cache. 578 * prefetched into cache.
555 */ 579 */
556 rx_buf->len = len - efx->type->rx_buffer_hash_size; 580 efx_rx_flush_packet(channel);
557out: 581 channel->rx_pkt_n_frags = n_frags;
558 if (channel->rx_pkt) 582 channel->rx_pkt_index = index;
559 __efx_rx_packet(channel, channel->rx_pkt);
560 channel->rx_pkt = rx_buf;
561} 583}
562 584
563static void efx_rx_deliver(struct efx_channel *channel, 585static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
564 struct efx_rx_buffer *rx_buf) 586 struct efx_rx_buffer *rx_buf,
587 unsigned int n_frags)
565{ 588{
566 struct sk_buff *skb; 589 struct sk_buff *skb;
590 u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS);
567 591
568 /* We now own the SKB */ 592 skb = efx_rx_mk_skb(channel, rx_buf, n_frags, eh, hdr_len);
569 skb = rx_buf->u.skb; 593 if (unlikely(skb == NULL)) {
570 rx_buf->u.skb = NULL; 594 efx_free_rx_buffer(rx_buf);
595 return;
596 }
597 skb_record_rx_queue(skb, channel->rx_queue.core_index);
571 598
572 /* Set the SKB flags */ 599 /* Set the SKB flags */
573 skb_checksum_none_assert(skb); 600 skb_checksum_none_assert(skb);
574 601
575 /* Record the rx_queue */
576 skb_record_rx_queue(skb, channel->rx_queue.core_index);
577
578 /* Pass the packet up */
579 if (channel->type->receive_skb) 602 if (channel->type->receive_skb)
580 channel->type->receive_skb(channel, skb); 603 if (channel->type->receive_skb(channel, skb))
581 else 604 return;
582 netif_receive_skb(skb);
583 605
584 /* Update allocation strategy method */ 606 /* Pass the packet up */
585 channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB; 607 netif_receive_skb(skb);
586} 608}
587 609
588/* Handle a received packet. Second half: Touches packet payload. */ 610/* Handle a received packet. Second half: Touches packet payload. */
589void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf) 611void __efx_rx_packet(struct efx_channel *channel)
590{ 612{
591 struct efx_nic *efx = channel->efx; 613 struct efx_nic *efx = channel->efx;
592 u8 *eh = efx_rx_buf_eh(efx, rx_buf); 614 struct efx_rx_buffer *rx_buf =
615 efx_rx_buffer(&channel->rx_queue, channel->rx_pkt_index);
616 u8 *eh = efx_rx_buf_va(rx_buf);
593 617
594 /* If we're in loopback test, then pass the packet directly to the 618 /* If we're in loopback test, then pass the packet directly to the
595 * loopback layer, and free the rx_buf here 619 * loopback layer, and free the rx_buf here
596 */ 620 */
597 if (unlikely(efx->loopback_selftest)) { 621 if (unlikely(efx->loopback_selftest)) {
598 efx_loopback_rx_packet(efx, eh, rx_buf->len); 622 efx_loopback_rx_packet(efx, eh, rx_buf->len);
599 efx_free_rx_buffer(efx, rx_buf); 623 efx_free_rx_buffer(rx_buf);
600 return; 624 goto out;
601 }
602
603 if (!(rx_buf->flags & EFX_RX_BUF_PAGE)) {
604 struct sk_buff *skb = rx_buf->u.skb;
605
606 prefetch(skb_shinfo(skb));
607
608 skb_reserve(skb, efx->type->rx_buffer_hash_size);
609 skb_put(skb, rx_buf->len);
610
611 if (efx->net_dev->features & NETIF_F_RXHASH)
612 skb->rxhash = efx_rx_buf_hash(eh);
613
614 /* Move past the ethernet header. rx_buf->data still points
615 * at the ethernet header */
616 skb->protocol = eth_type_trans(skb, efx->net_dev);
617
618 skb_record_rx_queue(skb, channel->rx_queue.core_index);
619 } 625 }
620 626
621 if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM))) 627 if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
622 rx_buf->flags &= ~EFX_RX_PKT_CSUMMED; 628 rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;
623 629
624 if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) && 630 if (!channel->type->receive_skb)
625 !channel->type->receive_skb) 631 efx_rx_packet_gro(channel, rx_buf, channel->rx_pkt_n_frags, eh);
626 efx_rx_packet_gro(channel, rx_buf, eh);
627 else 632 else
628 efx_rx_deliver(channel, rx_buf); 633 efx_rx_deliver(channel, eh, rx_buf, channel->rx_pkt_n_frags);
629} 634out:
630 635 channel->rx_pkt_n_frags = 0;
631void efx_rx_strategy(struct efx_channel *channel)
632{
633 enum efx_rx_alloc_method method = rx_alloc_method;
634
635 if (channel->type->receive_skb) {
636 channel->rx_alloc_push_pages = false;
637 return;
638 }
639
640 /* Only makes sense to use page based allocation if GRO is enabled */
641 if (!(channel->efx->net_dev->features & NETIF_F_GRO)) {
642 method = RX_ALLOC_METHOD_SKB;
643 } else if (method == RX_ALLOC_METHOD_AUTO) {
644 /* Constrain the rx_alloc_level */
645 if (channel->rx_alloc_level < 0)
646 channel->rx_alloc_level = 0;
647 else if (channel->rx_alloc_level > RX_ALLOC_LEVEL_MAX)
648 channel->rx_alloc_level = RX_ALLOC_LEVEL_MAX;
649
650 /* Decide on the allocation method */
651 method = ((channel->rx_alloc_level > RX_ALLOC_LEVEL_GRO) ?
652 RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB);
653 }
654
655 /* Push the option */
656 channel->rx_alloc_push_pages = (method == RX_ALLOC_METHOD_PAGE);
657} 636}
658 637
659int efx_probe_rx_queue(struct efx_rx_queue *rx_queue) 638int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
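
The reworked efx_rx_packet() in the hunk ending here accepts a fragment count as well as a completed length, and marks the completion for discard unless the length is consistent with that many buffers and scatter is actually enabled. A small sketch of just those bounds, with example values standing in for EFX_RX_USR_BUF_SIZE and EFX_RX_MAX_FRAGS:

#include <stdbool.h>
#include <stdio.h>

#define USR_BUF_SIZE    1824u   /* assumed per-buffer DMA length */
#define MAX_FRAGS       6u      /* assumed EFX_RX_MAX_FRAGS */

/* A scattered completion is plausible only if every fragment except the
 * last is full and the last one is non-empty. */
static bool scatter_len_ok(unsigned n_frags, unsigned len, bool scatter_enabled)
{
        if (n_frags == 1)
                return true;            /* single buffer: checked elsewhere */
        return scatter_enabled &&
               n_frags <= MAX_FRAGS &&
               len > (n_frags - 1) * USR_BUF_SIZE &&
               len <= n_frags * USR_BUF_SIZE;
}

int main(void)
{
        printf("%d\n", scatter_len_ok(3, 5000, true));   /* 1: fits 3 buffers */
        printf("%d\n", scatter_len_ok(3, 1000, true));   /* 0: would fit in 1 */
        printf("%d\n", scatter_len_ok(3, 5000, false));  /* 0: scatter disabled */
        return 0;
}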
@@ -683,9 +662,32 @@ int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
683 kfree(rx_queue->buffer); 662 kfree(rx_queue->buffer);
684 rx_queue->buffer = NULL; 663 rx_queue->buffer = NULL;
685 } 664 }
665
686 return rc; 666 return rc;
687} 667}
688 668
669void efx_init_rx_recycle_ring(struct efx_nic *efx,
670 struct efx_rx_queue *rx_queue)
671{
672 unsigned int bufs_in_recycle_ring, page_ring_size;
673
674 /* Set the RX recycle ring size */
675#ifdef CONFIG_PPC64
676 bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
677#else
678 if (efx->pci_dev->dev.iommu_group)
679 bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
680 else
681 bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU;
682#endif /* CONFIG_PPC64 */
683
684 page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring /
685 efx->rx_bufs_per_page);
686 rx_queue->page_ring = kcalloc(page_ring_size,
687 sizeof(*rx_queue->page_ring), GFP_KERNEL);
688 rx_queue->page_ptr_mask = page_ring_size - 1;
689}
690
689void efx_init_rx_queue(struct efx_rx_queue *rx_queue) 691void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
690{ 692{
691 struct efx_nic *efx = rx_queue->efx; 693 struct efx_nic *efx = rx_queue->efx;
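
efx_init_rx_recycle_ring() above sizes the page ring from a buffer budget that depends on whether an IOMMU is present (remapping pages is far more expensive behind an IOMMU, so more pages are kept), then converts that budget into a power-of-two number of pages so page_ptr_mask can be ring_size - 1. A sketch of the sizing arithmetic, with the IOMMU test reduced to a boolean and roundup_pow_of_two() re-implemented for the example; the buffers-per-page value is an assumption:

#include <stdbool.h>
#include <stdio.h>

#define RECYCLE_RING_SIZE_IOMMU    4096u
#define RECYCLE_RING_SIZE_NOIOMMU  (2u * 8u)   /* 2 * EFX_RX_PREFERRED_BATCH */

static unsigned roundup_pow_of_two(unsigned v)
{
        unsigned p = 1;

        while (p < v)
                p <<= 1;
        return p;
}

static unsigned page_ring_size(bool have_iommu, unsigned bufs_per_page)
{
        unsigned bufs = have_iommu ? RECYCLE_RING_SIZE_IOMMU
                                   : RECYCLE_RING_SIZE_NOIOMMU;

        /* Convert a buffer budget into a power-of-two page count. */
        return roundup_pow_of_two(bufs / bufs_per_page);
}

int main(void)
{
        printf("iommu: %u pages\n", page_ring_size(true, 2));     /* 2048 */
        printf("no iommu: %u pages\n", page_ring_size(false, 2)); /* 8 */
        return 0;
}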
@@ -699,10 +701,18 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
699 rx_queue->notified_count = 0; 701 rx_queue->notified_count = 0;
700 rx_queue->removed_count = 0; 702 rx_queue->removed_count = 0;
701 rx_queue->min_fill = -1U; 703 rx_queue->min_fill = -1U;
704 efx_init_rx_recycle_ring(efx, rx_queue);
705
706 rx_queue->page_remove = 0;
707 rx_queue->page_add = rx_queue->page_ptr_mask + 1;
708 rx_queue->page_recycle_count = 0;
709 rx_queue->page_recycle_failed = 0;
710 rx_queue->page_recycle_full = 0;
702 711
703 /* Initialise limit fields */ 712 /* Initialise limit fields */
704 max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM; 713 max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM;
705 max_trigger = max_fill - EFX_RX_BATCH; 714 max_trigger =
715 max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page;
706 if (rx_refill_threshold != 0) { 716 if (rx_refill_threshold != 0) {
707 trigger = max_fill * min(rx_refill_threshold, 100U) / 100U; 717 trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;
708 if (trigger > max_trigger) 718 if (trigger > max_trigger)
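The fast-fill arithmetic in the hunk above is easier to follow with concrete numbers. The values below are purely illustrative (they are not the driver's defaults); the point is only how max_fill, max_trigger and the rx_refill_threshold percentage interact:

	/* Illustrative values only -- not the driver's defaults */
	unsigned int rxq_entries     = 512;
	unsigned int head_room       = 16;	/* stand-in for EFX_RXD_HEAD_ROOM */
	unsigned int bufs_per_page   = 2;	/* efx->rx_bufs_per_page */
	unsigned int pages_per_batch = 4;	/* efx->rx_pages_per_batch */
	unsigned int threshold       = 90;	/* rx_refill_threshold, in percent */

	unsigned int max_fill    = rxq_entries - head_room;			/* 496 */
	unsigned int max_trigger = max_fill - pages_per_batch * bufs_per_page;	/* 488 */
	unsigned int trigger     = max_fill * threshold / 100;			/* 446 */

	if (trigger > max_trigger)
		trigger = max_trigger;		/* not taken here: 446 <= 488 */

With these numbers the queue would start refilling once its fill level drops below 446 descriptors, and the clamp to max_trigger only bites for thresholds very close to 100%.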
@@ -722,6 +732,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
722void efx_fini_rx_queue(struct efx_rx_queue *rx_queue) 732void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
723{ 733{
724	int i; 734	int i;
735 struct efx_nic *efx = rx_queue->efx;
725 struct efx_rx_buffer *rx_buf; 736 struct efx_rx_buffer *rx_buf;
726 737
727 netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev, 738 netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
@@ -733,13 +744,32 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
733 del_timer_sync(&rx_queue->slow_fill); 744 del_timer_sync(&rx_queue->slow_fill);
734 efx_nic_fini_rx(rx_queue); 745 efx_nic_fini_rx(rx_queue);
735 746
736 /* Release RX buffers NB start at index 0 not current HW ptr */ 747 /* Release RX buffers from the current read ptr to the write ptr */
737 if (rx_queue->buffer) { 748 if (rx_queue->buffer) {
738 for (i = 0; i <= rx_queue->ptr_mask; i++) { 749 for (i = rx_queue->removed_count; i < rx_queue->added_count;
739 rx_buf = efx_rx_buffer(rx_queue, i); 750 i++) {
751 unsigned index = i & rx_queue->ptr_mask;
752 rx_buf = efx_rx_buffer(rx_queue, index);
740 efx_fini_rx_buffer(rx_queue, rx_buf); 753 efx_fini_rx_buffer(rx_queue, rx_buf);
741 } 754 }
742 } 755 }
756
757 /* Unmap and release the pages in the recycle ring. Remove the ring. */
758 for (i = 0; i <= rx_queue->page_ptr_mask; i++) {
759 struct page *page = rx_queue->page_ring[i];
760 struct efx_rx_page_state *state;
761
762 if (page == NULL)
763 continue;
764
765 state = page_address(page);
766 dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
767 PAGE_SIZE << efx->rx_buffer_order,
768 DMA_FROM_DEVICE);
769 put_page(page);
770 }
771 kfree(rx_queue->page_ring);
772 rx_queue->page_ring = NULL;
743} 773}
744 774
745void efx_remove_rx_queue(struct efx_rx_queue *rx_queue) 775void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
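The teardown loop above relies on each recycled page carrying its own DMA handle in an efx_rx_page_state block placed at the very start of the page (state = page_address(page)). A sketch of the matching setup side, with a hypothetical helper name and simplified error handling, just to show where the state->dma_addr consumed by dma_unmap_page() comes from:

	static int demo_map_recycled_page(struct device *dev, struct page *page,
					  unsigned int order)
	{
		/* The page's own first bytes hold its bookkeeping */
		struct efx_rx_page_state *state = page_address(page);

		state->dma_addr = dma_map_page(dev, page, 0,
					       PAGE_SIZE << order,
					       DMA_FROM_DEVICE);
		if (dma_mapping_error(dev, state->dma_addr))
			return -EIO;
		return 0;
	}

Keeping the map and unmap sizes as PAGE_SIZE << order on both sides is what lets the fini path release the whole (possibly higher-order) page in a single call.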
@@ -754,9 +784,6 @@ void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
754} 784}
755 785
756 786
757module_param(rx_alloc_method, int, 0644);
758MODULE_PARM_DESC(rx_alloc_method, "Allocation method used for RX buffers");
759
760module_param(rx_refill_threshold, uint, 0444); 787module_param(rx_refill_threshold, uint, 0444);
761MODULE_PARM_DESC(rx_refill_threshold, 788MODULE_PARM_DESC(rx_refill_threshold,
762 "RX descriptor ring refill threshold (%)"); 789 "RX descriptor ring refill threshold (%)");
diff --git a/drivers/net/ethernet/sfc/siena.c b/drivers/net/ethernet/sfc/siena.c
index ba40f67e4f05..51669244d154 100644
--- a/drivers/net/ethernet/sfc/siena.c
+++ b/drivers/net/ethernet/sfc/siena.c
@@ -202,7 +202,7 @@ out:
202 202
203static enum reset_type siena_map_reset_reason(enum reset_type reason) 203static enum reset_type siena_map_reset_reason(enum reset_type reason)
204{ 204{
205 return RESET_TYPE_ALL; 205 return RESET_TYPE_RECOVER_OR_ALL;
206} 206}
207 207
208static int siena_map_reset_flags(u32 *flags) 208static int siena_map_reset_flags(u32 *flags)
@@ -245,6 +245,22 @@ static int siena_reset_hw(struct efx_nic *efx, enum reset_type method)
245 return efx_mcdi_reset_port(efx); 245 return efx_mcdi_reset_port(efx);
246} 246}
247 247
248#ifdef CONFIG_EEH
249/* When a PCI device is isolated from the bus, a subsequent MMIO read is
250 * required for the kernel EEH mechanisms to notice. Because the Solarflare
251 * driver was written to minimise MMIO reads (for latency), a periodic call to
252 * check the EEH status of the device is required so that device recovery can
253 * happen in a timely fashion.
254 */
255static void siena_monitor(struct efx_nic *efx)
256{
257 struct eeh_dev *eehdev =
258 of_node_to_eeh_dev(pci_device_to_OF_node(efx->pci_dev));
259
260 eeh_dev_check_failure(eehdev);
261}
262#endif
263
248static int siena_probe_nvconfig(struct efx_nic *efx) 264static int siena_probe_nvconfig(struct efx_nic *efx)
249{ 265{
250 u32 caps = 0; 266 u32 caps = 0;
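The comment above describes a pattern that is not specific to this driver: once EEH has isolated a device, nothing more happens until some code touches it, so a driver that deliberately avoids MMIO reads has to poke the EEH state on a timer. Siena simply reuses its periodic .monitor hook for this (see the later hunk wiring up siena_monitor). A free-standing sketch of the same idea, with a hypothetical foo_nic structure and assuming the usual CONFIG_EEH declarations are in scope:

	struct foo_nic {
		struct pci_dev *pci_dev;
		struct delayed_work eeh_work;
	};

	static void foo_eeh_poll(struct work_struct *work)
	{
		struct foo_nic *nic = container_of(work, struct foo_nic,
						   eeh_work.work);
		struct eeh_dev *eehdev =
			of_node_to_eeh_dev(pci_device_to_OF_node(nic->pci_dev));

		/* Checking the EEH state is what lets the core notice a
		 * frozen device and begin recovery.
		 */
		if (eehdev)
			eeh_dev_check_failure(eehdev);

		/* Re-arm; the one-second interval is arbitrary here */
		schedule_delayed_work(&nic->eeh_work, HZ);
	}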
@@ -398,6 +414,8 @@ static int siena_init_nic(struct efx_nic *efx)
398 EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_INSRT_HDR, 1); 414 EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_INSRT_HDR, 1);
399 EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_ALG, 1); 415 EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_HASH_ALG, 1);
400 EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_IP_HASH, 1); 416 EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_IP_HASH, 1);
417 EFX_SET_OWORD_FIELD(temp, FRF_BZ_RX_USR_BUF_SIZE,
418 EFX_RX_USR_BUF_SIZE >> 5);
401 efx_writeo(efx, &temp, FR_AZ_RX_CFG); 419 efx_writeo(efx, &temp, FR_AZ_RX_CFG);
402 420
403 /* Set hash key for IPv4 */ 421 /* Set hash key for IPv4 */
@@ -665,7 +683,11 @@ const struct efx_nic_type siena_a0_nic_type = {
665 .init = siena_init_nic, 683 .init = siena_init_nic,
666 .dimension_resources = siena_dimension_resources, 684 .dimension_resources = siena_dimension_resources,
667 .fini = efx_port_dummy_op_void, 685 .fini = efx_port_dummy_op_void,
686#ifdef CONFIG_EEH
687 .monitor = siena_monitor,
688#else
668 .monitor = NULL, 689 .monitor = NULL,
690#endif
669 .map_reset_reason = siena_map_reset_reason, 691 .map_reset_reason = siena_map_reset_reason,
670 .map_reset_flags = siena_map_reset_flags, 692 .map_reset_flags = siena_map_reset_flags,
671 .reset = siena_reset_hw, 693 .reset = siena_reset_hw,
@@ -698,6 +720,7 @@ const struct efx_nic_type siena_a0_nic_type = {
698 .max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH), 720 .max_dma_mask = DMA_BIT_MASK(FSF_AZ_TX_KER_BUF_ADDR_WIDTH),
699 .rx_buffer_hash_size = 0x10, 721 .rx_buffer_hash_size = 0x10,
700 .rx_buffer_padding = 0, 722 .rx_buffer_padding = 0,
723 .can_rx_scatter = true,
701 .max_interrupt_mode = EFX_INT_MODE_MSIX, 724 .max_interrupt_mode = EFX_INT_MODE_MSIX,
702 .phys_addr_channels = 32, /* Hardware limit is 64, but the legacy 725 .phys_addr_channels = 32, /* Hardware limit is 64, but the legacy
703 * interrupt handler only supports 32 726 * interrupt handler only supports 32