aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRavinandan Arakali <Ravinandan.Arakali@neterion.com>2006-01-25 14:53:07 -0500
committerJeff Garzik <jgarzik@pobox.com>2006-01-27 10:34:38 -0500
commit7d3d0439f574a4857c97b3ad2e63b082b7382d7e (patch)
tree320e1024829ee4939f9d66d5bcddcdb43258cbcb
parentefd51b5c6798d103e3aa683464aebb2019b62119 (diff)
[PATCH] S2io: Large Receive Offload (LRO) feature(v2) for Neterion (s2io) 10GbE Xframe PCI-X and PCI-E NICs
Hi, Below is a patch for the Large Receive Offload feature. Please review and let us know your comments. LRO algorithm was described in an OLS 2005 presentation, located at ftp.s2io.com user: linuxdocs password: HALdocs The same ftp site has Programming Manual for Xframe-I ASIC. LRO feature is supported on Neterion Xframe-I, Xframe-II and Xframe-Express 10GbE NICs. Brief description: The Large Receive Offload(LRO) feature is a stateless offload that is complementary to TSO feature but on the receive path. The idea is to combine and collapse(upto 64K maximum) in the driver, in-sequence TCP packets belonging to the same session. It is mainly designed to improve 1500 mtu receive performance, since Jumbo frame performance is already close to 10GbE line rate. Some performance numbers are attached below. Implementation details: 1. Handle packet chains from multiple sessions(current default MAX_LRO_SESSSIONS=32). 2. Examine each packet for eligiblity to aggregate. A packet is considered eligible if it meets all the below criteria. a. It is a TCP/IP packet and L2 type is not LLC or SNAP. b. The packet has no checksum errors(L3 and L4). c. There are no IP options. The only TCP option supported is timestamps. d. Search and locate the LRO object corresponding to this socket and ensure packet is in TCP sequence. e. It's not a special packet(SYN, FIN, RST, URG, PSH etc. flags are not set). f. TCP payload is non-zero(It's not a pure ACK). g. It's not an IP-fragmented packet. 3. If a packet is found eligible, the LRO object is updated with information such as next sequence number expected, current length of aggregated packet and so on. If not eligible or max packets reached, update IP and TCP headers of first packet in the chain and pass it up to stack. 4. The frag_list in skb structure is used to chain packets into one large packet. Kernel changes required: None Performance results: Main focus of the initial testing was on 1500 mtu receiver, since this is a bottleneck not covered by the existing stateless offloads. There are couple disclaimers about the performance results below: 1. Your mileage will vary!!!! We initially concentrated on couple pci-x 2.0 platforms that are powerful enough to push 10 GbE NIC and do not have bottlenecks other than cpu%; testing on other platforms is still in progress. On some lower end systems we are seeing lower gains. 2. Current LRO implementation is still (for the most part) software based, and therefore performance potential of the feature is far from being realized. Full hw implementation of LRO is expected in the next version of Xframe ASIC. Performance delta(with MTU=1500) going from LRO disabled to enabled: IBM 2-way Xeon (x366) : 3.5 to 7.1 Gbps 2-way Opteron : 4.5 to 6.1 Gbps Signed-off-by: Ravinandan Arakali <ravinandan.arakali@neterion.com> Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
-rw-r--r--drivers/net/s2io.c586
-rw-r--r--drivers/net/s2io.h38
2 files changed, 562 insertions, 62 deletions
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index 49b597cbc19a..4e392914971e 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -57,6 +57,9 @@
57#include <linux/ethtool.h> 57#include <linux/ethtool.h>
58#include <linux/workqueue.h> 58#include <linux/workqueue.h>
59#include <linux/if_vlan.h> 59#include <linux/if_vlan.h>
60#include <linux/ip.h>
61#include <linux/tcp.h>
62#include <net/tcp.h>
60 63
61#include <asm/system.h> 64#include <asm/system.h>
62#include <asm/uaccess.h> 65#include <asm/uaccess.h>
@@ -66,7 +69,7 @@
66#include "s2io.h" 69#include "s2io.h"
67#include "s2io-regs.h" 70#include "s2io-regs.h"
68 71
69#define DRV_VERSION "Version 2.0.9.4" 72#define DRV_VERSION "2.0.11.2"
70 73
71/* S2io Driver name & version. */ 74/* S2io Driver name & version. */
72static char s2io_driver_name[] = "Neterion"; 75static char s2io_driver_name[] = "Neterion";
@@ -168,6 +171,11 @@ static char ethtool_stats_keys[][ETH_GSTRING_LEN] = {
168 {"\n DRIVER STATISTICS"}, 171 {"\n DRIVER STATISTICS"},
169 {"single_bit_ecc_errs"}, 172 {"single_bit_ecc_errs"},
170 {"double_bit_ecc_errs"}, 173 {"double_bit_ecc_errs"},
174 ("lro_aggregated_pkts"),
175 ("lro_flush_both_count"),
176 ("lro_out_of_sequence_pkts"),
177 ("lro_flush_due_to_max_pkts"),
178 ("lro_avg_aggr_pkts"),
171}; 179};
172 180
173#define S2IO_STAT_LEN sizeof(ethtool_stats_keys)/ ETH_GSTRING_LEN 181#define S2IO_STAT_LEN sizeof(ethtool_stats_keys)/ ETH_GSTRING_LEN
@@ -317,6 +325,12 @@ static unsigned int indicate_max_pkts;
317static unsigned int rxsync_frequency = 3; 325static unsigned int rxsync_frequency = 3;
318/* Interrupt type. Values can be 0(INTA), 1(MSI), 2(MSI_X) */ 326/* Interrupt type. Values can be 0(INTA), 1(MSI), 2(MSI_X) */
319static unsigned int intr_type = 0; 327static unsigned int intr_type = 0;
328/* Large receive offload feature */
329static unsigned int lro = 0;
330/* Max pkts to be aggregated by LRO at one time. If not specified,
331 * aggregation happens until we hit max IP pkt size(64K)
332 */
333static unsigned int lro_max_pkts = 0xFFFF;
320 334
321/* 335/*
322 * S2IO device table. 336 * S2IO device table.
@@ -1476,6 +1490,19 @@ static int init_nic(struct s2io_nic *nic)
1476 writel((u32) (val64 >> 32), (add + 4)); 1490 writel((u32) (val64 >> 32), (add + 4));
1477 val64 = readq(&bar0->mac_cfg); 1491 val64 = readq(&bar0->mac_cfg);
1478 1492
1493 /* Enable FCS stripping by adapter */
1494 add = &bar0->mac_cfg;
1495 val64 = readq(&bar0->mac_cfg);
1496 val64 |= MAC_CFG_RMAC_STRIP_FCS;
1497 if (nic->device_type == XFRAME_II_DEVICE)
1498 writeq(val64, &bar0->mac_cfg);
1499 else {
1500 writeq(RMAC_CFG_KEY(0x4C0D), &bar0->rmac_cfg_key);
1501 writel((u32) (val64), add);
1502 writeq(RMAC_CFG_KEY(0x4C0D), &bar0->rmac_cfg_key);
1503 writel((u32) (val64 >> 32), (add + 4));
1504 }
1505
1479 /* 1506 /*
1480 * Set the time value to be inserted in the pause frame 1507 * Set the time value to be inserted in the pause frame
1481 * generated by xena. 1508 * generated by xena.
@@ -2569,6 +2596,8 @@ static void rx_intr_handler(ring_info_t *ring_data)
2569#ifndef CONFIG_S2IO_NAPI 2596#ifndef CONFIG_S2IO_NAPI
2570 int pkt_cnt = 0; 2597 int pkt_cnt = 0;
2571#endif 2598#endif
2599 int i;
2600
2572 spin_lock(&nic->rx_lock); 2601 spin_lock(&nic->rx_lock);
2573 if (atomic_read(&nic->card_state) == CARD_DOWN) { 2602 if (atomic_read(&nic->card_state) == CARD_DOWN) {
2574 DBG_PRINT(INTR_DBG, "%s: %s going down for reset\n", 2603 DBG_PRINT(INTR_DBG, "%s: %s going down for reset\n",
@@ -2661,6 +2690,18 @@ static void rx_intr_handler(ring_info_t *ring_data)
2661 break; 2690 break;
2662#endif 2691#endif
2663 } 2692 }
2693 if (nic->lro) {
2694 /* Clear all LRO sessions before exiting */
2695 for (i=0; i<MAX_LRO_SESSIONS; i++) {
2696 lro_t *lro = &nic->lro0_n[i];
2697 if (lro->in_use) {
2698 update_L3L4_header(nic, lro);
2699 queue_rx_frame(lro->parent);
2700 clear_lro_session(lro);
2701 }
2702 }
2703 }
2704
2664 spin_unlock(&nic->rx_lock); 2705 spin_unlock(&nic->rx_lock);
2665} 2706}
2666 2707
@@ -3668,23 +3709,32 @@ s2io_msi_handle(int irq, void *dev_id, struct pt_regs *regs)
3668 * else schedule a tasklet to reallocate the buffers. 3709 * else schedule a tasklet to reallocate the buffers.
3669 */ 3710 */
3670 for (i = 0; i < config->rx_ring_num; i++) { 3711 for (i = 0; i < config->rx_ring_num; i++) {
3671 int rxb_size = atomic_read(&sp->rx_bufs_left[i]); 3712 if (!sp->lro) {
3672 int level = rx_buffer_level(sp, rxb_size, i); 3713 int rxb_size = atomic_read(&sp->rx_bufs_left[i]);
3673 3714 int level = rx_buffer_level(sp, rxb_size, i);
3674 if ((level == PANIC) && (!TASKLET_IN_USE)) { 3715
3675 DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", dev->name); 3716 if ((level == PANIC) && (!TASKLET_IN_USE)) {
3676 DBG_PRINT(INTR_DBG, "PANIC levels\n"); 3717 DBG_PRINT(INTR_DBG, "%s: Rx BD hit ",
3677 if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) { 3718 dev->name);
3678 DBG_PRINT(ERR_DBG, "%s:Out of memory", 3719 DBG_PRINT(INTR_DBG, "PANIC levels\n");
3679 dev->name); 3720 if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) {
3680 DBG_PRINT(ERR_DBG, " in ISR!!\n"); 3721 DBG_PRINT(ERR_DBG, "%s:Out of memory",
3722 dev->name);
3723 DBG_PRINT(ERR_DBG, " in ISR!!\n");
3724 clear_bit(0, (&sp->tasklet_status));
3725 atomic_dec(&sp->isr_cnt);
3726 return IRQ_HANDLED;
3727 }
3681 clear_bit(0, (&sp->tasklet_status)); 3728 clear_bit(0, (&sp->tasklet_status));
3682 atomic_dec(&sp->isr_cnt); 3729 } else if (level == LOW) {
3683 return IRQ_HANDLED; 3730 tasklet_schedule(&sp->task);
3684 } 3731 }
3685 clear_bit(0, (&sp->tasklet_status)); 3732 }
3686 } else if (level == LOW) { 3733 else if (fill_rx_buffers(sp, i) == -ENOMEM) {
3687 tasklet_schedule(&sp->task); 3734 DBG_PRINT(ERR_DBG, "%s:Out of memory",
3735 dev->name);
3736 DBG_PRINT(ERR_DBG, " in Rx Intr!!\n");
3737 break;
3688 } 3738 }
3689 } 3739 }
3690 3740
@@ -3697,29 +3747,37 @@ s2io_msix_ring_handle(int irq, void *dev_id, struct pt_regs *regs)
3697{ 3747{
3698 ring_info_t *ring = (ring_info_t *)dev_id; 3748 ring_info_t *ring = (ring_info_t *)dev_id;
3699 nic_t *sp = ring->nic; 3749 nic_t *sp = ring->nic;
3750 struct net_device *dev = (struct net_device *) dev_id;
3700 int rxb_size, level, rng_n; 3751 int rxb_size, level, rng_n;
3701 3752
3702 atomic_inc(&sp->isr_cnt); 3753 atomic_inc(&sp->isr_cnt);
3703 rx_intr_handler(ring); 3754 rx_intr_handler(ring);
3704 3755
3705 rng_n = ring->ring_no; 3756 rng_n = ring->ring_no;
3706 rxb_size = atomic_read(&sp->rx_bufs_left[rng_n]); 3757 if (!sp->lro) {
3707 level = rx_buffer_level(sp, rxb_size, rng_n); 3758 rxb_size = atomic_read(&sp->rx_bufs_left[rng_n]);
3708 3759 level = rx_buffer_level(sp, rxb_size, rng_n);
3709 if ((level == PANIC) && (!TASKLET_IN_USE)) { 3760
3710 int ret; 3761 if ((level == PANIC) && (!TASKLET_IN_USE)) {
3711 DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", __FUNCTION__); 3762 int ret;
3712 DBG_PRINT(INTR_DBG, "PANIC levels\n"); 3763 DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", __FUNCTION__);
3713 if ((ret = fill_rx_buffers(sp, rng_n)) == -ENOMEM) { 3764 DBG_PRINT(INTR_DBG, "PANIC levels\n");
3714 DBG_PRINT(ERR_DBG, "Out of memory in %s", 3765 if ((ret = fill_rx_buffers(sp, rng_n)) == -ENOMEM) {
3715 __FUNCTION__); 3766 DBG_PRINT(ERR_DBG, "Out of memory in %s",
3767 __FUNCTION__);
3768 clear_bit(0, (&sp->tasklet_status));
3769 return IRQ_HANDLED;
3770 }
3716 clear_bit(0, (&sp->tasklet_status)); 3771 clear_bit(0, (&sp->tasklet_status));
3717 return IRQ_HANDLED; 3772 } else if (level == LOW) {
3773 tasklet_schedule(&sp->task);
3718 } 3774 }
3719 clear_bit(0, (&sp->tasklet_status));
3720 } else if (level == LOW) {
3721 tasklet_schedule(&sp->task);
3722 } 3775 }
3776 else if (fill_rx_buffers(sp, rng_n) == -ENOMEM) {
3777 DBG_PRINT(ERR_DBG, "%s:Out of memory", dev->name);
3778 DBG_PRINT(ERR_DBG, " in Rx Intr!!\n");
3779 }
3780
3723 atomic_dec(&sp->isr_cnt); 3781 atomic_dec(&sp->isr_cnt);
3724 3782
3725 return IRQ_HANDLED; 3783 return IRQ_HANDLED;
@@ -3875,24 +3933,33 @@ static irqreturn_t s2io_isr(int irq, void *dev_id, struct pt_regs *regs)
3875 */ 3933 */
3876#ifndef CONFIG_S2IO_NAPI 3934#ifndef CONFIG_S2IO_NAPI
3877 for (i = 0; i < config->rx_ring_num; i++) { 3935 for (i = 0; i < config->rx_ring_num; i++) {
3878 int ret; 3936 if (!sp->lro) {
3879 int rxb_size = atomic_read(&sp->rx_bufs_left[i]); 3937 int ret;
3880 int level = rx_buffer_level(sp, rxb_size, i); 3938 int rxb_size = atomic_read(&sp->rx_bufs_left[i]);
3881 3939 int level = rx_buffer_level(sp, rxb_size, i);
3882 if ((level == PANIC) && (!TASKLET_IN_USE)) { 3940
3883 DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", dev->name); 3941 if ((level == PANIC) && (!TASKLET_IN_USE)) {
3884 DBG_PRINT(INTR_DBG, "PANIC levels\n"); 3942 DBG_PRINT(INTR_DBG, "%s: Rx BD hit ",
3885 if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) { 3943 dev->name);
3886 DBG_PRINT(ERR_DBG, "%s:Out of memory", 3944 DBG_PRINT(INTR_DBG, "PANIC levels\n");
3887 dev->name); 3945 if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) {
3888 DBG_PRINT(ERR_DBG, " in ISR!!\n"); 3946 DBG_PRINT(ERR_DBG, "%s:Out of memory",
3947 dev->name);
3948 DBG_PRINT(ERR_DBG, " in ISR!!\n");
3949 clear_bit(0, (&sp->tasklet_status));
3950 atomic_dec(&sp->isr_cnt);
3951 return IRQ_HANDLED;
3952 }
3889 clear_bit(0, (&sp->tasklet_status)); 3953 clear_bit(0, (&sp->tasklet_status));
3890 atomic_dec(&sp->isr_cnt); 3954 } else if (level == LOW) {
3891 return IRQ_HANDLED; 3955 tasklet_schedule(&sp->task);
3892 } 3956 }
3893 clear_bit(0, (&sp->tasklet_status)); 3957 }
3894 } else if (level == LOW) { 3958 else if (fill_rx_buffers(sp, i) == -ENOMEM) {
3895 tasklet_schedule(&sp->task); 3959 DBG_PRINT(ERR_DBG, "%s:Out of memory",
3960 dev->name);
3961 DBG_PRINT(ERR_DBG, " in Rx intr!!\n");
3962 break;
3896 } 3963 }
3897 } 3964 }
3898#endif 3965#endif
@@ -5134,6 +5201,16 @@ static void s2io_get_ethtool_stats(struct net_device *dev,
5134 tmp_stats[i++] = 0; 5201 tmp_stats[i++] = 0;
5135 tmp_stats[i++] = stat_info->sw_stat.single_ecc_errs; 5202 tmp_stats[i++] = stat_info->sw_stat.single_ecc_errs;
5136 tmp_stats[i++] = stat_info->sw_stat.double_ecc_errs; 5203 tmp_stats[i++] = stat_info->sw_stat.double_ecc_errs;
5204 tmp_stats[i++] = stat_info->sw_stat.clubbed_frms_cnt;
5205 tmp_stats[i++] = stat_info->sw_stat.sending_both;
5206 tmp_stats[i++] = stat_info->sw_stat.outof_sequence_pkts;
5207 tmp_stats[i++] = stat_info->sw_stat.flush_max_pkts;
5208 if (stat_info->sw_stat.num_aggregations)
5209 tmp_stats[i++] = stat_info->sw_stat.sum_avg_pkts_aggregated /
5210 stat_info->sw_stat.num_aggregations;
5211 else
5212 tmp_stats[i++] = 0;
5213
5137} 5214}
5138 5215
5139static int s2io_ethtool_get_regs_len(struct net_device *dev) 5216static int s2io_ethtool_get_regs_len(struct net_device *dev)
@@ -5515,6 +5592,14 @@ static int s2io_card_up(nic_t * sp)
5515 /* Setting its receive mode */ 5592 /* Setting its receive mode */
5516 s2io_set_multicast(dev); 5593 s2io_set_multicast(dev);
5517 5594
5595 if (sp->lro) {
5596 /* Initialize max aggregatable pkts based on MTU */
5597 sp->lro_max_aggr_per_sess = ((1<<16) - 1) / dev->mtu;
5598 /* Check if we can use(if specified) user provided value */
5599 if (lro_max_pkts < sp->lro_max_aggr_per_sess)
5600 sp->lro_max_aggr_per_sess = lro_max_pkts;
5601 }
5602
5518 /* Enable tasklet for the device */ 5603 /* Enable tasklet for the device */
5519 tasklet_init(&sp->task, s2io_tasklet, (unsigned long) dev); 5604 tasklet_init(&sp->task, s2io_tasklet, (unsigned long) dev);
5520 5605
@@ -5607,6 +5692,7 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
5607 ((unsigned long) rxdp->Host_Control); 5692 ((unsigned long) rxdp->Host_Control);
5608 int ring_no = ring_data->ring_no; 5693 int ring_no = ring_data->ring_no;
5609 u16 l3_csum, l4_csum; 5694 u16 l3_csum, l4_csum;
5695 lro_t *lro;
5610 5696
5611 skb->dev = dev; 5697 skb->dev = dev;
5612 if (rxdp->Control_1 & RXD_T_CODE) { 5698 if (rxdp->Control_1 & RXD_T_CODE) {
@@ -5655,7 +5741,8 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
5655 skb_put(skb, buf2_len); 5741 skb_put(skb, buf2_len);
5656 } 5742 }
5657 5743
5658 if ((rxdp->Control_1 & TCP_OR_UDP_FRAME) && 5744 if ((rxdp->Control_1 & TCP_OR_UDP_FRAME) && ((!sp->lro) ||
5745 (sp->lro && (!(rxdp->Control_1 & RXD_FRAME_IP_FRAG)))) &&
5659 (sp->rx_csum)) { 5746 (sp->rx_csum)) {
5660 l3_csum = RXD_GET_L3_CKSUM(rxdp->Control_1); 5747 l3_csum = RXD_GET_L3_CKSUM(rxdp->Control_1);
5661 l4_csum = RXD_GET_L4_CKSUM(rxdp->Control_1); 5748 l4_csum = RXD_GET_L4_CKSUM(rxdp->Control_1);
@@ -5666,6 +5753,54 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
5666 * a flag in the RxD. 5753 * a flag in the RxD.
5667 */ 5754 */
5668 skb->ip_summed = CHECKSUM_UNNECESSARY; 5755 skb->ip_summed = CHECKSUM_UNNECESSARY;
5756 if (sp->lro) {
5757 u32 tcp_len;
5758 u8 *tcp;
5759 int ret = 0;
5760
5761 ret = s2io_club_tcp_session(skb->data, &tcp,
5762 &tcp_len, &lro, rxdp, sp);
5763 switch (ret) {
5764 case 3: /* Begin anew */
5765 lro->parent = skb;
5766 goto aggregate;
5767 case 1: /* Aggregate */
5768 {
5769 lro_append_pkt(sp, lro,
5770 skb, tcp_len);
5771 goto aggregate;
5772 }
5773 case 4: /* Flush session */
5774 {
5775 lro_append_pkt(sp, lro,
5776 skb, tcp_len);
5777 queue_rx_frame(lro->parent);
5778 clear_lro_session(lro);
5779 sp->mac_control.stats_info->
5780 sw_stat.flush_max_pkts++;
5781 goto aggregate;
5782 }
5783 case 2: /* Flush both */
5784 lro->parent->data_len =
5785 lro->frags_len;
5786 sp->mac_control.stats_info->
5787 sw_stat.sending_both++;
5788 queue_rx_frame(lro->parent);
5789 clear_lro_session(lro);
5790 goto send_up;
5791 case 0: /* sessions exceeded */
5792 case 5: /*
5793 * First pkt in session not
5794 * L3/L4 aggregatable
5795 */
5796 break;
5797 default:
5798 DBG_PRINT(ERR_DBG,
5799 "%s: Samadhana!!\n",
5800 __FUNCTION__);
5801 BUG();
5802 }
5803 }
5669 } else { 5804 } else {
5670 /* 5805 /*
5671 * Packet with erroneous checksum, let the 5806 * Packet with erroneous checksum, let the
@@ -5677,25 +5812,31 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
5677 skb->ip_summed = CHECKSUM_NONE; 5812 skb->ip_summed = CHECKSUM_NONE;
5678 } 5813 }
5679 5814
5680 skb->protocol = eth_type_trans(skb, dev); 5815 if (!sp->lro) {
5816 skb->protocol = eth_type_trans(skb, dev);
5681#ifdef CONFIG_S2IO_NAPI 5817#ifdef CONFIG_S2IO_NAPI
5682 if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) { 5818 if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) {
5683 /* Queueing the vlan frame to the upper layer */ 5819 /* Queueing the vlan frame to the upper layer */
5684 vlan_hwaccel_receive_skb(skb, sp->vlgrp, 5820 vlan_hwaccel_receive_skb(skb, sp->vlgrp,
5685 RXD_GET_VLAN_TAG(rxdp->Control_2)); 5821 RXD_GET_VLAN_TAG(rxdp->Control_2));
5686 } else { 5822 } else {
5687 netif_receive_skb(skb); 5823 netif_receive_skb(skb);
5688 } 5824 }
5689#else 5825#else
5690 if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) { 5826 if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) {
5691 /* Queueing the vlan frame to the upper layer */ 5827 /* Queueing the vlan frame to the upper layer */
5692 vlan_hwaccel_rx(skb, sp->vlgrp, 5828 vlan_hwaccel_rx(skb, sp->vlgrp,
5693 RXD_GET_VLAN_TAG(rxdp->Control_2)); 5829 RXD_GET_VLAN_TAG(rxdp->Control_2));
5694 } else { 5830 } else {
5695 netif_rx(skb); 5831 netif_rx(skb);
5696 } 5832 }
5697#endif 5833#endif
5834 } else {
5835send_up:
5836 queue_rx_frame(skb);
5837 }
5698 dev->last_rx = jiffies; 5838 dev->last_rx = jiffies;
5839aggregate:
5699 atomic_dec(&sp->rx_bufs_left[ring_no]); 5840 atomic_dec(&sp->rx_bufs_left[ring_no]);
5700 return SUCCESS; 5841 return SUCCESS;
5701} 5842}
@@ -5807,6 +5948,8 @@ module_param(indicate_max_pkts, int, 0);
5807#endif 5948#endif
5808module_param(rxsync_frequency, int, 0); 5949module_param(rxsync_frequency, int, 0);
5809module_param(intr_type, int, 0); 5950module_param(intr_type, int, 0);
5951module_param(lro, int, 0);
5952module_param(lro_max_pkts, int, 0);
5810 5953
5811/** 5954/**
5812 * s2io_init_nic - Initialization of the adapter . 5955 * s2io_init_nic - Initialization of the adapter .
@@ -5938,6 +6081,7 @@ Defaulting to INTA\n");
5938 else 6081 else
5939 sp->device_type = XFRAME_I_DEVICE; 6082 sp->device_type = XFRAME_I_DEVICE;
5940 6083
6084 sp->lro = lro;
5941 6085
5942 /* Initialize some PCI/PCI-X fields of the NIC. */ 6086 /* Initialize some PCI/PCI-X fields of the NIC. */
5943 s2io_init_pci(sp); 6087 s2io_init_pci(sp);
@@ -6241,6 +6385,10 @@ Defaulting to INTA\n");
6241 DBG_PRINT(ERR_DBG, "%s: 3-Buffer mode support has been " 6385 DBG_PRINT(ERR_DBG, "%s: 3-Buffer mode support has been "
6242 "enabled\n",dev->name); 6386 "enabled\n",dev->name);
6243 6387
6388 if (sp->lro)
6389 DBG_PRINT(ERR_DBG, "%s: Large receive offload enabled\n",
6390 dev->name);
6391
6244 /* Initialize device name */ 6392 /* Initialize device name */
6245 strcpy(sp->name, dev->name); 6393 strcpy(sp->name, dev->name);
6246 if (sp->device_type & XFRAME_II_DEVICE) 6394 if (sp->device_type & XFRAME_II_DEVICE)
@@ -6351,3 +6499,317 @@ void s2io_closer(void)
6351 6499
6352module_init(s2io_starter); 6500module_init(s2io_starter);
6353module_exit(s2io_closer); 6501module_exit(s2io_closer);
6502
6503static int check_L2_lro_capable(u8 *buffer, struct iphdr **ip,
6504 struct tcphdr **tcp, RxD_t *rxdp)
6505{
6506 int ip_off;
6507 u8 l2_type = (u8)((rxdp->Control_1 >> 37) & 0x7), ip_len;
6508
6509 if (!(rxdp->Control_1 & RXD_FRAME_PROTO_TCP)) {
6510 DBG_PRINT(INIT_DBG,"%s: Non-TCP frames not supported for LRO\n",
6511 __FUNCTION__);
6512 return -1;
6513 }
6514
6515 /* TODO:
6516 * By default the VLAN field in the MAC is stripped by the card, if this
6517 * feature is turned off in rx_pa_cfg register, then the ip_off field
6518 * has to be shifted by a further 2 bytes
6519 */
6520 switch (l2_type) {
6521 case 0: /* DIX type */
6522 case 4: /* DIX type with VLAN */
6523 ip_off = HEADER_ETHERNET_II_802_3_SIZE;
6524 break;
6525 /* LLC, SNAP etc are considered non-mergeable */
6526 default:
6527 return -1;
6528 }
6529
6530 *ip = (struct iphdr *)((u8 *)buffer + ip_off);
6531 ip_len = (u8)((*ip)->ihl);
6532 ip_len <<= 2;
6533 *tcp = (struct tcphdr *)((unsigned long)*ip + ip_len);
6534
6535 return 0;
6536}
6537
6538static int check_for_socket_match(lro_t *lro, struct iphdr *ip,
6539 struct tcphdr *tcp)
6540{
6541 DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
6542 if ((lro->iph->saddr != ip->saddr) || (lro->iph->daddr != ip->daddr) ||
6543 (lro->tcph->source != tcp->source) || (lro->tcph->dest != tcp->dest))
6544 return -1;
6545 return 0;
6546}
6547
6548static inline int get_l4_pyld_length(struct iphdr *ip, struct tcphdr *tcp)
6549{
6550 return(ntohs(ip->tot_len) - (ip->ihl << 2) - (tcp->doff << 2));
6551}
6552
6553static void initiate_new_session(lro_t *lro, u8 *l2h,
6554 struct iphdr *ip, struct tcphdr *tcp, u32 tcp_pyld_len)
6555{
6556 DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
6557 lro->l2h = l2h;
6558 lro->iph = ip;
6559 lro->tcph = tcp;
6560 lro->tcp_next_seq = tcp_pyld_len + ntohl(tcp->seq);
6561 lro->tcp_ack = ntohl(tcp->ack_seq);
6562 lro->sg_num = 1;
6563 lro->total_len = ntohs(ip->tot_len);
6564 lro->frags_len = 0;
6565 /*
6566 * check if we saw TCP timestamp. Other consistency checks have
6567 * already been done.
6568 */
6569 if (tcp->doff == 8) {
6570 u32 *ptr;
6571 ptr = (u32 *)(tcp+1);
6572 lro->saw_ts = 1;
6573 lro->cur_tsval = *(ptr+1);
6574 lro->cur_tsecr = *(ptr+2);
6575 }
6576 lro->in_use = 1;
6577}
6578
6579static void update_L3L4_header(nic_t *sp, lro_t *lro)
6580{
6581 struct iphdr *ip = lro->iph;
6582 struct tcphdr *tcp = lro->tcph;
6583 u16 nchk;
6584 StatInfo_t *statinfo = sp->mac_control.stats_info;
6585 DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
6586
6587 /* Update L3 header */
6588 ip->tot_len = htons(lro->total_len);
6589 ip->check = 0;
6590 nchk = ip_fast_csum((u8 *)lro->iph, ip->ihl);
6591 ip->check = nchk;
6592
6593 /* Update L4 header */
6594 tcp->ack_seq = lro->tcp_ack;
6595 tcp->window = lro->window;
6596
6597 /* Update tsecr field if this session has timestamps enabled */
6598 if (lro->saw_ts) {
6599 u32 *ptr = (u32 *)(tcp + 1);
6600 *(ptr+2) = lro->cur_tsecr;
6601 }
6602
6603 /* Update counters required for calculation of
6604 * average no. of packets aggregated.
6605 */
6606 statinfo->sw_stat.sum_avg_pkts_aggregated += lro->sg_num;
6607 statinfo->sw_stat.num_aggregations++;
6608}
6609
6610static void aggregate_new_rx(lro_t *lro, struct iphdr *ip,
6611 struct tcphdr *tcp, u32 l4_pyld)
6612{
6613 DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
6614 lro->total_len += l4_pyld;
6615 lro->frags_len += l4_pyld;
6616 lro->tcp_next_seq += l4_pyld;
6617 lro->sg_num++;
6618
6619 /* Update ack seq no. and window ad(from this pkt) in LRO object */
6620 lro->tcp_ack = tcp->ack_seq;
6621 lro->window = tcp->window;
6622
6623 if (lro->saw_ts) {
6624 u32 *ptr;
6625 /* Update tsecr and tsval from this packet */
6626 ptr = (u32 *) (tcp + 1);
6627 lro->cur_tsval = *(ptr + 1);
6628 lro->cur_tsecr = *(ptr + 2);
6629 }
6630}
6631
6632static int verify_l3_l4_lro_capable(lro_t *l_lro, struct iphdr *ip,
6633 struct tcphdr *tcp, u32 tcp_pyld_len)
6634{
6635 DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
6636 u8 *ptr;
6637
6638 if (!tcp_pyld_len) {
6639 /* Runt frame or a pure ack */
6640 return -1;
6641 }
6642
6643 if (ip->ihl != 5) /* IP has options */
6644 return -1;
6645
6646 if (tcp->urg || tcp->psh || tcp->rst || tcp->syn || tcp->fin ||
6647 !tcp->ack) {
6648 /*
6649 * Currently recognize only the ack control word and
6650 * any other control field being set would result in
6651 * flushing the LRO session
6652 */
6653 return -1;
6654 }
6655
6656 /*
6657 * Allow only one TCP timestamp option. Don't aggregate if
6658 * any other options are detected.
6659 */
6660 if (tcp->doff != 5 && tcp->doff != 8)
6661 return -1;
6662
6663 if (tcp->doff == 8) {
6664 ptr = (u8 *)(tcp + 1);
6665 while (*ptr == TCPOPT_NOP)
6666 ptr++;
6667 if (*ptr != TCPOPT_TIMESTAMP || *(ptr+1) != TCPOLEN_TIMESTAMP)
6668 return -1;
6669
6670 /* Ensure timestamp value increases monotonically */
6671 if (l_lro)
6672 if (l_lro->cur_tsval > *((u32 *)(ptr+2)))
6673 return -1;
6674
6675 /* timestamp echo reply should be non-zero */
6676 if (*((u32 *)(ptr+6)) == 0)
6677 return -1;
6678 }
6679
6680 return 0;
6681}
6682
6683static int
6684s2io_club_tcp_session(u8 *buffer, u8 **tcp, u32 *tcp_len, lro_t **lro,
6685 RxD_t *rxdp, nic_t *sp)
6686{
6687 struct iphdr *ip;
6688 struct tcphdr *tcph;
6689 int ret = 0, i;
6690
6691 if (!(ret = check_L2_lro_capable(buffer, &ip, (struct tcphdr **)tcp,
6692 rxdp))) {
6693 DBG_PRINT(INFO_DBG,"IP Saddr: %x Daddr: %x\n",
6694 ip->saddr, ip->daddr);
6695 } else {
6696 return ret;
6697 }
6698
6699 tcph = (struct tcphdr *)*tcp;
6700 *tcp_len = get_l4_pyld_length(ip, tcph);
6701 for (i=0; i<MAX_LRO_SESSIONS; i++) {
6702 lro_t *l_lro = &sp->lro0_n[i];
6703 if (l_lro->in_use) {
6704 if (check_for_socket_match(l_lro, ip, tcph))
6705 continue;
6706 /* Sock pair matched */
6707 *lro = l_lro;
6708
6709 if ((*lro)->tcp_next_seq != ntohl(tcph->seq)) {
6710 DBG_PRINT(INFO_DBG, "%s:Out of order. expected "
6711 "0x%x, actual 0x%x\n", __FUNCTION__,
6712 (*lro)->tcp_next_seq,
6713 ntohl(tcph->seq));
6714
6715 sp->mac_control.stats_info->
6716 sw_stat.outof_sequence_pkts++;
6717 ret = 2;
6718 break;
6719 }
6720
6721 if (!verify_l3_l4_lro_capable(l_lro, ip, tcph,*tcp_len))
6722 ret = 1; /* Aggregate */
6723 else
6724 ret = 2; /* Flush both */
6725 break;
6726 }
6727 }
6728
6729 if (ret == 0) {
6730 /* Before searching for available LRO objects,
6731 * check if the pkt is L3/L4 aggregatable. If not
6732 * don't create new LRO session. Just send this
6733 * packet up.
6734 */
6735 if (verify_l3_l4_lro_capable(NULL, ip, tcph, *tcp_len)) {
6736 return 5;
6737 }
6738
6739 for (i=0; i<MAX_LRO_SESSIONS; i++) {
6740 lro_t *l_lro = &sp->lro0_n[i];
6741 if (!(l_lro->in_use)) {
6742 *lro = l_lro;
6743 ret = 3; /* Begin anew */
6744 break;
6745 }
6746 }
6747 }
6748
6749 if (ret == 0) { /* sessions exceeded */
6750 DBG_PRINT(INFO_DBG,"%s:All LRO sessions already in use\n",
6751 __FUNCTION__);
6752 *lro = NULL;
6753 return ret;
6754 }
6755
6756 switch (ret) {
6757 case 3:
6758 initiate_new_session(*lro, buffer, ip, tcph, *tcp_len);
6759 break;
6760 case 2:
6761 update_L3L4_header(sp, *lro);
6762 break;
6763 case 1:
6764 aggregate_new_rx(*lro, ip, tcph, *tcp_len);
6765 if ((*lro)->sg_num == sp->lro_max_aggr_per_sess) {
6766 update_L3L4_header(sp, *lro);
6767 ret = 4; /* Flush the LRO */
6768 }
6769 break;
6770 default:
6771 DBG_PRINT(ERR_DBG,"%s:Dont know, can't say!!\n",
6772 __FUNCTION__);
6773 break;
6774 }
6775
6776 return ret;
6777}
6778
6779static void clear_lro_session(lro_t *lro)
6780{
6781 static u16 lro_struct_size = sizeof(lro_t);
6782
6783 memset(lro, 0, lro_struct_size);
6784}
6785
6786static void queue_rx_frame(struct sk_buff *skb)
6787{
6788 struct net_device *dev = skb->dev;
6789
6790 skb->protocol = eth_type_trans(skb, dev);
6791#ifdef CONFIG_S2IO_NAPI
6792 netif_receive_skb(skb);
6793#else
6794 netif_rx(skb);
6795#endif
6796}
6797
6798static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb,
6799 u32 tcp_len)
6800{
6801 struct sk_buff *tmp, *first = lro->parent;
6802
6803 first->len += tcp_len;
6804 first->data_len = lro->frags_len;
6805 skb_pull(skb, (skb->len - tcp_len));
6806 if ((tmp = skb_shinfo(first)->frag_list)) {
6807 while (tmp->next)
6808 tmp = tmp->next;
6809 tmp->next = skb;
6810 }
6811 else
6812 skb_shinfo(first)->frag_list = skb;
6813 sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
6814 return;
6815}
diff --git a/drivers/net/s2io.h b/drivers/net/s2io.h
index 852a6a899d07..65cc59ac71f0 100644
--- a/drivers/net/s2io.h
+++ b/drivers/net/s2io.h
@@ -78,6 +78,13 @@ int debug_level = ERR_DBG; /* Default level. */
78typedef struct { 78typedef struct {
79 unsigned long long single_ecc_errs; 79 unsigned long long single_ecc_errs;
80 unsigned long long double_ecc_errs; 80 unsigned long long double_ecc_errs;
81 /* LRO statistics */
82 unsigned long long clubbed_frms_cnt;
83 unsigned long long sending_both;
84 unsigned long long outof_sequence_pkts;
85 unsigned long long flush_max_pkts;
86 unsigned long long sum_avg_pkts_aggregated;
87 unsigned long long num_aggregations;
81} swStat_t; 88} swStat_t;
82 89
83/* The statistics block of Xena */ 90/* The statistics block of Xena */
@@ -680,6 +687,24 @@ struct msix_info_st {
680 u64 data; 687 u64 data;
681}; 688};
682 689
690/* Data structure to represent a LRO session */
691typedef struct lro {
692 struct sk_buff *parent;
693 u8 *l2h;
694 struct iphdr *iph;
695 struct tcphdr *tcph;
696 u32 tcp_next_seq;
697 u32 tcp_ack;
698 int total_len;
699 int frags_len;
700 int sg_num;
701 int in_use;
702 u16 window;
703 u32 cur_tsval;
704 u32 cur_tsecr;
705 u8 saw_ts;
706}lro_t;
707
683/* Structure representing one instance of the NIC */ 708/* Structure representing one instance of the NIC */
684struct s2io_nic { 709struct s2io_nic {
685 int rxd_mode; 710 int rxd_mode;
@@ -784,6 +809,13 @@ struct s2io_nic {
784#define XFRAME_II_DEVICE 2 809#define XFRAME_II_DEVICE 2
785 u8 device_type; 810 u8 device_type;
786 811
812#define MAX_LRO_SESSIONS 32
813 lro_t lro0_n[MAX_LRO_SESSIONS];
814 unsigned long clubbed_frms_cnt;
815 unsigned long sending_both;
816 u8 lro;
817 u16 lro_max_aggr_per_sess;
818
787#define INTA 0 819#define INTA 0
788#define MSI 1 820#define MSI 1
789#define MSI_X 2 821#define MSI_X 2
@@ -940,4 +972,10 @@ static void s2io_card_down(nic_t *nic);
940static int s2io_card_up(nic_t *nic); 972static int s2io_card_up(nic_t *nic);
941int get_xena_rev_id(struct pci_dev *pdev); 973int get_xena_rev_id(struct pci_dev *pdev);
942void restore_xmsi_data(nic_t *nic); 974void restore_xmsi_data(nic_t *nic);
975
976static int s2io_club_tcp_session(u8 *buffer, u8 **tcp, u32 *tcp_len, lro_t **lro, RxD_t *rxdp, nic_t *sp);
977static void clear_lro_session(lro_t *lro);
978static void queue_rx_frame(struct sk_buff *skb);
979static void update_L3L4_header(nic_t *sp, lro_t *lro);
980static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb, u32 tcp_len);
943#endif /* _S2IO_H */ 981#endif /* _S2IO_H */