aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/ethernet/ibm/ibmveth.c
diff options
context:
space:
mode:
authorSivakumar Krishnasamy <ksiva@linux.vnet.ibm.com>2017-05-19 05:30:38 -0400
committerDavid S. Miller <davem@davemloft.net>2017-05-21 13:29:01 -0400
commit66aa0678efc29abd2ab02a09b23f9a8bc9f12a6c (patch)
tree12562d70fbd6efba4bde85eba36261b1e144f375 /drivers/net/ethernet/ibm/ibmveth.c
parent76e7f31d3f6d48dd4166e5df7f154280323bcb11 (diff)
ibmveth: Support to enable LSO/CSO for Trunk VEA.
Current largesend and checksum offload feature in ibmveth driver, - Source VM sends the TCP packets with ip_summed field set as CHECKSUM_PARTIAL and TCP pseudo header checksum is placed in checksum field - CHECKSUM_PARTIAL flag in SKB will enable ibmveth driver to mark "no checksum" and "checksum good" bits in transmit buffer descriptor before the packet is delivered to pseries PowerVM Hypervisor - If ibmveth has largesend capability enabled, transmit buffer descriptors are marked accordingly before packet is delivered to Hypervisor (along with mss value for packets with length > MSS) - Destination VM's ibmveth driver receives the packet with "checksum good" bit set and so, SKB's ip_summed field is set with CHECKSUM_UNNECESSARY - If "largesend" bit was on, mss value is copied from receive descriptor into SKB's gso_size and other flags are appropriately set for packets > MSS size - The packet is now successfully delivered up the stack in destination VM The offloads described above work fine for TCP communication among VMs in the same pseries server ( VM A <=> PowerVM Hypervisor <=> VM B ) We are now enabling support for OVS in pseries PowerVM environment. One of our requirements is to have ibmveth drivers configured in "Trunk" mode when they are used with OVS. This is because PowerVM Hypervisor will no longer bridge the packets between VMs, instead the packets are delivered to IO Server which hosts OVS to bridge them between VMs or to external networks (flow shown below), VM A <=> PowerVM Hypervisor <=> IO Server(OVS) <=> PowerVM Hypervisor <=> VM B In "IO server" the packet is received by inbound Trunk ibmveth and then delivered to OVS, which is then bridged to outbound Trunk ibmveth (shown below), Inbound Trunk ibmveth <=> OVS <=> Outbound Trunk ibmveth In this model, we hit the following issues which impacted the VM communication performance, - Issue 1: ibmveth doesn't support largesend and checksum offload features when configured as "Trunk". 
Driver has explicit checks to prevent enabling these offloads. - Issue 2: SYN packet drops seen at destination VM. When the packet originates, it has CHECKSUM_PARTIAL flag set and as it gets delivered to IO server's inbound Trunk ibmveth, on validating "checksum good" bits in ibmveth receive routine, SKB's ip_summed field is set with CHECKSUM_UNNECESSARY flag. This packet is then bridged by OVS (or Linux Bridge) and delivered to outbound Trunk ibmveth. At this point the outbound ibmveth transmit routine will not set "no checksum" and "checksum good" bits in transmit buffer descriptor, as it does so only when the ip_summed field is CHECKSUM_PARTIAL. When this packet gets delivered to destination VM, TCP layer receives the packet with checksum value of 0 and with no checksum related flags in ip_summed field. This leads to packet drops. So, TCP connections never go through. - Issue 3: First packet of a TCP connection will be dropped, if there is no OVS flow cached in the datapath. OVS while trying to identify the flow, computes the checksum. The computed checksum will be invalid at the receiving end, as ibmveth transmit routine zeroes out the pseudo checksum value in the packet. This leads to packet drop. - Issue 4: ibmveth driver doesn't have support for SKB's with frag_list. When Physical NIC has GRO enabled and when OVS bridges these packets, OVS vport send code will end up calling dev_queue_xmit, which in turn calls validate_xmit_skb. In validate_xmit_skb routine, the larger packets will get segmented into MSS sized segments, if SKB has a frag_list and if the driver to which they are delivered doesn't support NETIF_F_FRAGLIST feature. This patch addresses the above four issues, thereby enabling end to end largesend and checksum offload support for better performance. - Fix for Issue 1 : Remove checks which prevent enabling TCP largesend and checksum offloads. 
- Fix for Issue 2 : When ibmveth receives a packet with "checksum good" bit set and if it's configured in Trunk mode, set appropriate SKB fields using skb_partial_csum_set (ip_summed field is set with CHECKSUM_PARTIAL) - Fix for Issue 3: Recompute the pseudo header checksum before sending the SKB up the stack. - Fix for Issue 4: Linearize the SKBs with frag_list. Though we end up allocating buffers and copying data, this fix gives up to 4X throughput increase. Note: All these fixes need to be applied together as fixing just one of them will lead to other issues immediately (especially for Issues 1,2 & 3). Signed-off-by: Sivakumar Krishnasamy <ksiva@linux.vnet.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/ibm/ibmveth.c')
-rw-r--r--drivers/net/ethernet/ibm/ibmveth.c107
1 files changed, 89 insertions, 18 deletions
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index 72ab7b6bf20b..9a74c4e2e193 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -46,6 +46,8 @@
46#include <asm/vio.h> 46#include <asm/vio.h>
47#include <asm/iommu.h> 47#include <asm/iommu.h>
48#include <asm/firmware.h> 48#include <asm/firmware.h>
49#include <net/tcp.h>
50#include <net/ip6_checksum.h>
49 51
50#include "ibmveth.h" 52#include "ibmveth.h"
51 53
@@ -808,8 +810,7 @@ static int ibmveth_set_csum_offload(struct net_device *dev, u32 data)
808 810
809 ret = h_illan_attributes(adapter->vdev->unit_address, 0, 0, &ret_attr); 811 ret = h_illan_attributes(adapter->vdev->unit_address, 0, 0, &ret_attr);
810 812
811 if (ret == H_SUCCESS && !(ret_attr & IBMVETH_ILLAN_ACTIVE_TRUNK) && 813 if (ret == H_SUCCESS &&
812 !(ret_attr & IBMVETH_ILLAN_TRUNK_PRI_MASK) &&
813 (ret_attr & IBMVETH_ILLAN_PADDED_PKT_CSUM)) { 814 (ret_attr & IBMVETH_ILLAN_PADDED_PKT_CSUM)) {
814 ret4 = h_illan_attributes(adapter->vdev->unit_address, clr_attr, 815 ret4 = h_illan_attributes(adapter->vdev->unit_address, clr_attr,
815 set_attr, &ret_attr); 816 set_attr, &ret_attr);
@@ -1040,6 +1041,15 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb,
1040 dma_addr_t dma_addr; 1041 dma_addr_t dma_addr;
1041 unsigned long mss = 0; 1042 unsigned long mss = 0;
1042 1043
1044 /* veth doesn't handle frag_list, so linearize the skb.
1045 * When GRO is enabled SKB's can have frag_list.
1046 */
1047 if (adapter->is_active_trunk &&
1048 skb_has_frag_list(skb) && __skb_linearize(skb)) {
1049 netdev->stats.tx_dropped++;
1050 goto out;
1051 }
1052
1043 /* 1053 /*
1044 * veth handles a maximum of 6 segments including the header, so 1054 * veth handles a maximum of 6 segments including the header, so
1045 * we have to linearize the skb if there are more than this. 1055 * we have to linearize the skb if there are more than this.
@@ -1064,9 +1074,6 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb,
1064 1074
1065 desc_flags = IBMVETH_BUF_VALID; 1075 desc_flags = IBMVETH_BUF_VALID;
1066 1076
1067 if (skb_is_gso(skb) && adapter->fw_large_send_support)
1068 desc_flags |= IBMVETH_BUF_LRG_SND;
1069
1070 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1077 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1071 unsigned char *buf = skb_transport_header(skb) + 1078 unsigned char *buf = skb_transport_header(skb) +
1072 skb->csum_offset; 1079 skb->csum_offset;
@@ -1076,6 +1083,9 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb,
1076 /* Need to zero out the checksum */ 1083 /* Need to zero out the checksum */
1077 buf[0] = 0; 1084 buf[0] = 0;
1078 buf[1] = 0; 1085 buf[1] = 0;
1086
1087 if (skb_is_gso(skb) && adapter->fw_large_send_support)
1088 desc_flags |= IBMVETH_BUF_LRG_SND;
1079 } 1089 }
1080 1090
1081retry_bounce: 1091retry_bounce:
@@ -1128,7 +1138,7 @@ retry_bounce:
1128 descs[i+1].fields.address = dma_addr; 1138 descs[i+1].fields.address = dma_addr;
1129 } 1139 }
1130 1140
1131 if (skb_is_gso(skb)) { 1141 if (skb->ip_summed == CHECKSUM_PARTIAL && skb_is_gso(skb)) {
1132 if (adapter->fw_large_send_support) { 1142 if (adapter->fw_large_send_support) {
1133 mss = (unsigned long)skb_shinfo(skb)->gso_size; 1143 mss = (unsigned long)skb_shinfo(skb)->gso_size;
1134 adapter->tx_large_packets++; 1144 adapter->tx_large_packets++;
@@ -1232,6 +1242,71 @@ static void ibmveth_rx_mss_helper(struct sk_buff *skb, u16 mss, int lrg_pkt)
1232 } 1242 }
1233} 1243}
1234 1244
/* ibmveth_rx_csum_helper - repair checksum state on a received skb that the
 * hypervisor flagged "checksum good".
 *
 * Called from the rx path (ibmveth_poll) after skb->ip_summed has already
 * been set to CHECKSUM_UNNECESSARY; skb->data is treated as the start of the
 * L3 (IP) header.
 *
 * For IPv4, rebuilds the IP header checksum if the hypervisor left it as the
 * 0xffff sentinel (large-send case). For trunk adapters carrying TCP, also
 * restores the TCP pseudo header checksum (zeroed in the transmit path) and
 * flips the skb to CHECKSUM_PARTIAL so OVS/bridging code computes a valid
 * checksum.
 *
 * NOTE(review): skb->protocol is examined directly, so VLAN-tagged frames
 * (ETH_P_8021Q) fall through both branches untouched — confirm the trunk
 * configuration never delivers tagged frames here.
 */
static void ibmveth_rx_csum_helper(struct sk_buff *skb,
				   struct ibmveth_adapter *adapter)
{
	struct iphdr *iph = NULL;
	struct ipv6hdr *iph6 = NULL;
	__be16 skb_proto = 0;
	u16 iphlen = 0;
	u16 iph_proto = 0;
	u16 tcphdrlen = 0;

	skb_proto = be16_to_cpu(skb->protocol);

	if (skb_proto == ETH_P_IP) {
		iph = (struct iphdr *)skb->data;

		/* If the IP checksum is not offloaded and if the packet
		 * is large send, the checksum must be rebuilt.
		 */
		if (iph->check == 0xffff) {
			iph->check = 0;
			iph->check = ip_fast_csum((unsigned char *)iph,
						  iph->ihl);
		}

		iphlen = iph->ihl * 4;
		iph_proto = iph->protocol;
	} else if (skb_proto == ETH_P_IPV6) {
		/* NOTE(review): iphlen is fixed at the base IPv6 header size,
		 * so packets with extension headers would mis-locate the TCP
		 * header below — assumed not to occur here; confirm.
		 */
		iph6 = (struct ipv6hdr *)skb->data;
		iphlen = sizeof(struct ipv6hdr);
		iph_proto = iph6->nexthdr;
	}

	/* In OVS environment, when a flow is not cached, specifically for a
	 * new TCP connection, the first packet information is passed up
	 * the user space for finding a flow. During this process, OVS computes
	 * checksum on the first packet when CHECKSUM_PARTIAL flag is set.
	 *
	 * Given that we zeroed out TCP checksum field in transmit path
	 * (refer ibmveth_start_xmit routine) as we set "no checksum bit",
	 * OVS computed checksum will be incorrect w/o TCP pseudo checksum
	 * in the packet. This leads to OVS dropping the packet and hence
	 * TCP retransmissions are seen.
	 *
	 * So, re-compute TCP pseudo header checksum.
	 */
	if (iph_proto == IPPROTO_TCP && adapter->is_active_trunk) {
		struct tcphdr *tcph = (struct tcphdr *)(skb->data + iphlen);

		/* Length covered by the pseudo header checksum:
		 * TCP header plus payload.
		 */
		tcphdrlen = skb->len - iphlen;

		/* Recompute TCP pseudo header checksum */
		if (skb_proto == ETH_P_IP)
			tcph->check = ~csum_tcpudp_magic(iph->saddr,
					iph->daddr, tcphdrlen, iph_proto, 0);
		else if (skb_proto == ETH_P_IPV6)
			tcph->check = ~csum_ipv6_magic(&iph6->saddr,
					&iph6->daddr, tcphdrlen, iph_proto, 0);

		/* Setup SKB fields for checksum offload: ip_summed becomes
		 * CHECKSUM_PARTIAL with csum_start/csum_offset pointing at
		 * tcph->check.
		 */
		skb_partial_csum_set(skb, iphlen,
				     offsetof(struct tcphdr, check));
		skb_reset_network_header(skb);
	}
}
1309
1235static int ibmveth_poll(struct napi_struct *napi, int budget) 1310static int ibmveth_poll(struct napi_struct *napi, int budget)
1236{ 1311{
1237 struct ibmveth_adapter *adapter = 1312 struct ibmveth_adapter *adapter =
@@ -1239,7 +1314,6 @@ static int ibmveth_poll(struct napi_struct *napi, int budget)
1239 struct net_device *netdev = adapter->netdev; 1314 struct net_device *netdev = adapter->netdev;
1240 int frames_processed = 0; 1315 int frames_processed = 0;
1241 unsigned long lpar_rc; 1316 unsigned long lpar_rc;
1242 struct iphdr *iph;
1243 u16 mss = 0; 1317 u16 mss = 0;
1244 1318
1245restart_poll: 1319restart_poll:
@@ -1297,17 +1371,7 @@ restart_poll:
1297 1371
1298 if (csum_good) { 1372 if (csum_good) {
1299 skb->ip_summed = CHECKSUM_UNNECESSARY; 1373 skb->ip_summed = CHECKSUM_UNNECESSARY;
1300 if (be16_to_cpu(skb->protocol) == ETH_P_IP) { 1374 ibmveth_rx_csum_helper(skb, adapter);
1301 iph = (struct iphdr *)skb->data;
1302
1303 /* If the IP checksum is not offloaded and if the packet
1304 * is large send, the checksum must be rebuilt.
1305 */
1306 if (iph->check == 0xffff) {
1307 iph->check = 0;
1308 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
1309 }
1310 }
1311 } 1375 }
1312 1376
1313 if (length > netdev->mtu + ETH_HLEN) { 1377 if (length > netdev->mtu + ETH_HLEN) {
@@ -1626,6 +1690,13 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
1626 netdev->hw_features |= NETIF_F_TSO; 1690 netdev->hw_features |= NETIF_F_TSO;
1627 } 1691 }
1628 1692
1693 adapter->is_active_trunk = false;
1694 if (ret == H_SUCCESS && (ret_attr & IBMVETH_ILLAN_ACTIVE_TRUNK)) {
1695 adapter->is_active_trunk = true;
1696 netdev->hw_features |= NETIF_F_FRAGLIST;
1697 netdev->features |= NETIF_F_FRAGLIST;
1698 }
1699
1629 netdev->min_mtu = IBMVETH_MIN_MTU; 1700 netdev->min_mtu = IBMVETH_MIN_MTU;
1630 netdev->max_mtu = ETH_MAX_MTU; 1701 netdev->max_mtu = ETH_MAX_MTU;
1631 1702