aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Willem de Bruijn <willemb@google.com> 2018-11-30 15:32:39 -0500
committer David S. Miller <davem@davemloft.net> 2018-12-03 18:58:32 -0500
commit b5947e5d1e710c35ea281247bd27e6975250285c (patch)
tree 5654233f622c8d5cd24b9396508d52d841d50296
parent ce01a56ba3d9a56e9c7dd4662e2753b102a17d62 (diff)
udp: msg_zerocopy
Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and
interpret flag MSG_ZEROCOPY.

This patch was previously part of the zerocopy RFC patchsets. Zerocopy
is not effective at small MTU. With segmentation offload building
larger datagrams, the benefit of page flipping outweighs the cost of
generating a completion notification.

tools/testing/selftests/net/msg_zerocopy.sh after applying follow-on
test patch and making skb_orphan_frags_rx same as skb_orphan_frags:

    ipv4 udp -t 1
    tx=191312 (11938 MB) txc=0 zc=n
    rx=191312 (11938 MB)
    ipv4 udp -z -t 1
    tx=304507 (19002 MB) txc=304507 zc=y
    rx=304507 (19002 MB)
    ok
    ipv6 udp -t 1
    tx=174485 (10888 MB) txc=0 zc=n
    rx=174485 (10888 MB)
    ipv6 udp -z -t 1
    tx=294801 (18396 MB) txc=294801 zc=y
    rx=294801 (18396 MB)
    ok

Changes
  v1 -> v2
    - Fixup reverse christmas tree violation
  v2 -> v3
    - Split refcount avoidance optimization into separate patch
    - Fix refcount leak on error in fragmented case
      (thanks to Paolo Abeni for pointing this one out!)
    - Fix refcount inc on zero
    - Test sock_flag SOCK_ZEROCOPY directly in __ip_append_data.
      This is needed since commit 5cf4a8532c99 ("tcp: really ignore
      MSG_ZEROCOPY if no SO_ZEROCOPY") did the same for tcp.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/linux/skbuff.h 1
-rw-r--r-- net/core/skbuff.c 6
-rw-r--r-- net/core/sock.c 5
-rw-r--r-- net/ipv4/ip_output.c 23
-rw-r--r-- net/ipv6/ip6_output.c 23
5 files changed, 55 insertions, 3 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 73902acf2b71..04f52e719571 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -485,6 +485,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg);
485 485
486void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); 486void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
487 487
488int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
488int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, 489int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
489 struct msghdr *msg, int len, 490 struct msghdr *msg, int len,
490 struct ubuf_info *uarg); 491 struct ubuf_info *uarg);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3c814565ed7c..1350901c5cb8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1105,6 +1105,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
1105extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, 1105extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
1106 struct iov_iter *from, size_t length); 1106 struct iov_iter *from, size_t length);
1107 1107
1108int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
1109{
1110 return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
1111}
1112EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
1113
1108int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, 1114int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
1109 struct msghdr *msg, int len, 1115 struct msghdr *msg, int len,
1110 struct ubuf_info *uarg) 1116 struct ubuf_info *uarg)
diff --git a/net/core/sock.c b/net/core/sock.c
index 6d7e189e3cd9..f5bb89785e47 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1018,7 +1018,10 @@ set_rcvbuf:
1018 1018
1019 case SO_ZEROCOPY: 1019 case SO_ZEROCOPY:
1020 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1020 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1021 if (sk->sk_protocol != IPPROTO_TCP) 1021 if (!((sk->sk_type == SOCK_STREAM &&
1022 sk->sk_protocol == IPPROTO_TCP) ||
1023 (sk->sk_type == SOCK_DGRAM &&
1024 sk->sk_protocol == IPPROTO_UDP)))
1022 ret = -ENOTSUPP; 1025 ret = -ENOTSUPP;
1023 } else if (sk->sk_family != PF_RDS) { 1026 } else if (sk->sk_family != PF_RDS) {
1024 ret = -ENOTSUPP; 1027 ret = -ENOTSUPP;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5dbec21856f4..6f843aff628c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
867 unsigned int flags) 867 unsigned int flags)
868{ 868{
869 struct inet_sock *inet = inet_sk(sk); 869 struct inet_sock *inet = inet_sk(sk);
870 struct ubuf_info *uarg = NULL;
870 struct sk_buff *skb; 871 struct sk_buff *skb;
871 872
872 struct ip_options *opt = cork->opt; 873 struct ip_options *opt = cork->opt;
@@ -916,6 +917,19 @@ static int __ip_append_data(struct sock *sk,
916 (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) 917 (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
917 csummode = CHECKSUM_PARTIAL; 918 csummode = CHECKSUM_PARTIAL;
918 919
920 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
921 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
922 if (!uarg)
923 return -ENOBUFS;
924 if (rt->dst.dev->features & NETIF_F_SG &&
925 csummode == CHECKSUM_PARTIAL) {
926 paged = true;
927 } else {
928 uarg->zerocopy = 0;
929 skb_zcopy_set(skb, uarg);
930 }
931 }
932
919 cork->length += length; 933 cork->length += length;
920 934
921 /* So, what's going on in the loop below? 935 /* So, what's going on in the loop below?
@@ -1006,6 +1020,7 @@ alloc_new_skb:
1006 cork->tx_flags = 0; 1020 cork->tx_flags = 0;
1007 skb_shinfo(skb)->tskey = tskey; 1021 skb_shinfo(skb)->tskey = tskey;
1008 tskey = 0; 1022 tskey = 0;
1023 skb_zcopy_set(skb, uarg);
1009 1024
1010 /* 1025 /*
1011 * Find where to start putting bytes. 1026 * Find where to start putting bytes.
@@ -1068,7 +1083,7 @@ alloc_new_skb:
1068 err = -EFAULT; 1083 err = -EFAULT;
1069 goto error; 1084 goto error;
1070 } 1085 }
1071 } else { 1086 } else if (!uarg || !uarg->zerocopy) {
1072 int i = skb_shinfo(skb)->nr_frags; 1087 int i = skb_shinfo(skb)->nr_frags;
1073 1088
1074 err = -ENOMEM; 1089 err = -ENOMEM;
@@ -1098,6 +1113,10 @@ alloc_new_skb:
1098 skb->data_len += copy; 1113 skb->data_len += copy;
1099 skb->truesize += copy; 1114 skb->truesize += copy;
1100 wmem_alloc_delta += copy; 1115 wmem_alloc_delta += copy;
1116 } else {
1117 err = skb_zerocopy_iter_dgram(skb, from, copy);
1118 if (err < 0)
1119 goto error;
1101 } 1120 }
1102 offset += copy; 1121 offset += copy;
1103 length -= copy; 1122 length -= copy;
@@ -1105,11 +1124,13 @@ alloc_new_skb:
1105 1124
1106 if (wmem_alloc_delta) 1125 if (wmem_alloc_delta)
1107 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1126 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1127 sock_zerocopy_put(uarg);
1108 return 0; 1128 return 0;
1109 1129
1110error_efault: 1130error_efault:
1111 err = -EFAULT; 1131 err = -EFAULT;
1112error: 1132error:
1133 sock_zerocopy_put_abort(uarg);
1113 cork->length -= length; 1134 cork->length -= length;
1114 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1135 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1115 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1136 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 827a3f5ff3bb..7df04d20a91f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk,
1245{ 1245{
1246 struct sk_buff *skb, *skb_prev = NULL; 1246 struct sk_buff *skb, *skb_prev = NULL;
1247 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1247 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1248 struct ubuf_info *uarg = NULL;
1248 int exthdrlen = 0; 1249 int exthdrlen = 0;
1249 int dst_exthdrlen = 0; 1250 int dst_exthdrlen = 0;
1250 int hh_len; 1251 int hh_len;
@@ -1322,6 +1323,19 @@ emsgsize:
1322 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1323 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1323 csummode = CHECKSUM_PARTIAL; 1324 csummode = CHECKSUM_PARTIAL;
1324 1325
1326 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1327 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1328 if (!uarg)
1329 return -ENOBUFS;
1330 if (rt->dst.dev->features & NETIF_F_SG &&
1331 csummode == CHECKSUM_PARTIAL) {
1332 paged = true;
1333 } else {
1334 uarg->zerocopy = 0;
1335 skb_zcopy_set(skb, uarg);
1336 }
1337 }
1338
1325 /* 1339 /*
1326 * Let's try using as much space as possible. 1340 * Let's try using as much space as possible.
1327 * Use MTU if total length of the message fits into the MTU. 1341 * Use MTU if total length of the message fits into the MTU.
@@ -1445,6 +1459,7 @@ alloc_new_skb:
1445 cork->tx_flags = 0; 1459 cork->tx_flags = 0;
1446 skb_shinfo(skb)->tskey = tskey; 1460 skb_shinfo(skb)->tskey = tskey;
1447 tskey = 0; 1461 tskey = 0;
1462 skb_zcopy_set(skb, uarg);
1448 1463
1449 /* 1464 /*
1450 * Find where to start putting bytes 1465 * Find where to start putting bytes
@@ -1506,7 +1521,7 @@ alloc_new_skb:
1506 err = -EFAULT; 1521 err = -EFAULT;
1507 goto error; 1522 goto error;
1508 } 1523 }
1509 } else { 1524 } else if (!uarg || !uarg->zerocopy) {
1510 int i = skb_shinfo(skb)->nr_frags; 1525 int i = skb_shinfo(skb)->nr_frags;
1511 1526
1512 err = -ENOMEM; 1527 err = -ENOMEM;
@@ -1536,6 +1551,10 @@ alloc_new_skb:
1536 skb->data_len += copy; 1551 skb->data_len += copy;
1537 skb->truesize += copy; 1552 skb->truesize += copy;
1538 wmem_alloc_delta += copy; 1553 wmem_alloc_delta += copy;
1554 } else {
1555 err = skb_zerocopy_iter_dgram(skb, from, copy);
1556 if (err < 0)
1557 goto error;
1539 } 1558 }
1540 offset += copy; 1559 offset += copy;
1541 length -= copy; 1560 length -= copy;
@@ -1543,11 +1562,13 @@ alloc_new_skb:
1543 1562
1544 if (wmem_alloc_delta) 1563 if (wmem_alloc_delta)
1545 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1564 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1565 sock_zerocopy_put(uarg);
1546 return 0; 1566 return 0;
1547 1567
1548error_efault: 1568error_efault:
1549 err = -EFAULT; 1569 err = -EFAULT;
1550error: 1570error:
1571 sock_zerocopy_put_abort(uarg);
1551 cork->length -= length; 1572 cork->length -= length;
1552 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1573 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1553 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1574 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);