summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWillem de Bruijn <willemb@google.com>2018-04-26 13:42:17 -0400
committerDavid S. Miller <davem@davemloft.net>2018-04-26 15:08:04 -0400
commitbec1f6f697362c5bc635dacd7ac8499d0a10a4e7 (patch)
tree90f8dcb39e9c7b62034c8010e054cc237940339a
parentee80d1ebe5ba7f4bd74959c873119175a4fc08d3 (diff)
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can concatenate and send at once the payload of multiple datagrams with the same destination. To set segment size, the caller sets socket option UDP_SEGMENT to the length of each discrete payload. This value must be smaller than or equal to the relevant MTU. A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a per send call basis. Total byte length may then exceed MTU. If not an exact multiple of segment size, the last segment will be shorter. The implementation adds a gso_size field to the udp socket, ip(v6) cmsg cookie and inet_cork structure to be able to set the value at setsockopt or cmsg time and to work with both lockless and corked paths. Initial benchmark numbers show UDP GSO about as expensive as TCP GSO. tcp tso 3197 MB/s 54232 msg/s 54232 calls/s 6,457,754,262 cycles tcp gso 1765 MB/s 29939 msg/s 29939 calls/s 11,203,021,806 cycles tcp without tso/gso * 739 MB/s 12548 msg/s 12548 calls/s 11,205,483,630 cycles udp 876 MB/s 14873 msg/s 624666 calls/s 11,205,777,429 cycles udp gso 2139 MB/s 36282 msg/s 36282 calls/s 11,204,374,561 cycles [*] after reverting commit 0a6b2a1dc2a2 ("tcp: switch to GSO being always on") Measured total system cycles ('-a') for one core while pinning both the network receive path and benchmark process to that core: perf stat -a -C 12 -e cycles \ ./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4 Note the reduction in calls/s with GSO. Bytes per syscall increases from 1470 to 61818. Signed-off-by: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/udp.h3
-rw-r--r--include/net/inet_sock.h1
-rw-r--r--include/net/ip.h1
-rw-r--r--include/net/ipv6.h1
-rw-r--r--include/uapi/linux/udp.h1
-rw-r--r--net/ipv4/ip_output.c9
-rw-r--r--net/ipv4/udp.c33
-rw-r--r--net/ipv6/ip6_output.c6
-rw-r--r--net/ipv6/udp.c23
9 files changed, 67 insertions, 11 deletions
diff --git a/include/linux/udp.h b/include/linux/udp.h
index eaea63bc79bb..ca840345571b 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,6 +55,7 @@ struct udp_sock {
55 * when the socket is uncorked. 55 * when the socket is uncorked.
56 */ 56 */
57 __u16 len; /* total length of pending frames */ 57 __u16 len; /* total length of pending frames */
58 __u16 gso_size;
58 /* 59 /*
59 * Fields specific to UDP-Lite. 60 * Fields specific to UDP-Lite.
60 */ 61 */
@@ -87,6 +88,8 @@ struct udp_sock {
87 int forward_deficit; 88 int forward_deficit;
88}; 89};
89 90
91#define UDP_MAX_SEGMENTS (1 << 6UL)
92
90static inline struct udp_sock *udp_sk(const struct sock *sk) 93static inline struct udp_sock *udp_sk(const struct sock *sk)
91{ 94{
92 return (struct udp_sock *)sk; 95 return (struct udp_sock *)sk;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0a671c32d6b9..83d5b3c2ac42 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -147,6 +147,7 @@ struct inet_cork {
147 __u8 ttl; 147 __u8 ttl;
148 __s16 tos; 148 __s16 tos;
149 char priority; 149 char priority;
150 __u16 gso_size;
150}; 151};
151 152
152struct inet_cork_full { 153struct inet_cork_full {
diff --git a/include/net/ip.h b/include/net/ip.h
index 7ec543a64bbc..bada1f1f871e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -76,6 +76,7 @@ struct ipcm_cookie {
76 __u8 ttl; 76 __u8 ttl;
77 __s16 tos; 77 __s16 tos;
78 char priority; 78 char priority;
79 __u16 gso_size;
79}; 80};
80 81
81#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) 82#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0dd722cab037..0a872a7c33c8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -298,6 +298,7 @@ struct ipcm6_cookie {
298 __s16 tclass; 298 __s16 tclass;
299 __s8 dontfrag; 299 __s8 dontfrag;
300 struct ipv6_txoptions *opt; 300 struct ipv6_txoptions *opt;
301 __u16 gso_size;
301}; 302};
302 303
303static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) 304static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index efb7b5991c2f..09d00f8c442b 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -32,6 +32,7 @@ struct udphdr {
32#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ 32#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
33#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ 33#define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */
34#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */ 34#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */
35#define UDP_SEGMENT 103 /* Set GSO segmentation size */
35 36
36/* UDP encapsulation types */ 37/* UDP encapsulation types */
37#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ 38#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2883ff1e909c..da4abbee10f7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,7 +882,8 @@ static int __ip_append_data(struct sock *sk,
882 skb = skb_peek_tail(queue); 882 skb = skb_peek_tail(queue);
883 883
884 exthdrlen = !skb ? rt->dst.header_len : 0; 884 exthdrlen = !skb ? rt->dst.header_len : 0;
885 mtu = cork->fragsize; 885 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
886
886 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && 887 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
887 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) 888 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
888 tskey = sk->sk_tskey++; 889 tskey = sk->sk_tskey++;
@@ -906,7 +907,7 @@ static int __ip_append_data(struct sock *sk,
906 if (transhdrlen && 907 if (transhdrlen &&
907 length + fragheaderlen <= mtu && 908 length + fragheaderlen <= mtu &&
908 rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && 909 rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
909 !(flags & MSG_MORE) && 910 (!(flags & MSG_MORE) || cork->gso_size) &&
910 !exthdrlen) 911 !exthdrlen)
911 csummode = CHECKSUM_PARTIAL; 912 csummode = CHECKSUM_PARTIAL;
912 913
@@ -1135,6 +1136,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1135 *rtp = NULL; 1136 *rtp = NULL;
1136 cork->fragsize = ip_sk_use_pmtu(sk) ? 1137 cork->fragsize = ip_sk_use_pmtu(sk) ?
1137 dst_mtu(&rt->dst) : rt->dst.dev->mtu; 1138 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1139
1140 cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
1138 cork->dst = &rt->dst; 1141 cork->dst = &rt->dst;
1139 cork->length = 0; 1142 cork->length = 0;
1140 cork->ttl = ipc->ttl; 1143 cork->ttl = ipc->ttl;
@@ -1214,7 +1217,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1214 return -EOPNOTSUPP; 1217 return -EOPNOTSUPP;
1215 1218
1216 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1219 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1217 mtu = cork->fragsize; 1220 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
1218 1221
1219 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1222 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1220 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1223 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6b9d8017b319..bda022c5480b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
757} 757}
758EXPORT_SYMBOL(udp_set_csum); 758EXPORT_SYMBOL(udp_set_csum);
759 759
760static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) 760static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
761 struct inet_cork *cork)
761{ 762{
762 struct sock *sk = skb->sk; 763 struct sock *sk = skb->sk;
763 struct inet_sock *inet = inet_sk(sk); 764 struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +778,21 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
777 uh->len = htons(len); 778 uh->len = htons(len);
778 uh->check = 0; 779 uh->check = 0;
779 780
781 if (cork->gso_size) {
782 const int hlen = skb_network_header_len(skb) +
783 sizeof(struct udphdr);
784
785 if (hlen + cork->gso_size > cork->fragsize)
786 return -EINVAL;
787 if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
788 return -EINVAL;
789 if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
790 return -EIO;
791
792 skb_shinfo(skb)->gso_size = cork->gso_size;
793 skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
794 }
795
780 if (is_udplite) /* UDP-Lite */ 796 if (is_udplite) /* UDP-Lite */
781 csum = udplite_csum(skb); 797 csum = udplite_csum(skb);
782 798
@@ -828,7 +844,7 @@ int udp_push_pending_frames(struct sock *sk)
828 if (!skb) 844 if (!skb)
829 goto out; 845 goto out;
830 846
831 err = udp_send_skb(skb, fl4); 847 err = udp_send_skb(skb, fl4, &inet->cork.base);
832 848
833out: 849out:
834 up->len = 0; 850 up->len = 0;
@@ -922,6 +938,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
922 ipc.sockc.tsflags = sk->sk_tsflags; 938 ipc.sockc.tsflags = sk->sk_tsflags;
923 ipc.addr = inet->inet_saddr; 939 ipc.addr = inet->inet_saddr;
924 ipc.oif = sk->sk_bound_dev_if; 940 ipc.oif = sk->sk_bound_dev_if;
941 ipc.gso_size = up->gso_size;
925 942
926 if (msg->msg_controllen) { 943 if (msg->msg_controllen) {
927 err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); 944 err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
@@ -1037,7 +1054,7 @@ back_from_confirm:
1037 &cork, msg->msg_flags); 1054 &cork, msg->msg_flags);
1038 err = PTR_ERR(skb); 1055 err = PTR_ERR(skb);
1039 if (!IS_ERR_OR_NULL(skb)) 1056 if (!IS_ERR_OR_NULL(skb))
1040 err = udp_send_skb(skb, fl4); 1057 err = udp_send_skb(skb, fl4, &cork);
1041 goto out; 1058 goto out;
1042 } 1059 }
1043 1060
@@ -2367,6 +2384,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
2367 up->no_check6_rx = valbool; 2384 up->no_check6_rx = valbool;
2368 break; 2385 break;
2369 2386
2387 case UDP_SEGMENT:
2388 if (val < 0 || val > USHRT_MAX)
2389 return -EINVAL;
2390 up->gso_size = val;
2391 break;
2392
2370 /* 2393 /*
2371 * UDP-Lite's partial checksum coverage (RFC 3828). 2394 * UDP-Lite's partial checksum coverage (RFC 3828).
2372 */ 2395 */
@@ -2457,6 +2480,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
2457 val = up->no_check6_rx; 2480 val = up->no_check6_rx;
2458 break; 2481 break;
2459 2482
2483 case UDP_SEGMENT:
2484 val = up->gso_size;
2485 break;
2486
2460 /* The following two cannot be changed on UDP sockets, the return is 2487 /* The following two cannot be changed on UDP sockets, the return is
2461 * always 0 (which corresponds to the full checksum coverage of UDP). */ 2488 * always 0 (which corresponds to the full checksum coverage of UDP). */
2462 case UDPLITE_SEND_CSCOV: 2489 case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7fa1db447405..a1c4a78132d2 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1240,6 +1240,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1240 if (mtu < IPV6_MIN_MTU) 1240 if (mtu < IPV6_MIN_MTU)
1241 return -EINVAL; 1241 return -EINVAL;
1242 cork->base.fragsize = mtu; 1242 cork->base.fragsize = mtu;
1243 cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
1244
1243 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1245 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1244 cork->base.flags |= IPCORK_ALLFRAG; 1246 cork->base.flags |= IPCORK_ALLFRAG;
1245 cork->base.length = 0; 1247 cork->base.length = 0;
@@ -1281,7 +1283,7 @@ static int __ip6_append_data(struct sock *sk,
1281 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1283 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1282 } 1284 }
1283 1285
1284 mtu = cork->fragsize; 1286 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1285 orig_mtu = mtu; 1287 orig_mtu = mtu;
1286 1288
1287 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1289 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1329,7 +1331,7 @@ emsgsize:
1329 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1331 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1330 headersize == sizeof(struct ipv6hdr) && 1332 headersize == sizeof(struct ipv6hdr) &&
1331 length <= mtu - headersize && 1333 length <= mtu - headersize &&
1332 !(flags & MSG_MORE) && 1334 (!(flags & MSG_MORE) || cork->gso_size) &&
1333 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1335 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1334 csummode = CHECKSUM_PARTIAL; 1336 csummode = CHECKSUM_PARTIAL;
1335 1337
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 824797f8d1ab..86b7dd58d4b4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1023,7 +1023,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
1023 * Sending 1023 * Sending
1024 */ 1024 */
1025 1025
1026static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6) 1026static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
1027 struct inet_cork *cork)
1027{ 1028{
1028 struct sock *sk = skb->sk; 1029 struct sock *sk = skb->sk;
1029 struct udphdr *uh; 1030 struct udphdr *uh;
@@ -1042,6 +1043,21 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
1042 uh->len = htons(len); 1043 uh->len = htons(len);
1043 uh->check = 0; 1044 uh->check = 0;
1044 1045
1046 if (cork->gso_size) {
1047 const int hlen = skb_network_header_len(skb) +
1048 sizeof(struct udphdr);
1049
1050 if (hlen + cork->gso_size > cork->fragsize)
1051 return -EINVAL;
1052 if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
1053 return -EINVAL;
1054 if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
1055 return -EIO;
1056
1057 skb_shinfo(skb)->gso_size = cork->gso_size;
1058 skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
1059 }
1060
1045 if (is_udplite) 1061 if (is_udplite)
1046 csum = udplite_csum(skb); 1062 csum = udplite_csum(skb);
1047 else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */ 1063 else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
@@ -1093,7 +1109,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
1093 if (!skb) 1109 if (!skb)
1094 goto out; 1110 goto out;
1095 1111
1096 err = udp_v6_send_skb(skb, &fl6); 1112 err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
1097 1113
1098out: 1114out:
1099 up->len = 0; 1115 up->len = 0;
@@ -1127,6 +1143,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1127 ipc6.hlimit = -1; 1143 ipc6.hlimit = -1;
1128 ipc6.tclass = -1; 1144 ipc6.tclass = -1;
1129 ipc6.dontfrag = -1; 1145 ipc6.dontfrag = -1;
1146 ipc6.gso_size = up->gso_size;
1130 sockc.tsflags = sk->sk_tsflags; 1147 sockc.tsflags = sk->sk_tsflags;
1131 1148
1132 /* destination address check */ 1149 /* destination address check */
@@ -1333,7 +1350,7 @@ back_from_confirm:
1333 msg->msg_flags, &cork, &sockc); 1350 msg->msg_flags, &cork, &sockc);
1334 err = PTR_ERR(skb); 1351 err = PTR_ERR(skb);
1335 if (!IS_ERR_OR_NULL(skb)) 1352 if (!IS_ERR_OR_NULL(skb))
1336 err = udp_v6_send_skb(skb, &fl6); 1353 err = udp_v6_send_skb(skb, &fl6, &cork.base);
1337 goto out; 1354 goto out;
1338 } 1355 }
1339 1356