author	Eric Dumazet <edumazet@google.com>	2013-12-06 01:36:05 -0500
committer	David S. Miller <davem@davemloft.net>	2013-12-06 12:51:41 -0500
commit	f54b311142a92ea2e42598e347b84e1655caf8e3
tree	64976e75ab165d657053f025bf6776c0a07d2a55
parent	d8535a0a02d6e772122339c131151dfaf007866b
tcp: auto corking
With the introduction of TCP Small Queues, TSO auto sizing, and TCP
pacing, we can implement Automatic Corking in the kernel, to help
applications doing small write()/sendmsg() to TCP sockets.

The idea is to change tcp_push() to check if the current skb payload is
under the skb optimal size (a multiple of MSS bytes).

If under 'size_goal', and at least one packet is still in Qdisc or NIC
TX queues, set the TCP Small Queue Throttled bit, so that the push will
be delayed up to TX completion time. This delay might allow the
application to coalesce more bytes into the skb in following
write()/sendmsg()/sendfile() system calls.

The exact duration of the delay depends on the dynamics of the system,
and might be zero if no packet for this flow is actually held in Qdisc
or NIC TX ring. Using FQ/pacing is a way to increase the probability of
autocorking being triggered.

Add a new sysctl (/proc/sys/net/ipv4/tcp_autocorking) to control this
feature, and default it to 1 (enabled).

Add a new SNMP counter: nstat -a | grep TcpExtTCPAutoCorking
This counter is incremented every time we detected that an skb was
underused and its flush was deferred.

Tested:

Interesting effects when using line-buffered commands under ssh;
excellent performance results in terms of cpu usage and total
throughput.

lpq83:~# echo 1 >/proc/sys/net/ipv4/tcp_autocorking
lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128
9410.39

 Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128':

      35209.439626 task-clock                #    2.901 CPUs utilized
             2,294 context-switches          #    0.065 K/sec
               101 CPU-migrations            #    0.003 K/sec
             4,079 page-faults               #    0.116 K/sec
    97,923,241,298 cycles                    #    2.781 GHz                     [83.31%]
    51,832,908,236 stalled-cycles-frontend   #   52.93% frontend cycles idle    [83.30%]
    25,697,986,603 stalled-cycles-backend    #   26.24% backend  cycles idle    [66.70%]
   102,225,978,536 instructions              #    1.04  insns per cycle
                                             #    0.51  stalled cycles per insn [83.38%]
    18,657,696,819 branches                  #  529.906 M/sec                   [83.29%]
        91,679,646 branch-misses             #    0.49% of all branches         [83.40%]

      12.136204899 seconds time elapsed

lpq83:~# echo 0 >/proc/sys/net/ipv4/tcp_autocorking
lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128
6624.89

 Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128':

      40045.864494 task-clock                #    3.301 CPUs utilized
               171 context-switches          #    0.004 K/sec
                53 CPU-migrations            #    0.001 K/sec
             4,080 page-faults               #    0.102 K/sec
   111,340,458,645 cycles                    #    2.780 GHz                     [83.34%]
    61,778,039,277 stalled-cycles-frontend   #   55.49% frontend cycles idle    [83.31%]
    29,295,522,759 stalled-cycles-backend    #   26.31% backend  cycles idle    [66.67%]
   108,654,349,355 instructions              #    0.98  insns per cycle
                                             #    0.57  stalled cycles per insn [83.34%]
    19,552,170,748 branches                  #  488.244 M/sec                   [83.34%]
       157,875,417 branch-misses             #    0.81% of all branches         [83.34%]

      12.130267788 seconds time elapsed

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
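The small-write pattern this patch targets is easy to reproduce outside
of super_netperf. A minimal sender along these lines issues the same
kind of consecutive 128-byte write() calls (a hypothetical test
program, not part of this patch; the port is a placeholder):

	/* small_writes.c - one TCP flow, many 128-byte writes; with
	 * tcp_autocorking enabled, TcpExtTCPAutoCorking (nstat) should
	 * grow on the sender while this runs.
	 */
	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		struct sockaddr_in dst = {
			.sin_family = AF_INET,
			.sin_port   = htons(12345),	/* placeholder port */
		};
		char buf[128];		/* matches the -m 128 runs above */
		int fd, i;

		if (argc < 2 || inet_pton(AF_INET, argv[1], &dst.sin_addr) != 1) {
			fprintf(stderr, "usage: %s <server-ipv4>\n", argv[0]);
			return 1;
		}
		fd = socket(AF_INET, SOCK_STREAM, 0);
		if (fd < 0 || connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
			perror("connect");
			return 1;
		}
		memset(buf, 'x', sizeof(buf));
		/* Consecutive small writes: while a prior packet of this
		 * flow sits in Qdisc/NIC queues, tcp_push() defers the
		 * flush and later writes can land in the same skb.
		 */
		for (i = 0; i < 1000000; i++) {
			if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
				perror("write");
				break;
			}
		}
		close(fd);
		return 0;
	}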
---
 Documentation/networking/ip-sysctl.txt | 10 ++++++++
 include/net/tcp.h                      |  1 +
 include/uapi/linux/snmp.h              |  1 +
 net/ipv4/proc.c                        |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  9 +++++++
 net/ipv4/tcp.c                         | 63 ++++++++++++++++---------
 6 files changed, 72 insertions(+), 13 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 3c12d9a7ed00..12ba2cd9f03d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -156,6 +156,16 @@ tcp_app_win - INTEGER
 	buffer. Value 0 is special, it means that nothing is reserved.
 	Default: 31
 
+tcp_autocorking - BOOLEAN
+	Enable TCP auto corking :
+	When applications do consecutive small write()/sendmsg() system calls,
+	we try to coalesce these small writes as much as possible, to lower
+	total amount of sent packets. This is done if at least one prior
+	packet for the flow is waiting in Qdisc queues or device transmit
+	queue. Applications can still use TCP_CORK for optimal behavior
+	when they know how/when to uncork their sockets.
+	Default : 1
+
 tcp_available_congestion_control - STRING
 	Shows the available congestion control choices that are registered.
 	More congestion control algorithms may be available as modules,
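The new documentation text ends by pointing at TCP_CORK for
applications that know their message boundaries. For reference,
explicit corking is plain setsockopt() usage (standard sockets API,
independent of this patch):

	#include <netinet/tcp.h>
	#include <sys/socket.h>
	#include <unistd.h>

	/* Cork, write the pieces, uncork to flush: the kernel is free
	 * to coalesce hdr and body into full-sized segments in between.
	 */
	static void send_corked(int fd, const void *hdr, size_t hlen,
				const void *body, size_t blen)
	{
		int on = 1, off = 0;

		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		write(fd, hdr, hlen);	/* held back by the cork */
		write(fd, body, blen);	/* coalesced with the header */
		setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	}

Unlike autocorking, this never depends on queue occupancy: the flush
happens exactly when the application uncorks.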
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 70e55d200610..f7e1ab2139ef 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -282,6 +282,7 @@ extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
+extern int sysctl_tcp_autocorking;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 1bdb4a39d1e1..bbaba22f2d1b 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -258,6 +258,7 @@ enum
 	LINUX_MIB_TCPFASTOPENCOOKIEREQD,	/* TCPFastOpenCookieReqd */
 	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES,	/* TCPSpuriousRtxHostQueues */
 	LINUX_MIB_BUSYPOLLRXPACKETS,		/* BusyPollRxPackets */
+	LINUX_MIB_TCPAUTOCORKING,		/* TCPAutoCorking */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4a0335854b89..8ecd7ad959b4 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -279,6 +279,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
 	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
+	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3d69ec8dac57..38c8ec90ff68 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -733,6 +733,15 @@ static struct ctl_table ipv4_table[] = {
 		.extra2		= &gso_max_segs,
 	},
 	{
+		.procname	= "tcp_autocorking",
+		.data		= &sysctl_tcp_autocorking,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c4638e6f0238..0ca87547becb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -285,6 +285,8 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
 int sysctl_tcp_min_tso_segs __read_mostly = 2;
 
+int sysctl_tcp_autocorking __read_mostly = 1;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -619,19 +621,52 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 	tp->snd_up = tp->write_seq;
 }
 
-static inline void tcp_push(struct sock *sk, int flags, int mss_now,
-			    int nonagle)
+/* If a not yet filled skb is pushed, do not send it if
+ * we have packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last check prevents
+ * autocorking if we only have an ACK in Qdisc/NIC queues.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+				int size_goal)
 {
-	if (tcp_send_head(sk)) {
-		struct tcp_sock *tp = tcp_sk(sk);
+	return skb->len < size_goal &&
+	       sysctl_tcp_autocorking &&
+	       atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+}
+
+static void tcp_push(struct sock *sk, int flags, int mss_now,
+		     int nonagle, int size_goal)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
 
-		if (!(flags & MSG_MORE) || forced_push(tp))
-			tcp_mark_push(tp, tcp_write_queue_tail(sk));
+	if (!tcp_send_head(sk))
+		return;
+
+	skb = tcp_write_queue_tail(sk);
+	if (!(flags & MSG_MORE) || forced_push(tp))
+		tcp_mark_push(tp, skb);
+
+	tcp_mark_urg(tp, flags);
+
+	if (tcp_should_autocork(sk, skb, size_goal)) {
 
-		tcp_mark_urg(tp, flags);
-		__tcp_push_pending_frames(sk, mss_now,
-					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
+		/* avoid atomic op if TSQ_THROTTLED bit is already set */
+		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+		}
+		return;
 	}
+
+	if (flags & MSG_MORE)
+		nonagle = TCP_NAGLE_CORK;
+
+	__tcp_push_pending_frames(sk, mss_now, nonagle);
 }
 
 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
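The NET_INC_STATS() bump in the autocork path above is what shows up as
TcpExtTCPAutoCorking in nstat. A small sketch reading the counter
directly from /proc/net/netstat (hypothetical helper, not part of this
patch; it assumes that file's usual paired "TcpExt:" header/value
lines):

	#define _POSIX_C_SOURCE 200809L	/* strtok_r() */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char hdr[8192], val[8192];
		char *h, *v, *hs, *vs;
		FILE *f = fopen("/proc/net/netstat", "r");

		if (!f)
			return 1;
		/* Counters come in line pairs: a "TcpExt:" line of names,
		 * then a "TcpExt:" line of values in matching columns.
		 */
		while (fgets(hdr, sizeof(hdr), f) && fgets(val, sizeof(val), f)) {
			if (strncmp(hdr, "TcpExt:", 7))
				continue;
			h = strtok_r(hdr, " \n", &hs);
			v = strtok_r(val, " \n", &vs);
			while (h && v) {
				if (!strcmp(h, "TCPAutoCorking")) {
					printf("TCPAutoCorking: %s\n", v);
					fclose(f);
					return 0;
				}
				h = strtok_r(NULL, " \n", &hs);
				v = strtok_r(NULL, " \n", &vs);
			}
		}
		fclose(f);
		return 1;
	}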
@@ -934,7 +969,8 @@ new_segment:
 wait_for_sndbuf:
 	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-	tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+	tcp_push(sk, flags & ~MSG_MORE, mss_now,
+		 TCP_NAGLE_PUSH, size_goal);
 
 	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 		goto do_error;
@@ -944,7 +980,7 @@ wait_for_memory:
 
 out:
 	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
-		tcp_push(sk, flags, mss_now, tp->nonagle);
+		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	return copied;
 
 do_error:
@@ -1225,7 +1261,8 @@ wait_for_sndbuf:
 	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
 	if (copied)
-		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+		tcp_push(sk, flags & ~MSG_MORE, mss_now,
+			 TCP_NAGLE_PUSH, size_goal);
 
 	if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 		goto do_error;
@@ -1236,7 +1273,7 @@ wait_for_memory:
 
 out:
 	if (copied)
-		tcp_push(sk, flags, mss_now, tp->nonagle);
+		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 	release_sock(sk);
 	return copied + copied_syn;
 