diff options
author | Eric Dumazet <edumazet@google.com> | 2013-12-06 01:36:05 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-12-06 12:51:41 -0500 |
commit | f54b311142a92ea2e42598e347b84e1655caf8e3 (patch) | |
tree | 64976e75ab165d657053f025bf6776c0a07d2a55 | |
parent | d8535a0a02d6e772122339c131151dfaf007866b (diff) |
tcp: auto corking
With the introduction of TCP Small Queues, TSO auto sizing, and TCP
pacing, we can implement Automatic Corking in the kernel, to help
applications doing small write()/sendmsg() to TCP sockets.
The idea is to change tcp_push() to check whether the current skb payload is
under the skb optimal size (a multiple of MSS bytes).
If under 'size_goal', and at least one packet is still in Qdisc or
NIC TX queues, set the TCP Small Queue Throttled bit, so that the push
will be delayed up to TX completion time.
This delay might allow the application to coalesce more bytes
in the skb in following write()/sendmsg()/sendfile() system calls.
The exact duration of the delay depends on the dynamics
of the system, and might be zero if no packet for this flow
is actually held in the Qdisc or NIC TX ring.
Using FQ/pacing is a way to increase the probability of
autocorking being triggered.
Add a new sysctl (/proc/sys/net/ipv4/tcp_autocorking) to control
this feature and default it to 1 (enabled).
Add a new SNMP counter : nstat -a | grep TcpExtTCPAutoCorking
This counter is incremented every time we detect that an skb is underused
and its flush is deferred.
Tested:
Interesting effects when using line buffered commands under ssh.
Excellent performance results in terms of CPU usage and total throughput.
lpq83:~# echo 1 >/proc/sys/net/ipv4/tcp_autocorking
lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128
9410.39
Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128':
35209.439626 task-clock # 2.901 CPUs utilized
2,294 context-switches # 0.065 K/sec
101 CPU-migrations # 0.003 K/sec
4,079 page-faults # 0.116 K/sec
97,923,241,298 cycles # 2.781 GHz [83.31%]
51,832,908,236 stalled-cycles-frontend # 52.93% frontend cycles idle [83.30%]
25,697,986,603 stalled-cycles-backend # 26.24% backend cycles idle [66.70%]
102,225,978,536 instructions # 1.04 insns per cycle
# 0.51 stalled cycles per insn [83.38%]
18,657,696,819 branches # 529.906 M/sec [83.29%]
91,679,646 branch-misses # 0.49% of all branches [83.40%]
12.136204899 seconds time elapsed
lpq83:~# echo 0 >/proc/sys/net/ipv4/tcp_autocorking
lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128
6624.89
Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128':
40045.864494 task-clock # 3.301 CPUs utilized
171 context-switches # 0.004 K/sec
53 CPU-migrations # 0.001 K/sec
4,080 page-faults # 0.102 K/sec
111,340,458,645 cycles # 2.780 GHz [83.34%]
61,778,039,277 stalled-cycles-frontend # 55.49% frontend cycles idle [83.31%]
29,295,522,759 stalled-cycles-backend # 26.31% backend cycles idle [66.67%]
108,654,349,355 instructions # 0.98 insns per cycle
# 0.57 stalled cycles per insn [83.34%]
19,552,170,748 branches # 488.244 M/sec [83.34%]
157,875,417 branch-misses # 0.81% of all branches [83.34%]
12.130267788 seconds time elapsed
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 10 | ||||
-rw-r--r-- | include/net/tcp.h | 1 | ||||
-rw-r--r-- | include/uapi/linux/snmp.h | 1 | ||||
-rw-r--r-- | net/ipv4/proc.c | 1 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 63 |
6 files changed, 72 insertions, 13 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 3c12d9a7ed00..12ba2cd9f03d 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -156,6 +156,16 @@ tcp_app_win - INTEGER | |||
156 | buffer. Value 0 is special, it means that nothing is reserved. | 156 | buffer. Value 0 is special, it means that nothing is reserved. |
157 | Default: 31 | 157 | Default: 31 |
158 | 158 | ||
159 | tcp_autocorking - BOOLEAN | ||
160 | Enable TCP auto corking : | ||
161 | When applications do consecutive small write()/sendmsg() system calls, | ||
162 | we try to coalesce these small writes as much as possible, to lower | ||
163 | total amount of sent packets. This is done if at least one prior | ||
164 | packet for the flow is waiting in Qdisc queues or device transmit | ||
165 | queue. Applications can still use TCP_CORK for optimal behavior | ||
166 | when they know how/when to uncork their sockets. | ||
167 | Default : 1 | ||
168 | |||
159 | tcp_available_congestion_control - STRING | 169 | tcp_available_congestion_control - STRING |
160 | Shows the available congestion control choices that are registered. | 170 | Shows the available congestion control choices that are registered. |
161 | More congestion control algorithms may be available as modules, | 171 | More congestion control algorithms may be available as modules, |
diff --git a/include/net/tcp.h b/include/net/tcp.h index 70e55d200610..f7e1ab2139ef 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -282,6 +282,7 @@ extern int sysctl_tcp_limit_output_bytes; | |||
282 | extern int sysctl_tcp_challenge_ack_limit; | 282 | extern int sysctl_tcp_challenge_ack_limit; |
283 | extern unsigned int sysctl_tcp_notsent_lowat; | 283 | extern unsigned int sysctl_tcp_notsent_lowat; |
284 | extern int sysctl_tcp_min_tso_segs; | 284 | extern int sysctl_tcp_min_tso_segs; |
285 | extern int sysctl_tcp_autocorking; | ||
285 | 286 | ||
286 | extern atomic_long_t tcp_memory_allocated; | 287 | extern atomic_long_t tcp_memory_allocated; |
287 | extern struct percpu_counter tcp_sockets_allocated; | 288 | extern struct percpu_counter tcp_sockets_allocated; |
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1bdb4a39d1e1..bbaba22f2d1b 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h | |||
@@ -258,6 +258,7 @@ enum | |||
258 | LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ | 258 | LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ |
259 | LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ | 259 | LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ |
260 | LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ | 260 | LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ |
261 | LINUX_MIB_TCPAUTOCORKING, /* TCPAutoCorking */ | ||
261 | __LINUX_MIB_MAX | 262 | __LINUX_MIB_MAX |
262 | }; | 263 | }; |
263 | 264 | ||
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4a0335854b89..8ecd7ad959b4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -279,6 +279,7 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
279 | SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), | 279 | SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), |
280 | SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), | 280 | SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), |
281 | SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), | 281 | SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), |
282 | SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), | ||
282 | SNMP_MIB_SENTINEL | 283 | SNMP_MIB_SENTINEL |
283 | }; | 284 | }; |
284 | 285 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3d69ec8dac57..38c8ec90ff68 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -733,6 +733,15 @@ static struct ctl_table ipv4_table[] = { | |||
733 | .extra2 = &gso_max_segs, | 733 | .extra2 = &gso_max_segs, |
734 | }, | 734 | }, |
735 | { | 735 | { |
736 | .procname = "tcp_autocorking", | ||
737 | .data = &sysctl_tcp_autocorking, | ||
738 | .maxlen = sizeof(int), | ||
739 | .mode = 0644, | ||
740 | .proc_handler = proc_dointvec_minmax, | ||
741 | .extra1 = &zero, | ||
742 | .extra2 = &one, | ||
743 | }, | ||
744 | { | ||
736 | .procname = "udp_mem", | 745 | .procname = "udp_mem", |
737 | .data = &sysctl_udp_mem, | 746 | .data = &sysctl_udp_mem, |
738 | .maxlen = sizeof(sysctl_udp_mem), | 747 | .maxlen = sizeof(sysctl_udp_mem), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4638e6f0238..0ca87547becb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -285,6 +285,8 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | |||
285 | 285 | ||
286 | int sysctl_tcp_min_tso_segs __read_mostly = 2; | 286 | int sysctl_tcp_min_tso_segs __read_mostly = 2; |
287 | 287 | ||
288 | int sysctl_tcp_autocorking __read_mostly = 1; | ||
289 | |||
288 | struct percpu_counter tcp_orphan_count; | 290 | struct percpu_counter tcp_orphan_count; |
289 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 291 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
290 | 292 | ||
@@ -619,19 +621,52 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) | |||
619 | tp->snd_up = tp->write_seq; | 621 | tp->snd_up = tp->write_seq; |
620 | } | 622 | } |
621 | 623 | ||
622 | static inline void tcp_push(struct sock *sk, int flags, int mss_now, | 624 | /* If a not yet filled skb is pushed, do not send it if |
623 | int nonagle) | 625 | * we have packets in Qdisc or NIC queues : |
626 | * Because TX completion will happen shortly, it gives a chance | ||
627 | * to coalesce future sendmsg() payload into this skb, without | ||
628 | * need for a timer, and with no latency trade off. | ||
629 | * As packets containing data payload have a bigger truesize | ||
630 | * than pure acks (dataless) packets, the last check prevents | ||
631 | * autocorking if we only have an ACK in Qdisc/NIC queues. | ||
632 | */ | ||
633 | static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, | ||
634 | int size_goal) | ||
624 | { | 635 | { |
625 | if (tcp_send_head(sk)) { | 636 | return skb->len < size_goal && |
626 | struct tcp_sock *tp = tcp_sk(sk); | 637 | sysctl_tcp_autocorking && |
638 | atomic_read(&sk->sk_wmem_alloc) > skb->truesize; | ||
639 | } | ||
640 | |||
641 | static void tcp_push(struct sock *sk, int flags, int mss_now, | ||
642 | int nonagle, int size_goal) | ||
643 | { | ||
644 | struct tcp_sock *tp = tcp_sk(sk); | ||
645 | struct sk_buff *skb; | ||
627 | 646 | ||
628 | if (!(flags & MSG_MORE) || forced_push(tp)) | 647 | if (!tcp_send_head(sk)) |
629 | tcp_mark_push(tp, tcp_write_queue_tail(sk)); | 648 | return; |
649 | |||
650 | skb = tcp_write_queue_tail(sk); | ||
651 | if (!(flags & MSG_MORE) || forced_push(tp)) | ||
652 | tcp_mark_push(tp, skb); | ||
653 | |||
654 | tcp_mark_urg(tp, flags); | ||
655 | |||
656 | if (tcp_should_autocork(sk, skb, size_goal)) { | ||
630 | 657 | ||
631 | tcp_mark_urg(tp, flags); | 658 | /* avoid atomic op if TSQ_THROTTLED bit is already set */ |
632 | __tcp_push_pending_frames(sk, mss_now, | 659 | if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { |
633 | (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); | 660 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); |
661 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); | ||
662 | } | ||
663 | return; | ||
634 | } | 664 | } |
665 | |||
666 | if (flags & MSG_MORE) | ||
667 | nonagle = TCP_NAGLE_CORK; | ||
668 | |||
669 | __tcp_push_pending_frames(sk, mss_now, nonagle); | ||
635 | } | 670 | } |
636 | 671 | ||
637 | static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, | 672 | static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, |
@@ -934,7 +969,8 @@ new_segment: | |||
934 | wait_for_sndbuf: | 969 | wait_for_sndbuf: |
935 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 970 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
936 | wait_for_memory: | 971 | wait_for_memory: |
937 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | 972 | tcp_push(sk, flags & ~MSG_MORE, mss_now, |
973 | TCP_NAGLE_PUSH, size_goal); | ||
938 | 974 | ||
939 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 975 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
940 | goto do_error; | 976 | goto do_error; |
@@ -944,7 +980,7 @@ wait_for_memory: | |||
944 | 980 | ||
945 | out: | 981 | out: |
946 | if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) | 982 | if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) |
947 | tcp_push(sk, flags, mss_now, tp->nonagle); | 983 | tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); |
948 | return copied; | 984 | return copied; |
949 | 985 | ||
950 | do_error: | 986 | do_error: |
@@ -1225,7 +1261,8 @@ wait_for_sndbuf: | |||
1225 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 1261 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
1226 | wait_for_memory: | 1262 | wait_for_memory: |
1227 | if (copied) | 1263 | if (copied) |
1228 | tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); | 1264 | tcp_push(sk, flags & ~MSG_MORE, mss_now, |
1265 | TCP_NAGLE_PUSH, size_goal); | ||
1229 | 1266 | ||
1230 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 1267 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
1231 | goto do_error; | 1268 | goto do_error; |
@@ -1236,7 +1273,7 @@ wait_for_memory: | |||
1236 | 1273 | ||
1237 | out: | 1274 | out: |
1238 | if (copied) | 1275 | if (copied) |
1239 | tcp_push(sk, flags, mss_now, tp->nonagle); | 1276 | tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); |
1240 | release_sock(sk); | 1277 | release_sock(sk); |
1241 | return copied + copied_syn; | 1278 | return copied + copied_syn; |
1242 | 1279 | ||