aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2016-04-29 17:16:53 -0400
committerDavid S. Miller <davem@davemloft.net>2016-05-02 17:02:26 -0400
commitd41a69f1d390fa3f2546498103cdcd78b30676ff (patch)
tree6b277c34e05b64524f3863ef29d9c222ddf23579
parent5413d1babe8f10de13d72496c12b862eef8ba613 (diff)
tcp: make tcp_sendmsg() aware of socket backlog
Large sendmsg()/write() hold socket lock for the duration of the call, unless sk->sk_sndbuf limit is hit. This is bad because incoming packets are parked into socket backlog for a long time. Critical decisions like fast retransmit might be delayed. Receivers have to maintain a big out-of-order queue with additional cpu overhead, and also possible stalls in TX once windows are full.

Bidirectional flows are particularly hurt since the backlog can become quite big if the copy from user space triggers IO (page faults).

Some applications learnt to use sendmsg() (or sendmmsg()) with small chunks to avoid this issue.

Kernel should know better, right?

Add a generic sk_flush_backlog() helper and use it right before a new skb is allocated. Typically we put 64KB of payload per skb (unless MSG_EOR is requested) and checking socket backlog every 64KB gives good results.

As a matter of fact, tests with TSO/GSO disabled give very nice results, as we manage to keep a small write queue and smaller perceived rtt.

Note that sk_flush_backlog() maintains socket ownership, so is not equivalent to a {release_sock(sk); lock_sock(sk);}, to ensure implicit atomicity rules that sendmsg() was giving to (possibly buggy) applications. In this simple implementation, I chose to not call tcp_release_cb(), but we might consider this later.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/sock.h11
-rw-r--r--net/core/sock.c7
-rw-r--r--net/ipv4/tcp.c8
3 files changed, 24 insertions, 2 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 3df778ccaa82..1dbb1f9f7c1b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -926,6 +926,17 @@ void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);

929void __sk_flush_backlog(struct sock *sk);
930
931static inline bool sk_flush_backlog(struct sock *sk)
932{
933 if (unlikely(READ_ONCE(sk->sk_backlog.tail))) {
934 __sk_flush_backlog(sk);
935 return true;
936 }
937 return false;
938}
939
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);

struct request_sock_ops;
diff --git a/net/core/sock.c b/net/core/sock.c
index 70744dbb6c3f..f615e9391170 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2048,6 +2048,13 @@ static void __release_sock(struct sock *sk)
	sk->sk_backlog.len = 0;
}

2051void __sk_flush_backlog(struct sock *sk)
2052{
2053 spin_lock_bh(&sk->sk_lock.slock);
2054 __release_sock(sk);
2055 spin_unlock_bh(&sk->sk_lock.slock);
2056}
2057
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4787f86ae64c..b945c2b046c5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1136,11 +1136,12 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

1139 mss_now = tcp_send_mss(sk, &size_goal, flags);
1140
	/* Ok commence sending. */
	copied = 0;

1142restart:
1143 mss_now = tcp_send_mss(sk, &size_goal, flags);
1144
	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;
@@ -1166,6 +1167,9 @@ new_segment:
		if (!sk_stream_memory_free(sk))
			goto wait_for_sndbuf;

1170 if (sk_flush_backlog(sk))
1171 goto restart;
1172
		skb = sk_stream_alloc_skb(sk,
					  select_size(sk, sg),
					  sk->sk_allocation,