aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
authorSoheil Hassas Yeganeh <soheil@google.com>2018-05-01 15:39:15 -0400
committerDavid S. Miller <davem@davemloft.net>2018-05-01 18:56:29 -0400
commitb75eba76d3d72e2374fac999926dafef2997edd2 (patch)
tree43e3ae67eeeae91a7d17c68a40140840494fcd70 /net/ipv4/tcp.c
parentab85539eb39773b9a155f98dd42984528a9c6e6c (diff)
tcp: send in-queue bytes in cmsg upon read
Applications with many concurrent connections, high variance in receive queue length and tight memory bounds cannot allocate worst-case buffer size to drain sockets. Knowing the receive queue length, applications can optimize how they allocate buffers to read from the socket. The number of bytes pending on the socket is directly available through ioctl(FIONREAD/SIOCINQ) and can be approximated using getsockopt(MEMINFO) (rmem_alloc includes skb overheads in addition to application data). But both of these options add an extra syscall per recvmsg. Moreover, ioctl(FIONREAD/SIOCINQ) takes the socket lock. Add the TCP_INQ socket option to TCP. When this socket option is set, recvmsg() relays the number of bytes available on the socket for reading to the application via the TCP_CM_INQ control message. Calculate the number of bytes after releasing the socket lock to include the processed backlog, if any. To avoid an extra branch in the hot path of recvmsg() for this new control message, move all cmsg processing inside an existing branch for processing receive timestamps. Since the socket lock is not held when calculating the size of the receive queue, TCP_INQ is a hint. For example, it can overestimate the queue size by one byte, if FIN is received. With this method, applications can start reading from the socket using a small buffer, and then use larger buffers based on the remaining data when needed. V3 change-log: As suggested by David Miller, added loads with barrier to check whether we have multiple threads calling recvmsg in parallel. When that happens we lock the socket to calculate inq. V4 change-log: Removed inline from a static function. 
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Willem de Bruijn <willemb@google.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Neal Cardwell <ncardwell@google.com> Suggested-by: David Miller <davem@davemloft.net> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c43
1 files changed, 39 insertions, 4 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4028ddd14dd5..868ed74a76a8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1889,6 +1889,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
1889 } 1889 }
1890} 1890}
1891 1891
/*
 * tcp_inq_hint - estimate how many bytes are pending in the receive queue.
 * @sk: the TCP socket to inspect.
 *
 * Returns tp->rcv_nxt - tp->copied_seq as an int. The common path takes
 * lockless snapshots of both sequence numbers; only when the snapshot looks
 * inconsistent does it fall back to recomputing the value under the socket
 * lock. Per the accompanying commit message the result is a hint: it may,
 * for example, overestimate the queue by one byte once a FIN is received.
 */
1892static int tcp_inq_hint(struct sock *sk)
1893{
1894 const struct tcp_sock *tp = tcp_sk(sk);
 /* Snapshot both sequence numbers once each, without holding the lock. */
1895 u32 copied_seq = READ_ONCE(tp->copied_seq);
1896 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
1897 int inq;
1898
 /*
  * The u32 difference is stored in a signed int, so a snapshot in which
  * copied_seq is ahead of rcv_nxt shows up as a negative value.
  */
1899 inq = rcv_nxt - copied_seq;
 /*
  * A negative result, or copied_seq having changed since the first load
  * (the commit message attributes this to another thread calling recvmsg
  * in parallel), means the snapshot cannot be trusted: redo the
  * computation with the socket locked.
  */
1900 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
1901 lock_sock(sk);
1902 inq = tp->rcv_nxt - tp->copied_seq;
1903 release_sock(sk);
1904 }
1905 return inq;
1906}
1907
1892/* 1908/*
1893 * This routine copies from a sock struct into the user buffer. 1909 * This routine copies from a sock struct into the user buffer.
1894 * 1910 *
@@ -1905,13 +1921,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1905 u32 peek_seq; 1921 u32 peek_seq;
1906 u32 *seq; 1922 u32 *seq;
1907 unsigned long used; 1923 unsigned long used;
1908 int err; 1924 int err, inq;
1909 int target; /* Read at least this many bytes */ 1925 int target; /* Read at least this many bytes */
1910 long timeo; 1926 long timeo;
1911 struct sk_buff *skb, *last; 1927 struct sk_buff *skb, *last;
1912 u32 urg_hole = 0; 1928 u32 urg_hole = 0;
1913 struct scm_timestamping tss; 1929 struct scm_timestamping tss;
1914 bool has_tss = false; 1930 bool has_tss = false;
1931 bool has_cmsg;
1915 1932
1916 if (unlikely(flags & MSG_ERRQUEUE)) 1933 if (unlikely(flags & MSG_ERRQUEUE))
1917 return inet_recv_error(sk, msg, len, addr_len); 1934 return inet_recv_error(sk, msg, len, addr_len);
@@ -1926,6 +1943,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1926 if (sk->sk_state == TCP_LISTEN) 1943 if (sk->sk_state == TCP_LISTEN)
1927 goto out; 1944 goto out;
1928 1945
1946 has_cmsg = tp->recvmsg_inq;
1929 timeo = sock_rcvtimeo(sk, nonblock); 1947 timeo = sock_rcvtimeo(sk, nonblock);
1930 1948
1931 /* Urgent data needs to be handled specially. */ 1949 /* Urgent data needs to be handled specially. */
@@ -2112,6 +2130,7 @@ skip_copy:
2112 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2130 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2113 tcp_update_recv_tstamps(skb, &tss); 2131 tcp_update_recv_tstamps(skb, &tss);
2114 has_tss = true; 2132 has_tss = true;
2133 has_cmsg = true;
2115 } 2134 }
2116 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 2135 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2117 goto found_fin_ok; 2136 goto found_fin_ok;
@@ -2131,13 +2150,20 @@ skip_copy:
2131 * on connected socket. I was just happy when found this 8) --ANK 2150 * on connected socket. I was just happy when found this 8) --ANK
2132 */ 2151 */
2133 2152
2134 if (has_tss)
2135 tcp_recv_timestamp(msg, sk, &tss);
2136
2137 /* Clean up data we have read: This will do ACK frames. */ 2153 /* Clean up data we have read: This will do ACK frames. */
2138 tcp_cleanup_rbuf(sk, copied); 2154 tcp_cleanup_rbuf(sk, copied);
2139 2155
2140 release_sock(sk); 2156 release_sock(sk);
2157
2158 if (has_cmsg) {
2159 if (has_tss)
2160 tcp_recv_timestamp(msg, sk, &tss);
2161 if (tp->recvmsg_inq) {
2162 inq = tcp_inq_hint(sk);
2163 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2164 }
2165 }
2166
2141 return copied; 2167 return copied;
2142 2168
2143out: 2169out:
@@ -3006,6 +3032,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3006 tp->notsent_lowat = val; 3032 tp->notsent_lowat = val;
3007 sk->sk_write_space(sk); 3033 sk->sk_write_space(sk);
3008 break; 3034 break;
3035 case TCP_INQ:
3036 if (val > 1 || val < 0)
3037 err = -EINVAL;
3038 else
3039 tp->recvmsg_inq = val;
3040 break;
3009 default: 3041 default:
3010 err = -ENOPROTOOPT; 3042 err = -ENOPROTOOPT;
3011 break; 3043 break;
@@ -3431,6 +3463,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3431 case TCP_NOTSENT_LOWAT: 3463 case TCP_NOTSENT_LOWAT:
3432 val = tp->notsent_lowat; 3464 val = tp->notsent_lowat;
3433 break; 3465 break;
3466 case TCP_INQ:
3467 val = tp->recvmsg_inq;
3468 break;
3434 case TCP_SAVE_SYN: 3469 case TCP_SAVE_SYN:
3435 val = tp->save_syn; 3470 val = tp->save_syn;
3436 break; 3471 break;