aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2013-07-22 23:27:07 -0400
committerDavid S. Miller <davem@davemloft.net>2013-07-24 20:54:48 -0400
commitc9bee3b7fdecb0c1d070c7b54113b3bdfb9a3d36 (patch)
tree6bcaa39bbd312c070bf31b703df4d6bb2a44fea9
parent64dc61306ce7da370833289739e2f52dfc6b37ba (diff)
tcp: TCP_NOTSENT_LOWAT socket option
Idea of this patch is to add optional limitation of number of unsent bytes in TCP sockets, to reduce usage of kernel memory. TCP receiver might announce a big window, and TCP sender autotuning might allow a large amount of bytes in write queue, but this has little performance impact if a large part of this buffering is wasted: Write queue needs to be large only to deal with large BDP, not necessarily to cope with scheduling delays (incoming ACKS make room for the application to queue more bytes). For most workloads, using a value of 128 KB or less is OK to give applications enough time to react to POLLOUT events in time (or being awakened in a blocking sendmsg()). This patch adds two ways to set the limit: 1) Per socket option TCP_NOTSENT_LOWAT 2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets not using TCP_NOTSENT_LOWAT socket option (or setting a zero value) Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect. This changes poll()/select()/epoll() to report POLLOUT only if number of unsent bytes is below tp->notsent_lowat. Note this might increase number of sendmsg()/sendfile() calls when using non blocking sockets, and increase number of context switches for blocking sockets.
Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is defined as : Specify the minimum number of bytes in the buffer until the socket layer will pass the data to the protocol) Tested: netperf sessions, and watching /proc/net/protocols "memory" column for TCP With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458) lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 45458 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 45458 no 208 yes kernel y y y y y y y y y y y y y n y y y y y lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 20567 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 20567 no 208 yes kernel y y y y y y y y y y y y y n y y y y y Using 128KB has no bad effect on the throughput or cpu usage of a single flow, although there is an increase of context switches. A bonus is that we hold socket lock for a shorter amount of time and should improve latencies of ACK processing. lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. 
Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1651584 6291456 16384 20.00 17447.90 10^6bits/s 3.13 S -1.00 U 0.353 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 412,514 context-switches 200.034645535 seconds time elapsed lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1593240 6291456 16384 20.00 17321.16 10^6bits/s 3.35 S -1.00 U 0.381 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 2,675,818 context-switches 200.029651391 seconds time elapsed Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Acked-By: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt13
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/sock.h19
-rw-r--r--include/net/tcp.h14
-rw-r--r--include/uapi/linux/tcp.h1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c7
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_output.c3
-rw-r--r--net/ipv6/tcp_ipv6.c1
10 files changed, 61 insertions, 6 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 10742902146f..53cea9bcb14c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -516,6 +516,19 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max
516 this value is ignored. 516 this value is ignored.
517 Default: between 64K and 4MB, depending on RAM size. 517 Default: between 64K and 4MB, depending on RAM size.
518 518
519tcp_notsent_lowat - UNSIGNED INTEGER
520 A TCP socket can control the amount of unsent bytes in its write queue,
521 thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll()
522 report POLLOUT events if the amount of unsent bytes is below a per
523 socket value, and if the write queue is not full. sendmsg() will
524 also not add new buffers if the limit is hit.
525
526 This global variable controls the amount of unsent data for
527 sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change
528 to the global variable has immediate effect.
529
530 Default: UINT_MAX (0xFFFFFFFF)
531
519tcp_workaround_signed_windows - BOOLEAN 532tcp_workaround_signed_windows - BOOLEAN
520 If set, assume no receipt of a window scaling option means the 533 If set, assume no receipt of a window scaling option means the
521 remote TCP is broken and treats the window as a signed quantity. 534 remote TCP is broken and treats the window as a signed quantity.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 472120b4fac5..9640803a17a7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -238,6 +238,7 @@ struct tcp_sock {
238 238
239 u32 rcv_wnd; /* Current receiver window */ 239 u32 rcv_wnd; /* Current receiver window */
240 u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ 240 u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
241 u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
241 u32 pushed_seq; /* Last pushed seq, required to talk to windows */ 242 u32 pushed_seq; /* Last pushed seq, required to talk to windows */
242 u32 lost_out; /* Lost packets */ 243 u32 lost_out; /* Lost packets */
243 u32 sacked_out; /* SACK'd packets */ 244 u32 sacked_out; /* SACK'd packets */
diff --git a/include/net/sock.h b/include/net/sock.h
index d0b5fdee50a2..b9f2b095b1ab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk)
746 746
747extern void sk_stream_write_space(struct sock *sk); 747extern void sk_stream_write_space(struct sock *sk);
748 748
749static inline bool sk_stream_memory_free(const struct sock *sk)
750{
751 return sk->sk_wmem_queued < sk->sk_sndbuf;
752}
753
754/* OOB backlog add */ 749/* OOB backlog add */
755static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) 750static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
756{ 751{
@@ -950,6 +945,7 @@ struct proto {
950 unsigned int inuse_idx; 945 unsigned int inuse_idx;
951#endif 946#endif
952 947
948 bool (*stream_memory_free)(const struct sock *sk);
953 /* Memory pressure */ 949 /* Memory pressure */
954 void (*enter_memory_pressure)(struct sock *sk); 950 void (*enter_memory_pressure)(struct sock *sk);
955 atomic_long_t *memory_allocated; /* Current allocated memory. */ 951 atomic_long_t *memory_allocated; /* Current allocated memory. */
@@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto,
1088} 1084}
1089#endif 1085#endif
1090 1086
1087static inline bool sk_stream_memory_free(const struct sock *sk)
1088{
1089 if (sk->sk_wmem_queued >= sk->sk_sndbuf)
1090 return false;
1091
1092 return sk->sk_prot->stream_memory_free ?
1093 sk->sk_prot->stream_memory_free(sk) : true;
1094}
1095
1091static inline bool sk_stream_is_writeable(const struct sock *sk) 1096static inline bool sk_stream_is_writeable(const struct sock *sk)
1092{ 1097{
1093 return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk); 1098 return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
1099 sk_stream_memory_free(sk);
1094} 1100}
1095 1101
1102
1096static inline bool sk_has_memory_pressure(const struct sock *sk) 1103static inline bool sk_has_memory_pressure(const struct sock *sk)
1097{ 1104{
1098 return sk->sk_prot->memory_pressure != NULL; 1105 return sk->sk_prot->memory_pressure != NULL;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c5868471abae..18fc999dae3c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack;
284extern int sysctl_tcp_early_retrans; 284extern int sysctl_tcp_early_retrans;
285extern int sysctl_tcp_limit_output_bytes; 285extern int sysctl_tcp_limit_output_bytes;
286extern int sysctl_tcp_challenge_ack_limit; 286extern int sysctl_tcp_challenge_ack_limit;
287extern unsigned int sysctl_tcp_notsent_lowat;
287 288
288extern atomic_long_t tcp_memory_allocated; 289extern atomic_long_t tcp_memory_allocated;
289extern struct percpu_counter tcp_sockets_allocated; 290extern struct percpu_counter tcp_sockets_allocated;
@@ -1539,6 +1540,19 @@ extern int tcp_gro_complete(struct sk_buff *skb);
1539extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, 1540extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr,
1540 __be32 daddr); 1541 __be32 daddr);
1541 1542
1543static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
1544{
1545 return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat;
1546}
1547
1548static inline bool tcp_stream_memory_free(const struct sock *sk)
1549{
1550 const struct tcp_sock *tp = tcp_sk(sk);
1551 u32 notsent_bytes = tp->write_seq - tp->snd_nxt;
1552
1553 return notsent_bytes < tcp_notsent_lowat(tp);
1554}
1555
1542#ifdef CONFIG_PROC_FS 1556#ifdef CONFIG_PROC_FS
1543extern int tcp4_proc_init(void); 1557extern int tcp4_proc_init(void);
1544extern void tcp4_proc_exit(void); 1558extern void tcp4_proc_exit(void);
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8d776ebc4829..377f1e59411d 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -111,6 +111,7 @@ enum {
111#define TCP_REPAIR_OPTIONS 22 111#define TCP_REPAIR_OPTIONS 22
112#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ 112#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
113#define TCP_TIMESTAMP 24 113#define TCP_TIMESTAMP 24
114#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
114 115
115struct tcp_repair_opt { 116struct tcp_repair_opt {
116 __u32 opt_code; 117 __u32 opt_code;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2c123c44d69..69ed203802da 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -555,6 +555,13 @@ static struct ctl_table ipv4_table[] = {
555 .extra1 = &one, 555 .extra1 = &one,
556 }, 556 },
557 { 557 {
558 .procname = "tcp_notsent_lowat",
559 .data = &sysctl_tcp_notsent_lowat,
560 .maxlen = sizeof(sysctl_tcp_notsent_lowat),
561 .mode = 0644,
562 .proc_handler = proc_dointvec,
563 },
564 {
558 .procname = "tcp_rmem", 565 .procname = "tcp_rmem",
559 .data = &sysctl_tcp_rmem, 566 .data = &sysctl_tcp_rmem,
560 .maxlen = sizeof(sysctl_tcp_rmem), 567 .maxlen = sizeof(sysctl_tcp_rmem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5eca9060bb8e..c27e81392398 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2631,6 +2631,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2631 else 2631 else
2632 tp->tsoffset = val - tcp_time_stamp; 2632 tp->tsoffset = val - tcp_time_stamp;
2633 break; 2633 break;
2634 case TCP_NOTSENT_LOWAT:
2635 tp->notsent_lowat = val;
2636 sk->sk_write_space(sk);
2637 break;
2634 default: 2638 default:
2635 err = -ENOPROTOOPT; 2639 err = -ENOPROTOOPT;
2636 break; 2640 break;
@@ -2847,6 +2851,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2847 case TCP_TIMESTAMP: 2851 case TCP_TIMESTAMP:
2848 val = tcp_time_stamp + tp->tsoffset; 2852 val = tcp_time_stamp + tp->tsoffset;
2849 break; 2853 break;
2854 case TCP_NOTSENT_LOWAT:
2855 val = tp->notsent_lowat;
2856 break;
2850 default: 2857 default:
2851 return -ENOPROTOOPT; 2858 return -ENOPROTOOPT;
2852 } 2859 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e3f129df0eb..2a5d5c469d17 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2800,6 +2800,7 @@ struct proto tcp_prot = {
2800 .unhash = inet_unhash, 2800 .unhash = inet_unhash,
2801 .get_port = inet_csk_get_port, 2801 .get_port = inet_csk_get_port,
2802 .enter_memory_pressure = tcp_enter_memory_pressure, 2802 .enter_memory_pressure = tcp_enter_memory_pressure,
2803 .stream_memory_free = tcp_stream_memory_free,
2803 .sockets_allocated = &tcp_sockets_allocated, 2804 .sockets_allocated = &tcp_sockets_allocated,
2804 .orphan_count = &tcp_orphan_count, 2805 .orphan_count = &tcp_orphan_count,
2805 .memory_allocated = &tcp_memory_allocated, 2806 .memory_allocated = &tcp_memory_allocated,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 92fde8d1aa82..884efff5b531 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
65/* By default, RFC2861 behavior. */ 65/* By default, RFC2861 behavior. */
66int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 66int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67 67
68unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
69EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
70
68static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
69 int push_one, gfp_t gfp); 72 int push_one, gfp_t gfp);
70 73
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 80fe69ef2188..b792e870686b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1924,6 +1924,7 @@ struct proto tcpv6_prot = {
1924 .unhash = inet_unhash, 1924 .unhash = inet_unhash,
1925 .get_port = inet_csk_get_port, 1925 .get_port = inet_csk_get_port,
1926 .enter_memory_pressure = tcp_enter_memory_pressure, 1926 .enter_memory_pressure = tcp_enter_memory_pressure,
1927 .stream_memory_free = tcp_stream_memory_free,
1927 .sockets_allocated = &tcp_sockets_allocated, 1928 .sockets_allocated = &tcp_sockets_allocated,
1928 .memory_allocated = &tcp_memory_allocated, 1929 .memory_allocated = &tcp_memory_allocated,
1929 .memory_pressure = &tcp_memory_pressure, 1930 .memory_pressure = &tcp_memory_pressure,