aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Maciej Żenczykowski <maze@google.com> 2018-06-03 13:41:17 -0400
committer David S. Miller <davem@davemloft.net> 2018-06-04 17:13:35 -0400
commit 79e9fed460385a3d8ba0b5782e9e74405cb199b1 (patch)
tree ed08428854481e17c476dcaf2f2ab8ddf5038d45
parent 39dbc646fd2c67ee9b71450ce172cbd714d4e7fb (diff)
net-tcp: extend tcp_tw_reuse sysctl to enable loopback only optimization
This changes the /proc/sys/net/ipv4/tcp_tw_reuse from a boolean to an integer. It now takes the values 0, 1 and 2, where 0 and 1 behave as before, while 2 enables timewait socket reuse only for sockets that we can prove are loopback connections: i.e. bound to 'lo' interface or where one of source or destination IPs is 127.0.0.0/8, ::ffff:127.0.0.0/104 or ::1. This enables quicker reuse of ephemeral ports for loopback connections - where tcp_tw_reuse is 100% safe from a protocol perspective (this assumes no artificially induced packet loss on 'lo'). This also makes establishing many loopback connections *much* faster (allocating ports out of the first half of the ephemeral port range is significantly faster than allocating from the second half). Without this change, in a 32K ephemeral port space my sample program (it just establishes and closes [::1]:ephemeral -> [::1]:server_port connections in a tight loop) fails after 32765 connections in 24 seconds. With it enabled 50000 connections only take 4.7 seconds. This is particularly problematic for IPv6 where we only have one local address and cannot play tricks with varying source IP from 127.0.0.0/8 pool. Signed-off-by: Maciej Żenczykowski <maze@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Wei Wang <weiwan@google.com> Change-Id: I0377961749979d0301b7b62871a32a4b34b654e1 Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt10
-rw-r--r--net/ipv4/sysctl_net_ipv4.c5
-rw-r--r--net/ipv4/tcp_ipv4.c35
3 files changed, 43 insertions, 7 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 924bd51327b7..6841c74eac00 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -667,11 +667,15 @@ tcp_tso_win_divisor - INTEGER
667 building larger TSO frames. 667 building larger TSO frames.
668 Default: 3 668 Default: 3
669 669
670tcp_tw_reuse - BOOLEAN 670tcp_tw_reuse - INTEGER
671 Allow to reuse TIME-WAIT sockets for new connections when it is 671 Enable reuse of TIME-WAIT sockets for new connections when it is
672 safe from protocol viewpoint. Default value is 0. 672 safe from protocol viewpoint.
673 0 - disable
674 1 - global enable
675 2 - enable for loopback traffic only
673 It should not be changed without advice/request of technical 676 It should not be changed without advice/request of technical
674 experts. 677 experts.
678 Default: 2
675 679
676tcp_window_scaling - BOOLEAN 680tcp_window_scaling - BOOLEAN
677 Enable window scaling as defined in RFC1323. 681 Enable window scaling as defined in RFC1323.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d2eed3ddcb0a..d06247ba08b2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -30,6 +30,7 @@
30 30
31static int zero; 31static int zero;
32static int one = 1; 32static int one = 1;
33static int two = 2;
33static int four = 4; 34static int four = 4;
34static int thousand = 1000; 35static int thousand = 1000;
35static int gso_max_segs = GSO_MAX_SEGS; 36static int gso_max_segs = GSO_MAX_SEGS;
@@ -845,7 +846,9 @@ static struct ctl_table ipv4_net_table[] = {
845 .data = &init_net.ipv4.sysctl_tcp_tw_reuse, 846 .data = &init_net.ipv4.sysctl_tcp_tw_reuse,
846 .maxlen = sizeof(int), 847 .maxlen = sizeof(int),
847 .mode = 0644, 848 .mode = 0644,
848 .proc_handler = proc_dointvec 849 .proc_handler = proc_dointvec_minmax,
850 .extra1 = &zero,
851 .extra2 = &two,
849 }, 852 },
850 { 853 {
851 .procname = "tcp_max_tw_buckets", 854 .procname = "tcp_max_tw_buckets",
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 749b0ef9f405..633963e228bc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
110 110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) 111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{ 112{
113 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); 114 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk); 115 struct tcp_sock *tp = tcp_sk(sk);
116 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117
118 if (reuse == 2) {
119 /* Still does not detect *everything* that goes through
120 * lo, since we require a loopback src or dst address
121 * or direct binding to 'lo' interface.
122 */
123 bool loopback = false;
124 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 loopback = true;
126#if IS_ENABLED(CONFIG_IPV6)
127 if (tw->tw_family == AF_INET6) {
128 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 loopback = true;
135 } else
136#endif
137 {
138 if (ipv4_is_loopback(tw->tw_daddr) ||
139 ipv4_is_loopback(tw->tw_rcv_saddr))
140 loopback = true;
141 }
142 if (!loopback)
143 reuse = 0;
144 }
115 145
116 /* With PAWS, it is safe from the viewpoint 146 /* With PAWS, it is safe from the viewpoint
117 of data integrity. Even without PAWS it is safe provided sequence 147 of data integrity. Even without PAWS it is safe provided sequence
@@ -125,8 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
125 and use initial timestamp retrieved from peer table. 155 and use initial timestamp retrieved from peer table.
126 */ 156 */
127 if (tcptw->tw_ts_recent_stamp && 157 if (tcptw->tw_ts_recent_stamp &&
128 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse && 158 (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
129 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; 159 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 if (tp->write_seq == 0) 160 if (tp->write_seq == 0)
132 tp->write_seq = 1; 161 tp->write_seq = 1;
@@ -2529,7 +2558,7 @@ static int __net_init tcp_sk_init(struct net *net)
2529 net->ipv4.sysctl_tcp_orphan_retries = 0; 2558 net->ipv4.sysctl_tcp_orphan_retries = 0;
2530 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; 2559 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2531 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; 2560 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2532 net->ipv4.sysctl_tcp_tw_reuse = 0; 2561 net->ipv4.sysctl_tcp_tw_reuse = 2;
2533 2562
2534 cnt = tcp_hashinfo.ehash_mask + 1; 2563 cnt = tcp_hashinfo.ehash_mask + 1;
2535 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; 2564 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;