diff options
author | Eric Dumazet <edumazet@google.com> | 2015-04-12 21:51:09 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-04-13 16:40:05 -0400 |
commit | 789f558cfb3680aeb52de137418637f6b04b7d22 (patch) | |
tree | 0031c54a2fe41480ed509ba140a1c12ecad075a6 /net/ipv4/tcp_minisocks.c | |
parent | 20a1d16526b79c76cd45e29cb637aec1d43c41de (diff) |
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.
This does not scale, the code is ugly and is a source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)
We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.
Tested:
On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)
Before patch :
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171
While test is running, we can observe 25 or even 33 ms latencies.
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2
After patch :
About 90% increase in throughput:
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442
lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992
And latencies are kept to minimal values during this load, even
if network utilization is 90% higher :
lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 35 |
1 file changed, 9 insertions(+), 26 deletions(-)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2088fdcca141..63d6311b5365 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly; | |||
34 | 34 | ||
35 | struct inet_timewait_death_row tcp_death_row = { | 35 | struct inet_timewait_death_row tcp_death_row = { |
36 | .sysctl_max_tw_buckets = NR_FILE * 2, | 36 | .sysctl_max_tw_buckets = NR_FILE * 2, |
37 | .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, | ||
38 | .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), | ||
39 | .hashinfo = &tcp_hashinfo, | 37 | .hashinfo = &tcp_hashinfo, |
40 | .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, | ||
41 | (unsigned long)&tcp_death_row), | ||
42 | .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, | ||
43 | inet_twdr_twkill_work), | ||
44 | /* Short-time timewait calendar */ | ||
45 | |||
46 | .twcal_hand = -1, | ||
47 | .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, | ||
48 | (unsigned long)&tcp_death_row), | ||
49 | }; | 38 | }; |
50 | EXPORT_SYMBOL_GPL(tcp_death_row); | 39 | EXPORT_SYMBOL_GPL(tcp_death_row); |
51 | 40 | ||
@@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | |||
158 | if (!th->fin || | 147 | if (!th->fin || |
159 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { | 148 | TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { |
160 | kill_with_rst: | 149 | kill_with_rst: |
161 | inet_twsk_deschedule(tw, &tcp_death_row); | 150 | inet_twsk_deschedule(tw); |
162 | inet_twsk_put(tw); | 151 | inet_twsk_put(tw); |
163 | return TCP_TW_RST; | 152 | return TCP_TW_RST; |
164 | } | 153 | } |
@@ -174,11 +163,9 @@ kill_with_rst: | |||
174 | if (tcp_death_row.sysctl_tw_recycle && | 163 | if (tcp_death_row.sysctl_tw_recycle && |
175 | tcptw->tw_ts_recent_stamp && | 164 | tcptw->tw_ts_recent_stamp && |
176 | tcp_tw_remember_stamp(tw)) | 165 | tcp_tw_remember_stamp(tw)) |
177 | inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, | 166 | inet_twsk_schedule(tw, tw->tw_timeout); |
178 | TCP_TIMEWAIT_LEN); | ||
179 | else | 167 | else |
180 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 168 | inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); |
181 | TCP_TIMEWAIT_LEN); | ||
182 | return TCP_TW_ACK; | 169 | return TCP_TW_ACK; |
183 | } | 170 | } |
184 | 171 | ||
@@ -211,13 +198,12 @@ kill_with_rst: | |||
211 | */ | 198 | */ |
212 | if (sysctl_tcp_rfc1337 == 0) { | 199 | if (sysctl_tcp_rfc1337 == 0) { |
213 | kill: | 200 | kill: |
214 | inet_twsk_deschedule(tw, &tcp_death_row); | 201 | inet_twsk_deschedule(tw); |
215 | inet_twsk_put(tw); | 202 | inet_twsk_put(tw); |
216 | return TCP_TW_SUCCESS; | 203 | return TCP_TW_SUCCESS; |
217 | } | 204 | } |
218 | } | 205 | } |
219 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 206 | inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); |
220 | TCP_TIMEWAIT_LEN); | ||
221 | 207 | ||
222 | if (tmp_opt.saw_tstamp) { | 208 | if (tmp_opt.saw_tstamp) { |
223 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; | 209 | tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
@@ -267,8 +253,7 @@ kill: | |||
267 | * Do not reschedule in the last case. | 253 | * Do not reschedule in the last case. |
268 | */ | 254 | */ |
269 | if (paws_reject || th->ack) | 255 | if (paws_reject || th->ack) |
270 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 256 | inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); |
271 | TCP_TIMEWAIT_LEN); | ||
272 | 257 | ||
273 | return tcp_timewait_check_oow_rate_limit( | 258 | return tcp_timewait_check_oow_rate_limit( |
274 | tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); | 259 | tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); |
@@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process); | |||
283 | */ | 268 | */ |
284 | void tcp_time_wait(struct sock *sk, int state, int timeo) | 269 | void tcp_time_wait(struct sock *sk, int state, int timeo) |
285 | { | 270 | { |
286 | struct inet_timewait_sock *tw = NULL; | ||
287 | const struct inet_connection_sock *icsk = inet_csk(sk); | 271 | const struct inet_connection_sock *icsk = inet_csk(sk); |
288 | const struct tcp_sock *tp = tcp_sk(sk); | 272 | const struct tcp_sock *tp = tcp_sk(sk); |
273 | struct inet_timewait_sock *tw; | ||
289 | bool recycle_ok = false; | 274 | bool recycle_ok = false; |
290 | 275 | ||
291 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) | 276 | if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) |
292 | recycle_ok = tcp_remember_stamp(sk); | 277 | recycle_ok = tcp_remember_stamp(sk); |
293 | 278 | ||
294 | if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) | 279 | tw = inet_twsk_alloc(sk, &tcp_death_row, state); |
295 | tw = inet_twsk_alloc(sk, state); | ||
296 | 280 | ||
297 | if (tw) { | 281 | if (tw) { |
298 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | 282 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); |
@@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
355 | timeo = TCP_TIMEWAIT_LEN; | 339 | timeo = TCP_TIMEWAIT_LEN; |
356 | } | 340 | } |
357 | 341 | ||
358 | inet_twsk_schedule(tw, &tcp_death_row, timeo, | 342 | inet_twsk_schedule(tw, timeo); |
359 | TCP_TIMEWAIT_LEN); | ||
360 | inet_twsk_put(tw); | 343 | inet_twsk_put(tw); |
361 | } else { | 344 | } else { |
362 | /* Sorry, if we're out of memory, just CLOSE this | 345 | /* Sorry, if we're out of memory, just CLOSE this |