about summary refs log tree commit diff stats
path: root/net/ipv4/tcp_minisocks.c
diff options
context:
space:
mode:
author	Eric Dumazet <edumazet@google.com>	2015-04-12 21:51:09 -0400
committer	David S. Miller <davem@davemloft.net>	2015-04-13 16:40:05 -0400
commit	789f558cfb3680aeb52de137418637f6b04b7d22 (patch)
tree	0031c54a2fe41480ed509ba140a1c12ecad075a6	/net/ipv4/tcp_minisocks.c
parent	20a1d16526b79c76cd45e29cb637aec1d43c41de (diff)
tcp/dccp: get rid of central timewait timer
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.

This does not scale, code is ugly and source of huge latencies
(Typically 30 ms have been seen, cpus spinning on death_lock spinlock.)

We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.

Tested:
On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24)

Before patch :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171

While test is running, we can observe 25 or even 33 ms latencies.

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2

After patch :

About 90% increase of throughput :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992

And latencies are kept to minimal values during this load, even if
network utilization is 90% higher :

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r--	net/ipv4/tcp_minisocks.c	35
1 file changed, 9 insertions(+), 26 deletions(-)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2088fdcca141..63d6311b5365 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly;
 
 struct inet_timewait_death_row tcp_death_row = {
 	.sysctl_max_tw_buckets = NR_FILE * 2,
-	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
-	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
 	.hashinfo	= &tcp_hashinfo,
-	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
-					    (unsigned long)&tcp_death_row),
-	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
-					     inet_twdr_twkill_work),
-/* Short-time timewait calendar */
-
-	.twcal_hand	= -1,
-	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
-					    (unsigned long)&tcp_death_row),
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
@@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 		if (!th->fin ||
 		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
 kill_with_rst:
-			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_deschedule(tw);
 			inet_twsk_put(tw);
 			return TCP_TW_RST;
 		}
@@ -174,11 +163,9 @@ kill_with_rst:
 		if (tcp_death_row.sysctl_tw_recycle &&
 		    tcptw->tw_ts_recent_stamp &&
 		    tcp_tw_remember_stamp(tw))
-			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
-					   TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, tw->tw_timeout);
 		else
-			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-					   TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 		return TCP_TW_ACK;
 	}
 
@@ -211,13 +198,12 @@ kill_with_rst:
 		 */
 		if (sysctl_tcp_rfc1337 == 0) {
 kill:
-			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_deschedule(tw);
 			inet_twsk_put(tw);
 			return TCP_TW_SUCCESS;
 		}
 	}
-	inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-			   TCP_TIMEWAIT_LEN);
+	inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 
 	if (tmp_opt.saw_tstamp) {
 		tcptw->tw_ts_recent	= tmp_opt.rcv_tsval;
@@ -267,8 +253,7 @@ kill:
 	 * Do not reschedule in the last case.
 	 */
 	if (paws_reject || th->ack)
-		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-				   TCP_TIMEWAIT_LEN);
+		inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 
 	return tcp_timewait_check_oow_rate_limit(
 			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
@@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process);
  */
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
-	struct inet_timewait_sock *tw = NULL;
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_timewait_sock *tw;
 	bool recycle_ok = false;
 
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
 
-	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
-		tw = inet_twsk_alloc(sk, state);
+	tw = inet_twsk_alloc(sk, &tcp_death_row, state);
 
 	if (tw) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 				timeo = TCP_TIMEWAIT_LEN;
 		}
 
-		inet_twsk_schedule(tw, &tcp_death_row, timeo,
-				   TCP_TIMEWAIT_LEN);
+		inet_twsk_schedule(tw, timeo);
 		inet_twsk_put(tw);
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this