aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_minisocks.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r--net/ipv4/tcp_minisocks.c281
1 files changed, 10 insertions, 271 deletions
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 81b9a52c50c6..dc085233d512 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,12 +35,6 @@
35#define SYNC_INIT 1 35#define SYNC_INIT 1
36#endif 36#endif
37 37
38/* New-style handling of TIME_WAIT sockets. */
39
40static void inet_twdr_hangman(unsigned long data);
41static void inet_twdr_twkill_work(void *data);
42static void inet_twdr_twcal_tick(unsigned long data);
43
44int sysctl_tcp_syncookies = SYNC_INIT; 38int sysctl_tcp_syncookies = SYNC_INIT;
45int sysctl_tcp_abort_on_overflow; 39int sysctl_tcp_abort_on_overflow;
46 40
@@ -63,10 +57,6 @@ struct inet_timewait_death_row tcp_death_row = {
63 57
64EXPORT_SYMBOL_GPL(tcp_death_row); 58EXPORT_SYMBOL_GPL(tcp_death_row);
65 59
66static void inet_twsk_schedule(struct inet_timewait_sock *tw,
67 struct inet_timewait_death_row *twdr,
68 const int timeo);
69
70static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 60static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
71{ 61{
72 if (seq == s_win) 62 if (seq == s_win)
@@ -173,9 +163,11 @@ kill_with_rst:
173 if (tw->tw_family == AF_INET && 163 if (tw->tw_family == AF_INET &&
174 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && 164 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
175 tcp_v4_tw_remember_stamp(tw)) 165 tcp_v4_tw_remember_stamp(tw))
176 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout); 166 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
167 TCP_TIMEWAIT_LEN);
177 else 168 else
178 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); 169 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
170 TCP_TIMEWAIT_LEN);
179 return TCP_TW_ACK; 171 return TCP_TW_ACK;
180 } 172 }
181 173
@@ -213,7 +205,8 @@ kill:
213 return TCP_TW_SUCCESS; 205 return TCP_TW_SUCCESS;
214 } 206 }
215 } 207 }
216 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); 208 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
209 TCP_TIMEWAIT_LEN);
217 210
218 if (tmp_opt.saw_tstamp) { 211 if (tmp_opt.saw_tstamp) {
219 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 212 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
@@ -263,7 +256,8 @@ kill:
263 * Do not reschedule in the last case. 256 * Do not reschedule in the last case.
264 */ 257 */
265 if (paws_reject || th->ack) 258 if (paws_reject || th->ack)
266 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); 259 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
260 TCP_TIMEWAIT_LEN);
267 261
268 /* Send ACK. Note, we do not put the bucket, 262 /* Send ACK. Note, we do not put the bucket,
269 * it will be released by caller. 263 * it will be released by caller.
@@ -326,7 +320,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
326 timeo = TCP_TIMEWAIT_LEN; 320 timeo = TCP_TIMEWAIT_LEN;
327 } 321 }
328 322
329 inet_twsk_schedule(tw, &tcp_death_row, timeo); 323 inet_twsk_schedule(tw, &tcp_death_row, timeo,
324 TCP_TIMEWAIT_LEN);
330 inet_twsk_put(tw); 325 inet_twsk_put(tw);
331 } else { 326 } else {
332 /* Sorry, if we're out of memory, just CLOSE this 327 /* Sorry, if we're out of memory, just CLOSE this
@@ -341,261 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
341 tcp_done(sk); 336 tcp_done(sk);
342} 337}
343 338
344/* Returns non-zero if quota exceeded. */
345static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
346 const int slot)
347{
348 struct inet_timewait_sock *tw;
349 struct hlist_node *node;
350 unsigned int killed;
351 int ret;
352
353 /* NOTE: compare this to previous version where lock
354 * was released after detaching chain. It was racy,
355 * because tw buckets are scheduled in not serialized context
356 * in 2.3 (with netfilter), and with softnet it is common, because
357 * soft irqs are not sequenced.
358 */
359 killed = 0;
360 ret = 0;
361rescan:
362 inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
363 __inet_twsk_del_dead_node(tw);
364 spin_unlock(&twdr->death_lock);
365 __inet_twsk_kill(tw, twdr->hashinfo);
366 inet_twsk_put(tw);
367 killed++;
368 spin_lock(&twdr->death_lock);
369 if (killed > INET_TWDR_TWKILL_QUOTA) {
370 ret = 1;
371 break;
372 }
373
374 /* While we dropped twdr->death_lock, another cpu may have
375 * killed off the next TW bucket in the list, therefore
376 * do a fresh re-read of the hlist head node with the
377 * lock reacquired. We still use the hlist traversal
378 * macro in order to get the prefetches.
379 */
380 goto rescan;
381 }
382
383 twdr->tw_count -= killed;
384 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
385
386 return ret;
387}
388
389static void inet_twdr_hangman(unsigned long data)
390{
391 struct inet_timewait_death_row *twdr;
392 int unsigned need_timer;
393
394 twdr = (struct inet_timewait_death_row *)data;
395 spin_lock(&twdr->death_lock);
396
397 if (twdr->tw_count == 0)
398 goto out;
399
400 need_timer = 0;
401 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
402 twdr->thread_slots |= (1 << twdr->slot);
403 mb();
404 schedule_work(&twdr->twkill_work);
405 need_timer = 1;
406 } else {
407 /* We purged the entire slot, anything left? */
408 if (twdr->tw_count)
409 need_timer = 1;
410 }
411 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
412 if (need_timer)
413 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
414out:
415 spin_unlock(&twdr->death_lock);
416}
417
418extern void twkill_slots_invalid(void);
419
420static void inet_twdr_twkill_work(void *data)
421{
422 struct inet_timewait_death_row *twdr = data;
423 int i;
424
425 if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
426 twkill_slots_invalid();
427
428 while (twdr->thread_slots) {
429 spin_lock_bh(&twdr->death_lock);
430 for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
431 if (!(twdr->thread_slots & (1 << i)))
432 continue;
433
434 while (inet_twdr_do_twkill_work(twdr, i) != 0) {
435 if (need_resched()) {
436 spin_unlock_bh(&twdr->death_lock);
437 schedule();
438 spin_lock_bh(&twdr->death_lock);
439 }
440 }
441
442 twdr->thread_slots &= ~(1 << i);
443 }
444 spin_unlock_bh(&twdr->death_lock);
445 }
446}
447
448/* These are always called from BH context. See callers in
449 * tcp_input.c to verify this.
450 */
451
452/* This is for handling early-kills of TIME_WAIT sockets. */
453void inet_twsk_deschedule(struct inet_timewait_sock *tw,
454 struct inet_timewait_death_row *twdr)
455{
456 spin_lock(&twdr->death_lock);
457 if (inet_twsk_del_dead_node(tw)) {
458 inet_twsk_put(tw);
459 if (--twdr->tw_count == 0)
460 del_timer(&twdr->tw_timer);
461 }
462 spin_unlock(&twdr->death_lock);
463 __inet_twsk_kill(tw, twdr->hashinfo);
464}
465
466static void inet_twsk_schedule(struct inet_timewait_sock *tw,
467 struct inet_timewait_death_row *twdr,
468 const int timeo)
469{
470 struct hlist_head *list;
471 int slot;
472
473 /* timeout := RTO * 3.5
474 *
475 * 3.5 = 1+2+0.5 to wait for two retransmits.
476 *
477 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
478 * our ACK acking that FIN can be lost. If N subsequent retransmitted
479 * FINs (or previous segments) are lost (probability of such event
480 * is p^(N+1), where p is probability to lose single packet and
481 * time to detect the loss is about RTO*(2^N - 1) with exponential
482 * backoff). Normal timewait length is calculated so, that we
483 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
484 * [ BTW Linux, following BSD, violates this requirement waiting
485 * only for 60sec, we should wait at least for 240 secs.
486 * Well, 240 consumes too much of resources 8)
487 * ]
488 * This interval is not reduced to catch old duplicate and
489 * responses to our wandering segments living for two MSLs.
490 * However, if we use PAWS to detect
491 * old duplicates, we can reduce the interval to bounds required
492 * by RTO, rather than MSL. So, if peer understands PAWS, we
493 * kill tw bucket after 3.5*RTO (it is important that this number
494 * is greater than TS tick!) and detect old duplicates with help
495 * of PAWS.
496 */
497 slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
498
499 spin_lock(&twdr->death_lock);
500
501 /* Unlink it, if it was scheduled */
502 if (inet_twsk_del_dead_node(tw))
503 twdr->tw_count--;
504 else
505 atomic_inc(&tw->tw_refcnt);
506
507 if (slot >= INET_TWDR_RECYCLE_SLOTS) {
508 /* Schedule to slow timer */
509 if (timeo >= TCP_TIMEWAIT_LEN) {
510 slot = INET_TWDR_TWKILL_SLOTS - 1;
511 } else {
512 slot = (timeo + twdr->period - 1) / twdr->period;
513 if (slot >= INET_TWDR_TWKILL_SLOTS)
514 slot = INET_TWDR_TWKILL_SLOTS - 1;
515 }
516 tw->tw_ttd = jiffies + timeo;
517 slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
518 list = &twdr->cells[slot];
519 } else {
520 tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
521
522 if (twdr->twcal_hand < 0) {
523 twdr->twcal_hand = 0;
524 twdr->twcal_jiffie = jiffies;
525 twdr->twcal_timer.expires = twdr->twcal_jiffie +
526 (slot << INET_TWDR_RECYCLE_TICK);
527 add_timer(&twdr->twcal_timer);
528 } else {
529 if (time_after(twdr->twcal_timer.expires,
530 jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
531 mod_timer(&twdr->twcal_timer,
532 jiffies + (slot << INET_TWDR_RECYCLE_TICK));
533 slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
534 }
535 list = &twdr->twcal_row[slot];
536 }
537
538 hlist_add_head(&tw->tw_death_node, list);
539
540 if (twdr->tw_count++ == 0)
541 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
542 spin_unlock(&twdr->death_lock);
543}
544
545void inet_twdr_twcal_tick(unsigned long data)
546{
547 struct inet_timewait_death_row *twdr;
548 int n, slot;
549 unsigned long j;
550 unsigned long now = jiffies;
551 int killed = 0;
552 int adv = 0;
553
554 twdr = (struct inet_timewait_death_row *)data;
555
556 spin_lock(&twdr->death_lock);
557 if (twdr->twcal_hand < 0)
558 goto out;
559
560 slot = twdr->twcal_hand;
561 j = twdr->twcal_jiffie;
562
563 for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
564 if (time_before_eq(j, now)) {
565 struct hlist_node *node, *safe;
566 struct inet_timewait_sock *tw;
567
568 inet_twsk_for_each_inmate_safe(tw, node, safe,
569 &twdr->twcal_row[slot]) {
570 __inet_twsk_del_dead_node(tw);
571 __inet_twsk_kill(tw, twdr->hashinfo);
572 inet_twsk_put(tw);
573 killed++;
574 }
575 } else {
576 if (!adv) {
577 adv = 1;
578 twdr->twcal_jiffie = j;
579 twdr->twcal_hand = slot;
580 }
581
582 if (!hlist_empty(&twdr->twcal_row[slot])) {
583 mod_timer(&twdr->twcal_timer, j);
584 goto out;
585 }
586 }
587 j += 1 << INET_TWDR_RECYCLE_TICK;
588 slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
589 }
590 twdr->twcal_hand = -1;
591
592out:
593 if ((twdr->tw_count -= killed) == 0)
594 del_timer(&twdr->tw_timer);
595 NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
596 spin_unlock(&twdr->death_lock);
597}
598
599/* This is not only more efficient than what we used to do, it eliminates 339/* This is not only more efficient than what we used to do, it eliminates
600 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 340 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
601 * 341 *
@@ -933,4 +673,3 @@ EXPORT_SYMBOL(tcp_check_req);
933EXPORT_SYMBOL(tcp_child_process); 673EXPORT_SYMBOL(tcp_child_process);
934EXPORT_SYMBOL(tcp_create_openreq_child); 674EXPORT_SYMBOL(tcp_create_openreq_child);
935EXPORT_SYMBOL(tcp_timewait_state_process); 675EXPORT_SYMBOL(tcp_timewait_state_process);
936EXPORT_SYMBOL(inet_twsk_deschedule);