diff options
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 402 |
1 files changed, 79 insertions, 323 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 69b1fcf70077..02fdda68718d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -269,13 +269,12 @@ | |||
269 | 269 | ||
270 | int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; | 270 | int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; |
271 | 271 | ||
272 | DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); | 272 | DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; |
273 | |||
274 | kmem_cache_t *tcp_bucket_cachep; | ||
275 | kmem_cache_t *tcp_timewait_cachep; | ||
276 | 273 | ||
277 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); | 274 | atomic_t tcp_orphan_count = ATOMIC_INIT(0); |
278 | 275 | ||
276 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | ||
277 | |||
279 | int sysctl_tcp_mem[3]; | 278 | int sysctl_tcp_mem[3]; |
280 | int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; | 279 | int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; |
281 | int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; | 280 | int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; |
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void) | |||
311 | EXPORT_SYMBOL(tcp_enter_memory_pressure); | 310 | EXPORT_SYMBOL(tcp_enter_memory_pressure); |
312 | 311 | ||
313 | /* | 312 | /* |
314 | * LISTEN is a special case for poll.. | ||
315 | */ | ||
316 | static __inline__ unsigned int tcp_listen_poll(struct sock *sk, | ||
317 | poll_table *wait) | ||
318 | { | ||
319 | return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Wait for a TCP event. | 313 | * Wait for a TCP event. |
324 | * | 314 | * |
325 | * Note that we don't need to lock the socket, as the upper poll layers | 315 | * Note that we don't need to lock the socket, as the upper poll layers |
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
334 | 324 | ||
335 | poll_wait(file, sk->sk_sleep, wait); | 325 | poll_wait(file, sk->sk_sleep, wait); |
336 | if (sk->sk_state == TCP_LISTEN) | 326 | if (sk->sk_state == TCP_LISTEN) |
337 | return tcp_listen_poll(sk, wait); | 327 | return inet_csk_listen_poll(sk); |
338 | 328 | ||
339 | /* Socket is not locked. We are protected from async events | 329 | /* Socket is not locked. We are protected from async events |
340 | by poll logic and correct handling of state changes | 330 | by poll logic and correct handling of state changes |
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
457 | return put_user(answ, (int __user *)arg); | 447 | return put_user(answ, (int __user *)arg); |
458 | } | 448 | } |
459 | 449 | ||
460 | |||
461 | int tcp_listen_start(struct sock *sk) | ||
462 | { | ||
463 | struct inet_sock *inet = inet_sk(sk); | ||
464 | struct tcp_sock *tp = tcp_sk(sk); | ||
465 | int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE); | ||
466 | |||
467 | if (rc != 0) | ||
468 | return rc; | ||
469 | |||
470 | sk->sk_max_ack_backlog = 0; | ||
471 | sk->sk_ack_backlog = 0; | ||
472 | tcp_delack_init(tp); | ||
473 | |||
474 | /* There is race window here: we announce ourselves listening, | ||
475 | * but this transition is still not validated by get_port(). | ||
476 | * It is OK, because this socket enters to hash table only | ||
477 | * after validation is complete. | ||
478 | */ | ||
479 | sk->sk_state = TCP_LISTEN; | ||
480 | if (!sk->sk_prot->get_port(sk, inet->num)) { | ||
481 | inet->sport = htons(inet->num); | ||
482 | |||
483 | sk_dst_reset(sk); | ||
484 | sk->sk_prot->hash(sk); | ||
485 | |||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | sk->sk_state = TCP_CLOSE; | ||
490 | reqsk_queue_destroy(&tp->accept_queue); | ||
491 | return -EADDRINUSE; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * This routine closes sockets which have been at least partially | ||
496 | * opened, but not yet accepted. | ||
497 | */ | ||
498 | |||
499 | static void tcp_listen_stop (struct sock *sk) | ||
500 | { | ||
501 | struct tcp_sock *tp = tcp_sk(sk); | ||
502 | struct listen_sock *lopt; | ||
503 | struct request_sock *acc_req; | ||
504 | struct request_sock *req; | ||
505 | int i; | ||
506 | |||
507 | tcp_delete_keepalive_timer(sk); | ||
508 | |||
509 | /* make all the listen_opt local to us */ | ||
510 | lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue); | ||
511 | acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); | ||
512 | |||
513 | if (lopt->qlen) { | ||
514 | for (i = 0; i < TCP_SYNQ_HSIZE; i++) { | ||
515 | while ((req = lopt->syn_table[i]) != NULL) { | ||
516 | lopt->syn_table[i] = req->dl_next; | ||
517 | lopt->qlen--; | ||
518 | reqsk_free(req); | ||
519 | |||
520 | /* Following specs, it would be better either to send FIN | ||
521 | * (and enter FIN-WAIT-1, it is normal close) | ||
522 | * or to send active reset (abort). | ||
523 | * Certainly, it is pretty dangerous while synflood, but it is | ||
524 | * bad justification for our negligence 8) | ||
525 | * To be honest, we are not able to make either | ||
526 | * of the variants now. --ANK | ||
527 | */ | ||
528 | } | ||
529 | } | ||
530 | } | ||
531 | BUG_TRAP(!lopt->qlen); | ||
532 | |||
533 | kfree(lopt); | ||
534 | |||
535 | while ((req = acc_req) != NULL) { | ||
536 | struct sock *child = req->sk; | ||
537 | |||
538 | acc_req = req->dl_next; | ||
539 | |||
540 | local_bh_disable(); | ||
541 | bh_lock_sock(child); | ||
542 | BUG_TRAP(!sock_owned_by_user(child)); | ||
543 | sock_hold(child); | ||
544 | |||
545 | tcp_disconnect(child, O_NONBLOCK); | ||
546 | |||
547 | sock_orphan(child); | ||
548 | |||
549 | atomic_inc(&tcp_orphan_count); | ||
550 | |||
551 | tcp_destroy_sock(child); | ||
552 | |||
553 | bh_unlock_sock(child); | ||
554 | local_bh_enable(); | ||
555 | sock_put(child); | ||
556 | |||
557 | sk_acceptq_removed(sk); | ||
558 | __reqsk_free(req); | ||
559 | } | ||
560 | BUG_TRAP(!sk->sk_ack_backlog); | ||
561 | } | ||
562 | |||
563 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) | 450 | static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) |
564 | { | 451 | { |
565 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; | 452 | TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; |
@@ -975,7 +862,7 @@ do_fault: | |||
975 | if (!skb->len) { | 862 | if (!skb->len) { |
976 | if (sk->sk_send_head == skb) | 863 | if (sk->sk_send_head == skb) |
977 | sk->sk_send_head = NULL; | 864 | sk->sk_send_head = NULL; |
978 | __skb_unlink(skb, skb->list); | 865 | __skb_unlink(skb, &sk->sk_write_queue); |
979 | sk_stream_free_skb(sk, skb); | 866 | sk_stream_free_skb(sk, skb); |
980 | } | 867 | } |
981 | 868 | ||
@@ -1057,20 +944,21 @@ static void cleanup_rbuf(struct sock *sk, int copied) | |||
1057 | BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); | 944 | BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); |
1058 | #endif | 945 | #endif |
1059 | 946 | ||
1060 | if (tcp_ack_scheduled(tp)) { | 947 | if (inet_csk_ack_scheduled(sk)) { |
948 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1061 | /* Delayed ACKs frequently hit locked sockets during bulk | 949 | /* Delayed ACKs frequently hit locked sockets during bulk |
1062 | * receive. */ | 950 | * receive. */ |
1063 | if (tp->ack.blocked || | 951 | if (icsk->icsk_ack.blocked || |
1064 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ | 952 | /* Once-per-two-segments ACK was not sent by tcp_input.c */ |
1065 | tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || | 953 | tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || |
1066 | /* | 954 | /* |
1067 | * If this read emptied read buffer, we send ACK, if | 955 | * If this read emptied read buffer, we send ACK, if |
1068 | * connection is not bidirectional, user drained | 956 | * connection is not bidirectional, user drained |
1069 | * receive buffer and there was a small segment | 957 | * receive buffer and there was a small segment |
1070 | * in queue. | 958 | * in queue. |
1071 | */ | 959 | */ |
1072 | (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && | 960 | (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && |
1073 | !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) | 961 | !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) |
1074 | time_to_ack = 1; | 962 | time_to_ack = 1; |
1075 | } | 963 | } |
1076 | 964 | ||
@@ -1572,40 +1460,6 @@ void tcp_shutdown(struct sock *sk, int how) | |||
1572 | } | 1460 | } |
1573 | } | 1461 | } |
1574 | 1462 | ||
1575 | /* | ||
1576 | * At this point, there should be no process reference to this | ||
1577 | * socket, and thus no user references at all. Therefore we | ||
1578 | * can assume the socket waitqueue is inactive and nobody will | ||
1579 | * try to jump onto it. | ||
1580 | */ | ||
1581 | void tcp_destroy_sock(struct sock *sk) | ||
1582 | { | ||
1583 | BUG_TRAP(sk->sk_state == TCP_CLOSE); | ||
1584 | BUG_TRAP(sock_flag(sk, SOCK_DEAD)); | ||
1585 | |||
1586 | /* It cannot be in hash table! */ | ||
1587 | BUG_TRAP(sk_unhashed(sk)); | ||
1588 | |||
1589 | /* If it has not 0 inet_sk(sk)->num, it must be bound */ | ||
1590 | BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); | ||
1591 | |||
1592 | sk->sk_prot->destroy(sk); | ||
1593 | |||
1594 | sk_stream_kill_queues(sk); | ||
1595 | |||
1596 | xfrm_sk_free_policy(sk); | ||
1597 | |||
1598 | #ifdef INET_REFCNT_DEBUG | ||
1599 | if (atomic_read(&sk->sk_refcnt) != 1) { | ||
1600 | printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", | ||
1601 | sk, atomic_read(&sk->sk_refcnt)); | ||
1602 | } | ||
1603 | #endif | ||
1604 | |||
1605 | atomic_dec(&tcp_orphan_count); | ||
1606 | sock_put(sk); | ||
1607 | } | ||
1608 | |||
1609 | void tcp_close(struct sock *sk, long timeout) | 1463 | void tcp_close(struct sock *sk, long timeout) |
1610 | { | 1464 | { |
1611 | struct sk_buff *skb; | 1465 | struct sk_buff *skb; |
@@ -1618,7 +1472,7 @@ void tcp_close(struct sock *sk, long timeout) | |||
1618 | tcp_set_state(sk, TCP_CLOSE); | 1472 | tcp_set_state(sk, TCP_CLOSE); |
1619 | 1473 | ||
1620 | /* Special case. */ | 1474 | /* Special case. */ |
1621 | tcp_listen_stop(sk); | 1475 | inet_csk_listen_stop(sk); |
1622 | 1476 | ||
1623 | goto adjudge_to_death; | 1477 | goto adjudge_to_death; |
1624 | } | 1478 | } |
@@ -1721,12 +1575,12 @@ adjudge_to_death: | |||
1721 | tcp_send_active_reset(sk, GFP_ATOMIC); | 1575 | tcp_send_active_reset(sk, GFP_ATOMIC); |
1722 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); | 1576 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); |
1723 | } else { | 1577 | } else { |
1724 | int tmo = tcp_fin_time(tp); | 1578 | const int tmo = tcp_fin_time(sk); |
1725 | 1579 | ||
1726 | if (tmo > TCP_TIMEWAIT_LEN) { | 1580 | if (tmo > TCP_TIMEWAIT_LEN) { |
1727 | tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); | 1581 | inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); |
1728 | } else { | 1582 | } else { |
1729 | atomic_inc(&tcp_orphan_count); | 1583 | atomic_inc(sk->sk_prot->orphan_count); |
1730 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); | 1584 | tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); |
1731 | goto out; | 1585 | goto out; |
1732 | } | 1586 | } |
@@ -1734,7 +1588,7 @@ adjudge_to_death: | |||
1734 | } | 1588 | } |
1735 | if (sk->sk_state != TCP_CLOSE) { | 1589 | if (sk->sk_state != TCP_CLOSE) { |
1736 | sk_stream_mem_reclaim(sk); | 1590 | sk_stream_mem_reclaim(sk); |
1737 | if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || | 1591 | if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || |
1738 | (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && | 1592 | (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && |
1739 | atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { | 1593 | atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { |
1740 | if (net_ratelimit()) | 1594 | if (net_ratelimit()) |
@@ -1745,10 +1599,10 @@ adjudge_to_death: | |||
1745 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); | 1599 | NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); |
1746 | } | 1600 | } |
1747 | } | 1601 | } |
1748 | atomic_inc(&tcp_orphan_count); | 1602 | atomic_inc(sk->sk_prot->orphan_count); |
1749 | 1603 | ||
1750 | if (sk->sk_state == TCP_CLOSE) | 1604 | if (sk->sk_state == TCP_CLOSE) |
1751 | tcp_destroy_sock(sk); | 1605 | inet_csk_destroy_sock(sk); |
1752 | /* Otherwise, socket is reprieved until protocol close. */ | 1606 | /* Otherwise, socket is reprieved until protocol close. */ |
1753 | 1607 | ||
1754 | out: | 1608 | out: |
@@ -1769,6 +1623,7 @@ static inline int tcp_need_reset(int state) | |||
1769 | int tcp_disconnect(struct sock *sk, int flags) | 1623 | int tcp_disconnect(struct sock *sk, int flags) |
1770 | { | 1624 | { |
1771 | struct inet_sock *inet = inet_sk(sk); | 1625 | struct inet_sock *inet = inet_sk(sk); |
1626 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1772 | struct tcp_sock *tp = tcp_sk(sk); | 1627 | struct tcp_sock *tp = tcp_sk(sk); |
1773 | int err = 0; | 1628 | int err = 0; |
1774 | int old_state = sk->sk_state; | 1629 | int old_state = sk->sk_state; |
@@ -1778,7 +1633,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1778 | 1633 | ||
1779 | /* ABORT function of RFC793 */ | 1634 | /* ABORT function of RFC793 */ |
1780 | if (old_state == TCP_LISTEN) { | 1635 | if (old_state == TCP_LISTEN) { |
1781 | tcp_listen_stop(sk); | 1636 | inet_csk_listen_stop(sk); |
1782 | } else if (tcp_need_reset(old_state) || | 1637 | } else if (tcp_need_reset(old_state) || |
1783 | (tp->snd_nxt != tp->write_seq && | 1638 | (tp->snd_nxt != tp->write_seq && |
1784 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { | 1639 | (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { |
@@ -1805,125 +1660,34 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1805 | tp->srtt = 0; | 1660 | tp->srtt = 0; |
1806 | if ((tp->write_seq += tp->max_window + 2) == 0) | 1661 | if ((tp->write_seq += tp->max_window + 2) == 0) |
1807 | tp->write_seq = 1; | 1662 | tp->write_seq = 1; |
1808 | tp->backoff = 0; | 1663 | icsk->icsk_backoff = 0; |
1809 | tp->snd_cwnd = 2; | 1664 | tp->snd_cwnd = 2; |
1810 | tp->probes_out = 0; | 1665 | icsk->icsk_probes_out = 0; |
1811 | tp->packets_out = 0; | 1666 | tp->packets_out = 0; |
1812 | tp->snd_ssthresh = 0x7fffffff; | 1667 | tp->snd_ssthresh = 0x7fffffff; |
1813 | tp->snd_cwnd_cnt = 0; | 1668 | tp->snd_cwnd_cnt = 0; |
1814 | tcp_set_ca_state(tp, TCP_CA_Open); | 1669 | tcp_set_ca_state(sk, TCP_CA_Open); |
1815 | tcp_clear_retrans(tp); | 1670 | tcp_clear_retrans(tp); |
1816 | tcp_delack_init(tp); | 1671 | inet_csk_delack_init(sk); |
1817 | sk->sk_send_head = NULL; | 1672 | sk->sk_send_head = NULL; |
1818 | tp->rx_opt.saw_tstamp = 0; | 1673 | tp->rx_opt.saw_tstamp = 0; |
1819 | tcp_sack_reset(&tp->rx_opt); | 1674 | tcp_sack_reset(&tp->rx_opt); |
1820 | __sk_dst_reset(sk); | 1675 | __sk_dst_reset(sk); |
1821 | 1676 | ||
1822 | BUG_TRAP(!inet->num || tp->bind_hash); | 1677 | BUG_TRAP(!inet->num || icsk->icsk_bind_hash); |
1823 | 1678 | ||
1824 | sk->sk_error_report(sk); | 1679 | sk->sk_error_report(sk); |
1825 | return err; | 1680 | return err; |
1826 | } | 1681 | } |
1827 | 1682 | ||
1828 | /* | 1683 | /* |
1829 | * Wait for an incoming connection, avoid race | ||
1830 | * conditions. This must be called with the socket locked. | ||
1831 | */ | ||
1832 | static int wait_for_connect(struct sock *sk, long timeo) | ||
1833 | { | ||
1834 | struct tcp_sock *tp = tcp_sk(sk); | ||
1835 | DEFINE_WAIT(wait); | ||
1836 | int err; | ||
1837 | |||
1838 | /* | ||
1839 | * True wake-one mechanism for incoming connections: only | ||
1840 | * one process gets woken up, not the 'whole herd'. | ||
1841 | * Since we do not 'race & poll' for established sockets | ||
1842 | * anymore, the common case will execute the loop only once. | ||
1843 | * | ||
1844 | * Subtle issue: "add_wait_queue_exclusive()" will be added | ||
1845 | * after any current non-exclusive waiters, and we know that | ||
1846 | * it will always _stay_ after any new non-exclusive waiters | ||
1847 | * because all non-exclusive waiters are added at the | ||
1848 | * beginning of the wait-queue. As such, it's ok to "drop" | ||
1849 | * our exclusiveness temporarily when we get woken up without | ||
1850 | * having to remove and re-insert us on the wait queue. | ||
1851 | */ | ||
1852 | for (;;) { | ||
1853 | prepare_to_wait_exclusive(sk->sk_sleep, &wait, | ||
1854 | TASK_INTERRUPTIBLE); | ||
1855 | release_sock(sk); | ||
1856 | if (reqsk_queue_empty(&tp->accept_queue)) | ||
1857 | timeo = schedule_timeout(timeo); | ||
1858 | lock_sock(sk); | ||
1859 | err = 0; | ||
1860 | if (!reqsk_queue_empty(&tp->accept_queue)) | ||
1861 | break; | ||
1862 | err = -EINVAL; | ||
1863 | if (sk->sk_state != TCP_LISTEN) | ||
1864 | break; | ||
1865 | err = sock_intr_errno(timeo); | ||
1866 | if (signal_pending(current)) | ||
1867 | break; | ||
1868 | err = -EAGAIN; | ||
1869 | if (!timeo) | ||
1870 | break; | ||
1871 | } | ||
1872 | finish_wait(sk->sk_sleep, &wait); | ||
1873 | return err; | ||
1874 | } | ||
1875 | |||
1876 | /* | ||
1877 | * This will accept the next outstanding connection. | ||
1878 | */ | ||
1879 | |||
1880 | struct sock *tcp_accept(struct sock *sk, int flags, int *err) | ||
1881 | { | ||
1882 | struct tcp_sock *tp = tcp_sk(sk); | ||
1883 | struct sock *newsk; | ||
1884 | int error; | ||
1885 | |||
1886 | lock_sock(sk); | ||
1887 | |||
1888 | /* We need to make sure that this socket is listening, | ||
1889 | * and that it has something pending. | ||
1890 | */ | ||
1891 | error = -EINVAL; | ||
1892 | if (sk->sk_state != TCP_LISTEN) | ||
1893 | goto out_err; | ||
1894 | |||
1895 | /* Find already established connection */ | ||
1896 | if (reqsk_queue_empty(&tp->accept_queue)) { | ||
1897 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | ||
1898 | |||
1899 | /* If this is a non blocking socket don't sleep */ | ||
1900 | error = -EAGAIN; | ||
1901 | if (!timeo) | ||
1902 | goto out_err; | ||
1903 | |||
1904 | error = wait_for_connect(sk, timeo); | ||
1905 | if (error) | ||
1906 | goto out_err; | ||
1907 | } | ||
1908 | |||
1909 | newsk = reqsk_queue_get_child(&tp->accept_queue, sk); | ||
1910 | BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); | ||
1911 | out: | ||
1912 | release_sock(sk); | ||
1913 | return newsk; | ||
1914 | out_err: | ||
1915 | newsk = NULL; | ||
1916 | *err = error; | ||
1917 | goto out; | ||
1918 | } | ||
1919 | |||
1920 | /* | ||
1921 | * Socket option code for TCP. | 1684 | * Socket option code for TCP. |
1922 | */ | 1685 | */ |
1923 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | 1686 | int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, |
1924 | int optlen) | 1687 | int optlen) |
1925 | { | 1688 | { |
1926 | struct tcp_sock *tp = tcp_sk(sk); | 1689 | struct tcp_sock *tp = tcp_sk(sk); |
1690 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
1927 | int val; | 1691 | int val; |
1928 | int err = 0; | 1692 | int err = 0; |
1929 | 1693 | ||
@@ -1945,7 +1709,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
1945 | name[val] = 0; | 1709 | name[val] = 0; |
1946 | 1710 | ||
1947 | lock_sock(sk); | 1711 | lock_sock(sk); |
1948 | err = tcp_set_congestion_control(tp, name); | 1712 | err = tcp_set_congestion_control(sk, name); |
1949 | release_sock(sk); | 1713 | release_sock(sk); |
1950 | return err; | 1714 | return err; |
1951 | } | 1715 | } |
@@ -2022,7 +1786,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2022 | elapsed = tp->keepalive_time - elapsed; | 1786 | elapsed = tp->keepalive_time - elapsed; |
2023 | else | 1787 | else |
2024 | elapsed = 0; | 1788 | elapsed = 0; |
2025 | tcp_reset_keepalive_timer(sk, elapsed); | 1789 | inet_csk_reset_keepalive_timer(sk, elapsed); |
2026 | } | 1790 | } |
2027 | } | 1791 | } |
2028 | break; | 1792 | break; |
@@ -2042,7 +1806,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2042 | if (val < 1 || val > MAX_TCP_SYNCNT) | 1806 | if (val < 1 || val > MAX_TCP_SYNCNT) |
2043 | err = -EINVAL; | 1807 | err = -EINVAL; |
2044 | else | 1808 | else |
2045 | tp->syn_retries = val; | 1809 | icsk->icsk_syn_retries = val; |
2046 | break; | 1810 | break; |
2047 | 1811 | ||
2048 | case TCP_LINGER2: | 1812 | case TCP_LINGER2: |
@@ -2055,15 +1819,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2055 | break; | 1819 | break; |
2056 | 1820 | ||
2057 | case TCP_DEFER_ACCEPT: | 1821 | case TCP_DEFER_ACCEPT: |
2058 | tp->defer_accept = 0; | 1822 | icsk->icsk_accept_queue.rskq_defer_accept = 0; |
2059 | if (val > 0) { | 1823 | if (val > 0) { |
2060 | /* Translate value in seconds to number of | 1824 | /* Translate value in seconds to number of |
2061 | * retransmits */ | 1825 | * retransmits */ |
2062 | while (tp->defer_accept < 32 && | 1826 | while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && |
2063 | val > ((TCP_TIMEOUT_INIT / HZ) << | 1827 | val > ((TCP_TIMEOUT_INIT / HZ) << |
2064 | tp->defer_accept)) | 1828 | icsk->icsk_accept_queue.rskq_defer_accept)) |
2065 | tp->defer_accept++; | 1829 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2066 | tp->defer_accept++; | 1830 | icsk->icsk_accept_queue.rskq_defer_accept++; |
2067 | } | 1831 | } |
2068 | break; | 1832 | break; |
2069 | 1833 | ||
@@ -2081,16 +1845,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2081 | 1845 | ||
2082 | case TCP_QUICKACK: | 1846 | case TCP_QUICKACK: |
2083 | if (!val) { | 1847 | if (!val) { |
2084 | tp->ack.pingpong = 1; | 1848 | icsk->icsk_ack.pingpong = 1; |
2085 | } else { | 1849 | } else { |
2086 | tp->ack.pingpong = 0; | 1850 | icsk->icsk_ack.pingpong = 0; |
2087 | if ((1 << sk->sk_state) & | 1851 | if ((1 << sk->sk_state) & |
2088 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && | 1852 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
2089 | tcp_ack_scheduled(tp)) { | 1853 | inet_csk_ack_scheduled(sk)) { |
2090 | tp->ack.pending |= TCP_ACK_PUSHED; | 1854 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
2091 | cleanup_rbuf(sk, 1); | 1855 | cleanup_rbuf(sk, 1); |
2092 | if (!(val & 1)) | 1856 | if (!(val & 1)) |
2093 | tp->ack.pingpong = 1; | 1857 | icsk->icsk_ack.pingpong = 1; |
2094 | } | 1858 | } |
2095 | } | 1859 | } |
2096 | break; | 1860 | break; |
@@ -2107,15 +1871,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2107 | void tcp_get_info(struct sock *sk, struct tcp_info *info) | 1871 | void tcp_get_info(struct sock *sk, struct tcp_info *info) |
2108 | { | 1872 | { |
2109 | struct tcp_sock *tp = tcp_sk(sk); | 1873 | struct tcp_sock *tp = tcp_sk(sk); |
1874 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
2110 | u32 now = tcp_time_stamp; | 1875 | u32 now = tcp_time_stamp; |
2111 | 1876 | ||
2112 | memset(info, 0, sizeof(*info)); | 1877 | memset(info, 0, sizeof(*info)); |
2113 | 1878 | ||
2114 | info->tcpi_state = sk->sk_state; | 1879 | info->tcpi_state = sk->sk_state; |
2115 | info->tcpi_ca_state = tp->ca_state; | 1880 | info->tcpi_ca_state = icsk->icsk_ca_state; |
2116 | info->tcpi_retransmits = tp->retransmits; | 1881 | info->tcpi_retransmits = icsk->icsk_retransmits; |
2117 | info->tcpi_probes = tp->probes_out; | 1882 | info->tcpi_probes = icsk->icsk_probes_out; |
2118 | info->tcpi_backoff = tp->backoff; | 1883 | info->tcpi_backoff = icsk->icsk_backoff; |
2119 | 1884 | ||
2120 | if (tp->rx_opt.tstamp_ok) | 1885 | if (tp->rx_opt.tstamp_ok) |
2121 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; | 1886 | info->tcpi_options |= TCPI_OPT_TIMESTAMPS; |
@@ -2130,10 +1895,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
2130 | if (tp->ecn_flags&TCP_ECN_OK) | 1895 | if (tp->ecn_flags&TCP_ECN_OK) |
2131 | info->tcpi_options |= TCPI_OPT_ECN; | 1896 | info->tcpi_options |= TCPI_OPT_ECN; |
2132 | 1897 | ||
2133 | info->tcpi_rto = jiffies_to_usecs(tp->rto); | 1898 | info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); |
2134 | info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); | 1899 | info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); |
2135 | info->tcpi_snd_mss = tp->mss_cache; | 1900 | info->tcpi_snd_mss = tp->mss_cache; |
2136 | info->tcpi_rcv_mss = tp->ack.rcv_mss; | 1901 | info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; |
2137 | 1902 | ||
2138 | info->tcpi_unacked = tp->packets_out; | 1903 | info->tcpi_unacked = tp->packets_out; |
2139 | info->tcpi_sacked = tp->sacked_out; | 1904 | info->tcpi_sacked = tp->sacked_out; |
@@ -2142,7 +1907,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
2142 | info->tcpi_fackets = tp->fackets_out; | 1907 | info->tcpi_fackets = tp->fackets_out; |
2143 | 1908 | ||
2144 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); | 1909 | info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); |
2145 | info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); | 1910 | info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); |
2146 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); | 1911 | info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); |
2147 | 1912 | ||
2148 | info->tcpi_pmtu = tp->pmtu_cookie; | 1913 | info->tcpi_pmtu = tp->pmtu_cookie; |
@@ -2165,6 +1930,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info); | |||
2165 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | 1930 | int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, |
2166 | int __user *optlen) | 1931 | int __user *optlen) |
2167 | { | 1932 | { |
1933 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2168 | struct tcp_sock *tp = tcp_sk(sk); | 1934 | struct tcp_sock *tp = tcp_sk(sk); |
2169 | int val, len; | 1935 | int val, len; |
2170 | 1936 | ||
@@ -2202,7 +1968,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2202 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; | 1968 | val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; |
2203 | break; | 1969 | break; |
2204 | case TCP_SYNCNT: | 1970 | case TCP_SYNCNT: |
2205 | val = tp->syn_retries ? : sysctl_tcp_syn_retries; | 1971 | val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
2206 | break; | 1972 | break; |
2207 | case TCP_LINGER2: | 1973 | case TCP_LINGER2: |
2208 | val = tp->linger2; | 1974 | val = tp->linger2; |
@@ -2210,8 +1976,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2210 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; | 1976 | val = (val ? : sysctl_tcp_fin_timeout) / HZ; |
2211 | break; | 1977 | break; |
2212 | case TCP_DEFER_ACCEPT: | 1978 | case TCP_DEFER_ACCEPT: |
2213 | val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << | 1979 | val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : |
2214 | (tp->defer_accept - 1)); | 1980 | ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); |
2215 | break; | 1981 | break; |
2216 | case TCP_WINDOW_CLAMP: | 1982 | case TCP_WINDOW_CLAMP: |
2217 | val = tp->window_clamp; | 1983 | val = tp->window_clamp; |
@@ -2232,7 +1998,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2232 | return 0; | 1998 | return 0; |
2233 | } | 1999 | } |
2234 | case TCP_QUICKACK: | 2000 | case TCP_QUICKACK: |
2235 | val = !tp->ack.pingpong; | 2001 | val = !icsk->icsk_ack.pingpong; |
2236 | break; | 2002 | break; |
2237 | 2003 | ||
2238 | case TCP_CONGESTION: | 2004 | case TCP_CONGESTION: |
@@ -2241,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, | |||
2241 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); | 2007 | len = min_t(unsigned int, len, TCP_CA_NAME_MAX); |
2242 | if (put_user(len, optlen)) | 2008 | if (put_user(len, optlen)) |
2243 | return -EFAULT; | 2009 | return -EFAULT; |
2244 | if (copy_to_user(optval, tp->ca_ops->name, len)) | 2010 | if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) |
2245 | return -EFAULT; | 2011 | return -EFAULT; |
2246 | return 0; | 2012 | return 0; |
2247 | default: | 2013 | default: |
@@ -2278,79 +2044,72 @@ void __init tcp_init(void) | |||
2278 | __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), | 2044 | __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), |
2279 | sizeof(skb->cb)); | 2045 | sizeof(skb->cb)); |
2280 | 2046 | ||
2281 | tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", | 2047 | tcp_hashinfo.bind_bucket_cachep = |
2282 | sizeof(struct tcp_bind_bucket), | 2048 | kmem_cache_create("tcp_bind_bucket", |
2283 | 0, SLAB_HWCACHE_ALIGN, | 2049 | sizeof(struct inet_bind_bucket), 0, |
2284 | NULL, NULL); | 2050 | SLAB_HWCACHE_ALIGN, NULL, NULL); |
2285 | if (!tcp_bucket_cachep) | 2051 | if (!tcp_hashinfo.bind_bucket_cachep) |
2286 | panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); | 2052 | panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); |
2287 | 2053 | ||
2288 | tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", | ||
2289 | sizeof(struct tcp_tw_bucket), | ||
2290 | 0, SLAB_HWCACHE_ALIGN, | ||
2291 | NULL, NULL); | ||
2292 | if (!tcp_timewait_cachep) | ||
2293 | panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); | ||
2294 | |||
2295 | /* Size and allocate the main established and bind bucket | 2054 | /* Size and allocate the main established and bind bucket |
2296 | * hash tables. | 2055 | * hash tables. |
2297 | * | 2056 | * |
2298 | * The methodology is similar to that of the buffer cache. | 2057 | * The methodology is similar to that of the buffer cache. |
2299 | */ | 2058 | */ |
2300 | tcp_ehash = (struct tcp_ehash_bucket *) | 2059 | tcp_hashinfo.ehash = |
2301 | alloc_large_system_hash("TCP established", | 2060 | alloc_large_system_hash("TCP established", |
2302 | sizeof(struct tcp_ehash_bucket), | 2061 | sizeof(struct inet_ehash_bucket), |
2303 | thash_entries, | 2062 | thash_entries, |
2304 | (num_physpages >= 128 * 1024) ? | 2063 | (num_physpages >= 128 * 1024) ? |
2305 | (25 - PAGE_SHIFT) : | 2064 | (25 - PAGE_SHIFT) : |
2306 | (27 - PAGE_SHIFT), | 2065 | (27 - PAGE_SHIFT), |
2307 | HASH_HIGHMEM, | 2066 | HASH_HIGHMEM, |
2308 | &tcp_ehash_size, | 2067 | &tcp_hashinfo.ehash_size, |
2309 | NULL, | 2068 | NULL, |
2310 | 0); | 2069 | 0); |
2311 | tcp_ehash_size = (1 << tcp_ehash_size) >> 1; | 2070 | tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; |
2312 | for (i = 0; i < (tcp_ehash_size << 1); i++) { | 2071 | for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { |
2313 | rwlock_init(&tcp_ehash[i].lock); | 2072 | rwlock_init(&tcp_hashinfo.ehash[i].lock); |
2314 | INIT_HLIST_HEAD(&tcp_ehash[i].chain); | 2073 | INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); |
2315 | } | 2074 | } |
2316 | 2075 | ||
2317 | tcp_bhash = (struct tcp_bind_hashbucket *) | 2076 | tcp_hashinfo.bhash = |
2318 | alloc_large_system_hash("TCP bind", | 2077 | alloc_large_system_hash("TCP bind", |
2319 | sizeof(struct tcp_bind_hashbucket), | 2078 | sizeof(struct inet_bind_hashbucket), |
2320 | tcp_ehash_size, | 2079 | tcp_hashinfo.ehash_size, |
2321 | (num_physpages >= 128 * 1024) ? | 2080 | (num_physpages >= 128 * 1024) ? |
2322 | (25 - PAGE_SHIFT) : | 2081 | (25 - PAGE_SHIFT) : |
2323 | (27 - PAGE_SHIFT), | 2082 | (27 - PAGE_SHIFT), |
2324 | HASH_HIGHMEM, | 2083 | HASH_HIGHMEM, |
2325 | &tcp_bhash_size, | 2084 | &tcp_hashinfo.bhash_size, |
2326 | NULL, | 2085 | NULL, |
2327 | 64 * 1024); | 2086 | 64 * 1024); |
2328 | tcp_bhash_size = 1 << tcp_bhash_size; | 2087 | tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; |
2329 | for (i = 0; i < tcp_bhash_size; i++) { | 2088 | for (i = 0; i < tcp_hashinfo.bhash_size; i++) { |
2330 | spin_lock_init(&tcp_bhash[i].lock); | 2089 | spin_lock_init(&tcp_hashinfo.bhash[i].lock); |
2331 | INIT_HLIST_HEAD(&tcp_bhash[i].chain); | 2090 | INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); |
2332 | } | 2091 | } |
2333 | 2092 | ||
2334 | /* Try to be a bit smarter and adjust defaults depending | 2093 | /* Try to be a bit smarter and adjust defaults depending |
2335 | * on available memory. | 2094 | * on available memory. |
2336 | */ | 2095 | */ |
2337 | for (order = 0; ((1 << order) << PAGE_SHIFT) < | 2096 | for (order = 0; ((1 << order) << PAGE_SHIFT) < |
2338 | (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); | 2097 | (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); |
2339 | order++) | 2098 | order++) |
2340 | ; | 2099 | ; |
2341 | if (order >= 4) { | 2100 | if (order >= 4) { |
2342 | sysctl_local_port_range[0] = 32768; | 2101 | sysctl_local_port_range[0] = 32768; |
2343 | sysctl_local_port_range[1] = 61000; | 2102 | sysctl_local_port_range[1] = 61000; |
2344 | sysctl_tcp_max_tw_buckets = 180000; | 2103 | tcp_death_row.sysctl_max_tw_buckets = 180000; |
2345 | sysctl_tcp_max_orphans = 4096 << (order - 4); | 2104 | sysctl_tcp_max_orphans = 4096 << (order - 4); |
2346 | sysctl_max_syn_backlog = 1024; | 2105 | sysctl_max_syn_backlog = 1024; |
2347 | } else if (order < 3) { | 2106 | } else if (order < 3) { |
2348 | sysctl_local_port_range[0] = 1024 * (3 - order); | 2107 | sysctl_local_port_range[0] = 1024 * (3 - order); |
2349 | sysctl_tcp_max_tw_buckets >>= (3 - order); | 2108 | tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); |
2350 | sysctl_tcp_max_orphans >>= (3 - order); | 2109 | sysctl_tcp_max_orphans >>= (3 - order); |
2351 | sysctl_max_syn_backlog = 128; | 2110 | sysctl_max_syn_backlog = 128; |
2352 | } | 2111 | } |
2353 | tcp_port_rover = sysctl_local_port_range[0] - 1; | 2112 | tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; |
2354 | 2113 | ||
2355 | sysctl_tcp_mem[0] = 768 << order; | 2114 | sysctl_tcp_mem[0] = 768 << order; |
2356 | sysctl_tcp_mem[1] = 1024 << order; | 2115 | sysctl_tcp_mem[1] = 1024 << order; |
@@ -2365,14 +2124,12 @@ void __init tcp_init(void) | |||
2365 | 2124 | ||
2366 | printk(KERN_INFO "TCP: Hash tables configured " | 2125 | printk(KERN_INFO "TCP: Hash tables configured " |
2367 | "(established %d bind %d)\n", | 2126 | "(established %d bind %d)\n", |
2368 | tcp_ehash_size << 1, tcp_bhash_size); | 2127 | tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); |
2369 | 2128 | ||
2370 | tcp_register_congestion_control(&tcp_reno); | 2129 | tcp_register_congestion_control(&tcp_reno); |
2371 | } | 2130 | } |
2372 | 2131 | ||
2373 | EXPORT_SYMBOL(tcp_accept); | ||
2374 | EXPORT_SYMBOL(tcp_close); | 2132 | EXPORT_SYMBOL(tcp_close); |
2375 | EXPORT_SYMBOL(tcp_destroy_sock); | ||
2376 | EXPORT_SYMBOL(tcp_disconnect); | 2133 | EXPORT_SYMBOL(tcp_disconnect); |
2377 | EXPORT_SYMBOL(tcp_getsockopt); | 2134 | EXPORT_SYMBOL(tcp_getsockopt); |
2378 | EXPORT_SYMBOL(tcp_ioctl); | 2135 | EXPORT_SYMBOL(tcp_ioctl); |
@@ -2384,4 +2141,3 @@ EXPORT_SYMBOL(tcp_sendpage); | |||
2384 | EXPORT_SYMBOL(tcp_setsockopt); | 2141 | EXPORT_SYMBOL(tcp_setsockopt); |
2385 | EXPORT_SYMBOL(tcp_shutdown); | 2142 | EXPORT_SYMBOL(tcp_shutdown); |
2386 | EXPORT_SYMBOL(tcp_statistics); | 2143 | EXPORT_SYMBOL(tcp_statistics); |
2387 | EXPORT_SYMBOL(tcp_timewait_cachep); | ||