Diffstat (limited to 'net/ipv4/tcp.c')
 -rw-r--r--   net/ipv4/tcp.c | 450
 1 file changed, 385 insertions(+), 65 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f1813bc71088..296150b2a62f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,8 @@
 #include <linux/cache.h>
 #include <linux/err.h>
 #include <linux/crypto.h>
+#include <linux/time.h>
+#include <linux/slab.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -428,7 +430,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
         if (tp->urg_seq == tp->copied_seq &&
             !sock_flag(sk, SOCK_URGINLINE) &&
             tp->urg_data)
-                target--;
+                target++;
 
         /* Potential race condition. If read of tp below will
          * escape above sk->sk_state, we can be illegally awaken
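The sign flip above is the substantive fix: when SOCK_URGINLINE is not set, the byte at the urgent pointer is skipped by ordinary reads, so a reader waiting for `target` bytes needs one extra byte in the receive queue. With SO_RCVLOWAT at 4 and the urgent byte sitting at copied_seq, for instance, POLLIN should not be signalled until 5 bytes have arrived; decrementing the target, as the old code did, woke the reader one byte too early.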
@@ -535,8 +537,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
         tp->nonagle &= ~TCP_NAGLE_PUSH;
 }
 
-static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
-                                struct sk_buff *skb)
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 {
         if (flags & MSG_OOB)
                 tp->snd_up = tp->write_seq;
@@ -545,13 +546,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
                             int nonagle)
 {
-        struct tcp_sock *tp = tcp_sk(sk);
-
         if (tcp_send_head(sk)) {
-                struct sk_buff *skb = tcp_write_queue_tail(sk);
+                struct tcp_sock *tp = tcp_sk(sk);
+
                 if (!(flags & MSG_MORE) || forced_push(tp))
-                        tcp_mark_push(tp, skb);
-                tcp_mark_urg(tp, flags, skb);
+                        tcp_mark_push(tp, tcp_write_queue_tail(sk));
+
+                tcp_mark_urg(tp, flags);
                 __tcp_push_pending_frames(sk, mss_now,
                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
         }
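For context, the urgent-pointer path above is driven from userspace by MSG_OOB. A minimal sketch of the trigger (hypothetical application code, not part of the patch):

        /* Sending one byte of urgent data: tcp_mark_urg() advances
         * tp->snd_up to write_seq, so the urgent pointer covers it. */
        char oob = '!';
        if (send(fd, &oob, 1, MSG_OOB) < 0)
                perror("send(MSG_OOB)");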
@@ -876,12 +877,12 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 #define TCP_PAGE(sk)        (sk->sk_sndmsg_page)
 #define TCP_OFF(sk)        (sk->sk_sndmsg_off)
 
-static inline int select_size(struct sock *sk)
+static inline int select_size(struct sock *sk, int sg)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         int tmp = tp->mss_cache;
 
-        if (sk->sk_route_caps & NETIF_F_SG) {
+        if (sg) {
                 if (sk_can_gso(sk))
                         tmp = 0;
                 else {
@@ -905,7 +906,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
         struct sk_buff *skb;
         int iovlen, flags;
         int mss_now, size_goal;
-        int err, copied;
+        int sg, err, copied;
         long timeo;
 
         lock_sock(sk);
@@ -933,6 +934,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                 goto out_err;
 
+        sg = sk->sk_route_caps & NETIF_F_SG;
+
         while (--iovlen >= 0) {
                 int seglen = iov->iov_len;
                 unsigned char __user *from = iov->iov_base;
@@ -958,8 +961,9 @@ new_segment:
                         if (!sk_stream_memory_free(sk))
                                 goto wait_for_sndbuf;
 
-                        skb = sk_stream_alloc_skb(sk, select_size(sk),
-                                        sk->sk_allocation);
+                        skb = sk_stream_alloc_skb(sk,
+                                                  select_size(sk, sg),
+                                                  sk->sk_allocation);
                         if (!skb)
                                 goto wait_for_memory;
 
@@ -996,9 +1000,7 @@ new_segment:
                                         /* We can extend the last page
                                          * fragment. */
                                         merge = 1;
-                                } else if (i == MAX_SKB_FRAGS ||
-                                           (!i &&
-                                           !(sk->sk_route_caps & NETIF_F_SG))) {
+                                } else if (i == MAX_SKB_FRAGS || !sg) {
                                         /* Need to add new fragment and cannot
                                          * do this because interface is non-SG,
                                          * or because all the page slots are
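Together with the two hunks above, tcp_sendmsg() now samples NETIF_F_SG once into the local `sg` before the iovec loop instead of re-reading sk->sk_route_caps in select_size() and again on the per-fragment path. Note the fragment test also loses the old `!i &&` qualifier: a non-SG route now forces a new segment regardless of the fragment index, not only when the skb holds no fragments yet.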
@@ -1253,6 +1255,39 @@ static void tcp_prequeue_process(struct sock *sk)
         tp->ucopy.memory = 0;
 }
 
+#ifdef CONFIG_NET_DMA
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+        dma_cookie_t done, used;
+        dma_cookie_t last_issued;
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        if (!tp->ucopy.dma_chan)
+                return;
+
+        last_issued = tp->ucopy.dma_cookie;
+        dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+        do {
+                if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+                                              last_issued, &done,
+                                              &used) == DMA_SUCCESS) {
+                        /* Safe to free early-copied skbs now */
+                        __skb_queue_purge(&sk->sk_async_wait_queue);
+                        break;
+                } else {
+                        struct sk_buff *skb;
+                        while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+                               (dma_async_is_complete(skb->dma_cookie, done,
+                                                      used) == DMA_SUCCESS)) {
+                                __skb_dequeue(&sk->sk_async_wait_queue);
+                                kfree_skb(skb);
+                        }
+                }
+        } while (wait);
+}
+#endif
+
 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
         struct sk_buff *skb;
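tcp_service_net_dma() consolidates drain logic that tcp_recvmsg() previously open-coded. With wait == false it only reaps copies the engine has already completed; with wait == true it loops until dma_async_memcpy_complete() reports DMA_SUCCESS for the last issued cookie, at which point every early-copied skb can be purged. The two call sites added further down in this patch use it as:

        tcp_service_net_dma(sk, false);        /* opportunistic reap, don't block */
        tcp_service_net_dma(sk, true);         /* drain fully before returning */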
@@ -1334,6 +1369,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                 sk_eat_skb(sk, skb, 0);
                 if (!desc->count)
                         break;
+                tp->copied_seq = seq;
         }
         tp->copied_seq = seq;
 
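Advancing tp->copied_seq inside the loop keeps the socket's read state consistent after every consumed skb, rather than only when tcp_read_sock() returns, so anything that inspects the socket between iterations sees an up-to-date sequence. Callers reach this path via splice(), for example (hypothetical userspace sketch):

        /* splice() from a TCP socket exercises tcp_read_sock(). */
        ssize_t n = splice(sockfd, NULL, pipefd[1], NULL,
                           65536, SPLICE_F_MOVE);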
@@ -1545,6 +1581,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                         /* __ Set realtime policy in scheduler __ */
                 }
 
+#ifdef CONFIG_NET_DMA
+                if (tp->ucopy.dma_chan)
+                        dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
                 if (copied >= target) {
                         /* Do not sleep, just process backlog. */
                         release_sock(sk);
@@ -1553,6 +1593,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                         sk_wait_data(sk, &timeo);
 
 #ifdef CONFIG_NET_DMA
+                tcp_service_net_dma(sk, false);  /* Don't block */
                 tp->ucopy.wakeup = 0;
 #endif
 
@@ -1632,6 +1673,9 @@ do_prequeue:
                                         copied = -EFAULT;
                                         break;
                                 }
+
+                                dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
                                 if ((offset + used) == skb->len)
                                         copied_early = 1;
 
@@ -1701,27 +1745,9 @@ skip_copy:
         }
 
 #ifdef CONFIG_NET_DMA
-        if (tp->ucopy.dma_chan) {
-                dma_cookie_t done, used;
-
-                dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
-                while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
-                                                 tp->ucopy.dma_cookie, &done,
-                                                 &used) == DMA_IN_PROGRESS) {
-                        /* do partial cleanup of sk_async_wait_queue */
-                        while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
-                               (dma_async_is_complete(skb->dma_cookie, done,
-                                                      used) == DMA_SUCCESS)) {
-                                __skb_dequeue(&sk->sk_async_wait_queue);
-                                kfree_skb(skb);
-                        }
-                }
+        tcp_service_net_dma(sk, true);        /* Wait for queue to drain */
+        tp->ucopy.dma_chan = NULL;
 
-                /* Safe to free early-copied skbs now */
-                __skb_queue_purge(&sk->sk_async_wait_queue);
-                tp->ucopy.dma_chan = NULL;
-        }
         if (tp->ucopy.pinned_list) {
                 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
                 tp->ucopy.pinned_list = NULL;
@@ -2042,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags)
         __skb_queue_purge(&sk->sk_async_wait_queue);
 #endif
 
-        inet->dport = 0;
+        inet->inet_dport = 0;
 
         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
                 inet_reset_saddr(sk);
@@ -2059,6 +2085,7 @@ int tcp_disconnect(struct sock *sk, int flags)
         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
         tp->snd_cwnd_cnt = 0;
         tp->bytes_acked = 0;
+        tp->window_clamp = 0;
         tcp_set_ca_state(sk, TCP_CA_Open);
         tcp_clear_retrans(tp);
         inet_csk_delack_init(sk);
@@ -2066,7 +2093,7 @@ int tcp_disconnect(struct sock *sk, int flags)
         memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
         __sk_dst_reset(sk);
 
-        WARN_ON(inet->num && !icsk->icsk_bind_hash);
+        WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
         sk->sk_error_report(sk);
         return err;
@@ -2083,8 +2110,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
         int val;
         int err = 0;
 
-        /* This is a string value all the others are int's */
-        if (optname == TCP_CONGESTION) {
+        /* These are data/string values, all the others are ints */
+        switch (optname) {
+        case TCP_CONGESTION: {
                 char name[TCP_CA_NAME_MAX];
 
                 if (optlen < 1)
@@ -2101,6 +2129,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                 release_sock(sk);
                 return err;
         }
+        case TCP_COOKIE_TRANSACTIONS: {
+                struct tcp_cookie_transactions ctd;
+                struct tcp_cookie_values *cvp = NULL;
+
+                if (sizeof(ctd) > optlen)
+                        return -EINVAL;
+                if (copy_from_user(&ctd, optval, sizeof(ctd)))
+                        return -EFAULT;
+
+                if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
+                    ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
+                        return -EINVAL;
+
+                if (ctd.tcpct_cookie_desired == 0) {
+                        /* default to global value */
+                } else if ((0x1 & ctd.tcpct_cookie_desired) ||
+                           ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
+                           ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
+                        return -EINVAL;
+                }
+
+                if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
+                        /* Supersedes all other values */
+                        lock_sock(sk);
+                        if (tp->cookie_values != NULL) {
+                                kref_put(&tp->cookie_values->kref,
+                                         tcp_cookie_values_release);
+                                tp->cookie_values = NULL;
+                        }
+                        tp->rx_opt.cookie_in_always = 0; /* false */
+                        tp->rx_opt.cookie_out_never = 1; /* true */
+                        release_sock(sk);
+                        return err;
+                }
+
+                /* Allocate ancillary memory before locking.
+                 */
+                if (ctd.tcpct_used > 0 ||
+                    (tp->cookie_values == NULL &&
+                     (sysctl_tcp_cookie_size > 0 ||
+                      ctd.tcpct_cookie_desired > 0 ||
+                      ctd.tcpct_s_data_desired > 0))) {
+                        cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
+                                      GFP_KERNEL);
+                        if (cvp == NULL)
+                                return -ENOMEM;
+                }
+                lock_sock(sk);
+                tp->rx_opt.cookie_in_always =
+                        (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
+                tp->rx_opt.cookie_out_never = 0; /* false */
+
+                if (tp->cookie_values != NULL) {
+                        if (cvp != NULL) {
+                                /* Changed values are recorded by a changed
+                                 * pointer, ensuring the cookie will differ,
+                                 * without separately hashing each value later.
+                                 */
+                                kref_put(&tp->cookie_values->kref,
+                                         tcp_cookie_values_release);
+                                kref_init(&cvp->kref);
+                                tp->cookie_values = cvp;
+                        } else {
+                                cvp = tp->cookie_values;
+                        }
+                }
+                if (cvp != NULL) {
+                        cvp->cookie_desired = ctd.tcpct_cookie_desired;
+
+                        if (ctd.tcpct_used > 0) {
+                                memcpy(cvp->s_data_payload, ctd.tcpct_value,
+                                       ctd.tcpct_used);
+                                cvp->s_data_desired = ctd.tcpct_used;
+                                cvp->s_data_constant = 1; /* true */
+                        } else {
+                                /* No constant payload data. */
+                                cvp->s_data_desired = ctd.tcpct_s_data_desired;
+                                cvp->s_data_constant = 0; /* false */
+                        }
+                }
+                release_sock(sk);
+                return err;
+        }
+        default:
+                /* fallthru */
+                break;
+        };
 
         if (optlen < sizeof(int))
                 return -EINVAL;
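From userspace the new option takes the tcp_cookie_transactions structure wholesale. A hedged sketch, assuming the TCPCT constants used above (TCP_COOKIE_IN_ALWAYS, TCP_COOKIE_MIN/MAX) are exported through the uapi headers; per the validation code, tcpct_cookie_desired must be 0 or an even value in [TCP_COOKIE_MIN, TCP_COOKIE_MAX]:

        struct tcp_cookie_transactions ctd;

        memset(&ctd, 0, sizeof(ctd));
        ctd.tcpct_flags = TCP_COOKIE_IN_ALWAYS;
        ctd.tcpct_cookie_desired = 8;        /* even, within [MIN, MAX] */
        if (setsockopt(fd, IPPROTO_TCP, TCP_COOKIE_TRANSACTIONS,
                       &ctd, sizeof(ctd)) < 0)
                perror("setsockopt(TCP_COOKIE_TRANSACTIONS)");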
@@ -2139,6 +2254,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                 }
                 break;
 
+        case TCP_THIN_LINEAR_TIMEOUTS:
+                if (val < 0 || val > 1)
+                        err = -EINVAL;
+                else
+                        tp->thin_lto = val;
+                break;
+
+        case TCP_THIN_DUPACK:
+                if (val < 0 || val > 1)
+                        err = -EINVAL;
+                else
+                        tp->thin_dupack = val;
+                break;
+
         case TCP_CORK:
                 /* When set indicates to always queue non-full frames.
                  * Later the user clears this option and we transmit
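Both thin-stream knobs are booleans, rejected outside 0/1. Enabling them from an application is a pair of ordinary setsockopt() calls (assumed userspace sketch):

        int one = 1;

        setsockopt(fd, IPPROTO_TCP, TCP_THIN_LINEAR_TIMEOUTS,
                   &one, sizeof(one));
        setsockopt(fd, IPPROTO_TCP, TCP_THIN_DUPACK,
                   &one, sizeof(one));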
@@ -2425,6 +2554,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
                         return -EFAULT;
                 return 0;
+
+        case TCP_COOKIE_TRANSACTIONS: {
+                struct tcp_cookie_transactions ctd;
+                struct tcp_cookie_values *cvp = tp->cookie_values;
+
+                if (get_user(len, optlen))
+                        return -EFAULT;
+                if (len < sizeof(ctd))
+                        return -EINVAL;
+
+                memset(&ctd, 0, sizeof(ctd));
+                ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
+                                   TCP_COOKIE_IN_ALWAYS : 0)
+                                | (tp->rx_opt.cookie_out_never ?
+                                   TCP_COOKIE_OUT_NEVER : 0);
+
+                if (cvp != NULL) {
+                        ctd.tcpct_flags |= (cvp->s_data_in ?
+                                            TCP_S_DATA_IN : 0)
+                                         | (cvp->s_data_out ?
+                                            TCP_S_DATA_OUT : 0);
+
+                        ctd.tcpct_cookie_desired = cvp->cookie_desired;
+                        ctd.tcpct_s_data_desired = cvp->s_data_desired;
+
+                        memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
+                               cvp->cookie_pair_size);
+                        ctd.tcpct_used = cvp->cookie_pair_size;
+                }
+
+                if (put_user(sizeof(ctd), optlen))
+                        return -EFAULT;
+                if (copy_to_user(optval, &ctd, sizeof(ctd)))
+                        return -EFAULT;
+                return 0;
+        }
         default:
                 return -ENOPROTOOPT;
         }
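The read side mirrors the setsockopt() case and reports the currently held cookie pair; a hedged userspace sketch:

        struct tcp_cookie_transactions ctd;
        socklen_t len = sizeof(ctd);

        if (getsockopt(fd, IPPROTO_TCP, TCP_COOKIE_TRANSACTIONS,
                       &ctd, &len) == 0)
                printf("flags 0x%x, %u cookie bytes\n",
                       ctd.tcpct_flags, ctd.tcpct_used);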
@@ -2662,10 +2827,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
 
 #ifdef CONFIG_TCP_MD5SIG
 static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool **tcp_md5sig_pool;
+static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
 
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
 {
         int cpu;
         for_each_possible_cpu(cpu) {
@@ -2674,7 +2839,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
                         if (p->md5_desc.tfm)
                                 crypto_free_hash(p->md5_desc.tfm);
                         kfree(p);
-                        p = NULL;
                 }
         }
         free_percpu(pool);
@@ -2682,7 +2846,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
 
 void tcp_free_md5sig_pool(void)
 {
-        struct tcp_md5sig_pool **pool = NULL;
+        struct tcp_md5sig_pool * __percpu *pool = NULL;
 
         spin_lock_bh(&tcp_md5sig_pool_lock);
         if (--tcp_md5sig_users == 0) {
@@ -2696,10 +2860,11 @@ void tcp_free_md5sig_pool(void)
 
 EXPORT_SYMBOL(tcp_free_md5sig_pool);
 
-static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk)
+static struct tcp_md5sig_pool * __percpu *
+__tcp_alloc_md5sig_pool(struct sock *sk)
 {
         int cpu;
-        struct tcp_md5sig_pool **pool;
+        struct tcp_md5sig_pool * __percpu *pool;
 
         pool = alloc_percpu(struct tcp_md5sig_pool *);
         if (!pool)
@@ -2726,9 +2891,9 @@ out_free:
         return NULL;
 }
 
-struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk)
+struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
 {
-        struct tcp_md5sig_pool **pool;
+        struct tcp_md5sig_pool * __percpu *pool;
         int alloc = 0;
 
 retry:
@@ -2747,7 +2912,9 @@ retry:
 
         if (alloc) {
                 /* we cannot hold spinlock here because this may sleep. */
-                struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk);
+                struct tcp_md5sig_pool * __percpu *p;
+
+                p = __tcp_alloc_md5sig_pool(sk);
                 spin_lock_bh(&tcp_md5sig_pool_lock);
                 if (!p) {
                         tcp_md5sig_users--;
@@ -2769,25 +2936,40 @@ retry:
 
 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
 
-struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
+
+/**
+ * tcp_get_md5sig_pool - get md5sig_pool for this user
+ *
+ * We use percpu structure, so if we succeed, we exit with preemption
+ * and BH disabled, to make sure another thread or softirq handling
+ * won't try to get the same context.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 {
-        struct tcp_md5sig_pool **p;
-        spin_lock_bh(&tcp_md5sig_pool_lock);
+        struct tcp_md5sig_pool * __percpu *p;
+
+        local_bh_disable();
+
+        spin_lock(&tcp_md5sig_pool_lock);
         p = tcp_md5sig_pool;
         if (p)
                 tcp_md5sig_users++;
-        spin_unlock_bh(&tcp_md5sig_pool_lock);
-        return (p ? *per_cpu_ptr(p, cpu) : NULL);
-}
+        spin_unlock(&tcp_md5sig_pool_lock);
 
-EXPORT_SYMBOL(__tcp_get_md5sig_pool);
+        if (p)
+                return *per_cpu_ptr(p, smp_processor_id());
+
+        local_bh_enable();
+        return NULL;
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
 
-void __tcp_put_md5sig_pool(void)
+void tcp_put_md5sig_pool(void)
 {
+        local_bh_enable();
         tcp_free_md5sig_pool();
 }
-
-EXPORT_SYMBOL(__tcp_put_md5sig_pool);
+EXPORT_SYMBOL(tcp_put_md5sig_pool);
 
 int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
                         struct tcphdr *th)
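The rename also changes the calling convention: tcp_get_md5sig_pool() now selects the current CPU itself and returns with bottom halves disabled, so callers drop the explicit CPU argument and must keep the critical section short. Assumed caller pattern:

        struct tcp_md5sig_pool *hp;

        hp = tcp_get_md5sig_pool();        /* returns with BH disabled */
        if (!hp)
                return 1;                  /* hypothetical error path */
        /* feed the segment to hp->md5_desc here */
        tcp_put_md5sig_pool();             /* re-enables BH */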
@@ -2847,6 +3029,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
 
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover.  Each secret value has 4 states:
+ *
+ * Generating.  (tcp_secret_generating != tcp_secret_primary)
+ *    Generates new Responder-Cookies, but not yet used for primary
+ *    verification.  This is a short-term state, typically lasting only
+ *    one round trip time (RTT).
+ *
+ * Primary.  (tcp_secret_generating == tcp_secret_primary)
+ *    Used both for generation and primary verification.
+ *
+ * Retiring.  (tcp_secret_retiring != tcp_secret_secondary)
+ *    Used for verification, until the first failure that can be
+ *    verified by the newer Generating secret.  At that time, this
+ *    cookie's state is changed to Secondary, and the Generating
+ *    cookie's state is changed to Primary.  This is a short-term state,
+ *    typically lasting only one round trip time (RTT).
+ *
+ * Secondary.  (tcp_secret_retiring == tcp_secret_secondary)
+ *    Used for secondary verification, after primary verification
+ *    failures.  This state lasts no more than twice the Maximum Segment
+ *    Lifetime (2MSL).  Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+        /* The secret is divided into two parts.  The digest part is the
+         * equivalent of previously hashing a secret and saving the state,
+         * and serves as an initialization vector (IV).  The message part
+         * serves as the trailing secret.
+         */
+        u32                secrets[COOKIE_WORKSPACE_WORDS];
+        unsigned long        expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Select a pseudo-random word in the cookie workspace.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+        return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+        unsigned long jiffy = jiffies;
+
+        if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+                spin_lock_bh(&tcp_secret_locker);
+                if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+                        /* refreshed by another */
+                        memcpy(bakery,
+                               &tcp_secret_generating->secrets[0],
+                               COOKIE_WORKSPACE_WORDS);
+                } else {
+                        /* still needs refreshing */
+                        get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
+
+                        /* The first time, paranoia assumes that the
+                         * randomization function isn't as strong.  But,
+                         * this secret initialization is delayed until
+                         * the last possible moment (packet arrival).
+                         * Although that time is observable, it is
+                         * unpredictably variable.  Mash in the most
+                         * volatile clock bits available, and expire the
+                         * secret extra quickly.
+                         */
+                        if (unlikely(tcp_secret_primary->expires ==
+                                     tcp_secret_secondary->expires)) {
+                                struct timespec tv;
+
+                                getnstimeofday(&tv);
+                                bakery[COOKIE_DIGEST_WORDS+0] ^=
+                                        (u32)tv.tv_nsec;
+
+                                tcp_secret_secondary->expires = jiffy
+                                        + TCP_SECRET_1MSL
+                                        + (0x0f & tcp_cookie_work(bakery, 0));
+                        } else {
+                                tcp_secret_secondary->expires = jiffy
+                                        + TCP_SECRET_LIFE
+                                        + (0xff & tcp_cookie_work(bakery, 1));
+                                tcp_secret_primary->expires = jiffy
+                                        + TCP_SECRET_2MSL
+                                        + (0x1f & tcp_cookie_work(bakery, 2));
+                        }
+                        memcpy(&tcp_secret_secondary->secrets[0],
+                               bakery, COOKIE_WORKSPACE_WORDS);
+
+                        rcu_assign_pointer(tcp_secret_generating,
+                                           tcp_secret_secondary);
+                        rcu_assign_pointer(tcp_secret_retiring,
+                                           tcp_secret_primary);
+                        /*
+                         * Neither call_rcu() nor synchronize_rcu() needed.
+                         * Retiring data is not freed.  It is replaced after
+                         * further (locked) pointer updates, and a quiet time
+                         * (minimum 1MSL, maximum LIFE - 2MSL).
+                         */
+                }
+                spin_unlock_bh(&tcp_secret_locker);
+        } else {
+                rcu_read_lock_bh();
+                memcpy(bakery,
+                       &rcu_dereference(tcp_secret_generating)->secrets[0],
+                       COOKIE_WORKSPACE_WORDS);
+                rcu_read_unlock_bh();
+        }
+        return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
 void tcp_done(struct sock *sk)
 {
         if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
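Stripped of the locking, expiry arithmetic and RCU publication, the rollover performed above reduces to a swap between the two statically allocated secrets (simplified model for reading the diff, not the kernel code):

        /* The freshly seeded secondary starts generating cookies while
         * the old primary is kept for verification only, until it ages
         * out after at most 2MSL. */
        tcp_secret_generating = tcp_secret_secondary;
        tcp_secret_retiring   = tcp_secret_primary;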
@@ -2881,6 +3192,7 @@ void __init tcp_init(void)
         struct sk_buff *skb = NULL;
         unsigned long nr_pages, limit;
         int order, i, max_share;
+        unsigned long jiffy = jiffies;
 
         BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
@@ -2903,11 +3215,10 @@ void __init tcp_init(void)
                                         (totalram_pages >= 128 * 1024) ?
                                         13 : 15,
                                         0,
-                                        &tcp_hashinfo.ehash_size,
                                         NULL,
+                                        &tcp_hashinfo.ehash_mask,
                                         thash_entries ? 0 : 512 * 1024);
-        tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
-        for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+        for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
                 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
                 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
         }
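alloc_large_system_hash() now reports the mask (entries - 1) rather than a size exponent, which relies on the table being a power of two: a bucket is selected with a single AND, and full-table walks run while i <= ehash_mask. For illustration, with an assumed hash value:

        /* e.g. 65536 buckets => ehash_mask == 65535 */
        unsigned int bucket = hash & tcp_hashinfo.ehash_mask;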
@@ -2916,7 +3227,7 @@ void __init tcp_init(void)
         tcp_hashinfo.bhash =
                 alloc_large_system_hash("TCP bind",
                                         sizeof(struct inet_bind_hashbucket),
-                                        tcp_hashinfo.ehash_size,
+                                        tcp_hashinfo.ehash_mask + 1,
                                         (totalram_pages >= 128 * 1024) ?
                                         13 : 15,
                                         0,
@@ -2971,10 +3282,19 @@ void __init tcp_init(void)
         sysctl_tcp_rmem[2] = max(87380, max_share);
 
         printk(KERN_INFO "TCP: Hash tables configured "
-               "(established %d bind %d)\n",
-               tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
+               "(established %u bind %u)\n",
+               tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
         tcp_register_congestion_control(&tcp_reno);
+
+        memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+        memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+        tcp_secret_one.expires = jiffy; /* past due */
+        tcp_secret_two.expires = jiffy; /* past due */
+        tcp_secret_generating = &tcp_secret_one;
+        tcp_secret_primary = &tcp_secret_one;
+        tcp_secret_retiring = &tcp_secret_two;
+        tcp_secret_secondary = &tcp_secret_two;
 }
 
 EXPORT_SYMBOL(tcp_close);