Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	450
1 file changed, 385 insertions(+), 65 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f1813bc71088..296150b2a62f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,8 @@
 #include <linux/cache.h>
 #include <linux/err.h>
 #include <linux/crypto.h>
+#include <linux/time.h>
+#include <linux/slab.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -428,7 +430,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (tp->urg_seq == tp->copied_seq &&
 	    !sock_flag(sk, SOCK_URGINLINE) &&
 	    tp->urg_data)
-		target--;
+		target++;
 
 	/* Potential race condition. If read of tp below will
 	 * escape above sk->sk_state, we can be illegally awaken
@@ -535,8 +537,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 	tp->nonagle &= ~TCP_NAGLE_PUSH;
 }
 
-static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
-				struct sk_buff *skb)
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 {
 	if (flags & MSG_OOB)
 		tp->snd_up = tp->write_seq;
@@ -545,13 +546,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
 			    int nonagle)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
 	if (tcp_send_head(sk)) {
-		struct sk_buff *skb = tcp_write_queue_tail(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+
 		if (!(flags & MSG_MORE) || forced_push(tp))
-			tcp_mark_push(tp, skb);
-		tcp_mark_urg(tp, flags, skb);
+			tcp_mark_push(tp, tcp_write_queue_tail(sk));
+
+		tcp_mark_urg(tp, flags);
 		__tcp_push_pending_frames(sk, mss_now,
 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 	}
@@ -876,12 +877,12 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
 
-static inline int select_size(struct sock *sk)
+static inline int select_size(struct sock *sk, int sg)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int tmp = tp->mss_cache;
 
-	if (sk->sk_route_caps & NETIF_F_SG) {
+	if (sg) {
 		if (sk_can_gso(sk))
 			tmp = 0;
 		else {
@@ -905,7 +906,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	struct sk_buff *skb;
 	int iovlen, flags;
 	int mss_now, size_goal;
-	int err, copied;
+	int sg, err, copied;
 	long timeo;
 
 	lock_sock(sk);
@@ -933,6 +934,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 		goto out_err;
 
+	sg = sk->sk_route_caps & NETIF_F_SG;
+
 	while (--iovlen >= 0) {
 		int seglen = iov->iov_len;
 		unsigned char __user *from = iov->iov_base;
@@ -958,8 +961,9 @@ new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
-			skb = sk_stream_alloc_skb(sk, select_size(sk),
-						  sk->sk_allocation);
+			skb = sk_stream_alloc_skb(sk,
+						  select_size(sk, sg),
+						  sk->sk_allocation);
 			if (!skb)
 				goto wait_for_memory;
 
@@ -996,9 +1000,7 @@ new_segment:
 				/* We can extend the last page
 				 * fragment. */
 				merge = 1;
-			} else if (i == MAX_SKB_FRAGS ||
-				   (!i &&
-				   !(sk->sk_route_caps & NETIF_F_SG))) {
+			} else if (i == MAX_SKB_FRAGS || !sg) {
 				/* Need to add new fragment and cannot
 				 * do this because interface is non-SG,
 				 * or because all the page slots are
@@ -1253,6 +1255,39 @@ static void tcp_prequeue_process(struct sock *sk)
 	tp->ucopy.memory = 0;
 }
 
+#ifdef CONFIG_NET_DMA
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+	dma_cookie_t done, used;
+	dma_cookie_t last_issued;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->ucopy.dma_chan)
+		return;
+
+	last_issued = tp->ucopy.dma_cookie;
+	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+	do {
+		if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+					      last_issued, &done,
+					      &used) == DMA_SUCCESS) {
+			/* Safe to free early-copied skbs now */
+			__skb_queue_purge(&sk->sk_async_wait_queue);
+			break;
+		} else {
+			struct sk_buff *skb;
+			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+			       (dma_async_is_complete(skb->dma_cookie, done,
+						      used) == DMA_SUCCESS)) {
+				__skb_dequeue(&sk->sk_async_wait_queue);
+				kfree_skb(skb);
+			}
+		}
+	} while (wait);
+}
+#endif
+
 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
@@ -1334,6 +1369,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
+		tp->copied_seq = seq;
 	}
 	tp->copied_seq = seq;
 
@@ -1545,6 +1581,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			/* __ Set realtime policy in scheduler __ */
 		}
 
+#ifdef CONFIG_NET_DMA
+		if (tp->ucopy.dma_chan)
+			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
@@ -1553,6 +1593,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			sk_wait_data(sk, &timeo);
 
 #ifdef CONFIG_NET_DMA
+		tcp_service_net_dma(sk, false);  /* Don't block */
 		tp->ucopy.wakeup = 0;
 #endif
 
@@ -1632,6 +1673,9 @@ do_prequeue:
 					copied = -EFAULT;
 					break;
 				}
+
+				dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
 				if ((offset + used) == skb->len)
 					copied_early = 1;
 
@@ -1701,27 +1745,9 @@ skip_copy:
 	}
 
 #ifdef CONFIG_NET_DMA
-	if (tp->ucopy.dma_chan) {
-		dma_cookie_t done, used;
-
-		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
-		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
-						 tp->ucopy.dma_cookie, &done,
-						 &used) == DMA_IN_PROGRESS) {
-			/* do partial cleanup of sk_async_wait_queue */
-			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
-			       (dma_async_is_complete(skb->dma_cookie, done,
-						      used) == DMA_SUCCESS)) {
-				__skb_dequeue(&sk->sk_async_wait_queue);
-				kfree_skb(skb);
-			}
-		}
+	tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
+	tp->ucopy.dma_chan = NULL;
 
-		/* Safe to free early-copied skbs now */
-		__skb_queue_purge(&sk->sk_async_wait_queue);
-		tp->ucopy.dma_chan = NULL;
-	}
 	if (tp->ucopy.pinned_list) {
 		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
 		tp->ucopy.pinned_list = NULL;
@@ -2042,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	__skb_queue_purge(&sk->sk_async_wait_queue);
 #endif
 
-	inet->dport = 0;
+	inet->inet_dport = 0;
 
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
@@ -2059,6 +2085,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_cnt = 0;
 	tp->bytes_acked = 0;
+	tp->window_clamp = 0;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
@@ -2066,7 +2093,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 	__sk_dst_reset(sk);
 
-	WARN_ON(inet->num && !icsk->icsk_bind_hash);
+	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
 	sk->sk_error_report(sk);
 	return err;
@@ -2083,8 +2110,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	int val;
 	int err = 0;
 
-	/* This is a string value all the others are int's */
-	if (optname == TCP_CONGESTION) {
+	/* These are data/string values, all the others are ints */
+	switch (optname) {
+	case TCP_CONGESTION: {
 		char name[TCP_CA_NAME_MAX];
 
 		if (optlen < 1)
@@ -2101,6 +2129,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		release_sock(sk);
 		return err;
 	}
+	case TCP_COOKIE_TRANSACTIONS: {
+		struct tcp_cookie_transactions ctd;
+		struct tcp_cookie_values *cvp = NULL;
+
+		if (sizeof(ctd) > optlen)
+			return -EINVAL;
+		if (copy_from_user(&ctd, optval, sizeof(ctd)))
+			return -EFAULT;
+
+		if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
+		    ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
+			return -EINVAL;
+
+		if (ctd.tcpct_cookie_desired == 0) {
+			/* default to global value */
+		} else if ((0x1 & ctd.tcpct_cookie_desired) ||
+			   ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
+			   ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
+			return -EINVAL;
+		}
+
+		if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
+			/* Supercedes all other values */
+			lock_sock(sk);
+			if (tp->cookie_values != NULL) {
+				kref_put(&tp->cookie_values->kref,
+					 tcp_cookie_values_release);
+				tp->cookie_values = NULL;
+			}
+			tp->rx_opt.cookie_in_always = 0; /* false */
+			tp->rx_opt.cookie_out_never = 1; /* true */
+			release_sock(sk);
+			return err;
+		}
+
+		/* Allocate ancillary memory before locking.
+		 */
+		if (ctd.tcpct_used > 0 ||
+		    (tp->cookie_values == NULL &&
+		     (sysctl_tcp_cookie_size > 0 ||
+		      ctd.tcpct_cookie_desired > 0 ||
+		      ctd.tcpct_s_data_desired > 0))) {
+			cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
+				      GFP_KERNEL);
+			if (cvp == NULL)
+				return -ENOMEM;
+		}
+		lock_sock(sk);
+		tp->rx_opt.cookie_in_always =
+			(TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
+		tp->rx_opt.cookie_out_never = 0; /* false */
+
+		if (tp->cookie_values != NULL) {
+			if (cvp != NULL) {
+				/* Changed values are recorded by a changed
+				 * pointer, ensuring the cookie will differ,
+				 * without separately hashing each value later.
+				 */
+				kref_put(&tp->cookie_values->kref,
+					 tcp_cookie_values_release);
+				kref_init(&cvp->kref);
+				tp->cookie_values = cvp;
+			} else {
+				cvp = tp->cookie_values;
+			}
+		}
+		if (cvp != NULL) {
+			cvp->cookie_desired = ctd.tcpct_cookie_desired;
+
+			if (ctd.tcpct_used > 0) {
+				memcpy(cvp->s_data_payload, ctd.tcpct_value,
+				       ctd.tcpct_used);
+				cvp->s_data_desired = ctd.tcpct_used;
+				cvp->s_data_constant = 1; /* true */
+			} else {
+				/* No constant payload data. */
+				cvp->s_data_desired = ctd.tcpct_s_data_desired;
+				cvp->s_data_constant = 0; /* false */
+			}
+		}
+		release_sock(sk);
+		return err;
+	}
+	default:
+		/* fallthru */
+		break;
+	};
 
 	if (optlen < sizeof(int))
 		return -EINVAL;
@@ -2139,6 +2254,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		}
 		break;
 
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_lto = val;
+		break;
+
+	case TCP_THIN_DUPACK:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_dupack = val;
+		break;
+
 	case TCP_CORK:
 		/* When set indicates to always queue non-full frames.
 		 * Later the user clears this option and we transmit
@@ -2425,6 +2554,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
 			return -EFAULT;
 		return 0;
+
+	case TCP_COOKIE_TRANSACTIONS: {
+		struct tcp_cookie_transactions ctd;
+		struct tcp_cookie_values *cvp = tp->cookie_values;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+		if (len < sizeof(ctd))
+			return -EINVAL;
+
+		memset(&ctd, 0, sizeof(ctd));
+		ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
+				   TCP_COOKIE_IN_ALWAYS : 0)
+				| (tp->rx_opt.cookie_out_never ?
+				   TCP_COOKIE_OUT_NEVER : 0);
+
+		if (cvp != NULL) {
+			ctd.tcpct_flags |= (cvp->s_data_in ?
+					    TCP_S_DATA_IN : 0)
+					 | (cvp->s_data_out ?
+					    TCP_S_DATA_OUT : 0);
+
+			ctd.tcpct_cookie_desired = cvp->cookie_desired;
+			ctd.tcpct_s_data_desired = cvp->s_data_desired;
+
+			memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
+			       cvp->cookie_pair_size);
+			ctd.tcpct_used = cvp->cookie_pair_size;
+		}
+
+		if (put_user(sizeof(ctd), optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &ctd, sizeof(ctd)))
+			return -EFAULT;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2662,10 +2827,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
 
 #ifdef CONFIG_TCP_MD5SIG
 static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool **tcp_md5sig_pool;
+static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
 
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
 {
 	int cpu;
 	for_each_possible_cpu(cpu) {
@@ -2674,7 +2839,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
 			if (p->md5_desc.tfm)
 				crypto_free_hash(p->md5_desc.tfm);
 			kfree(p);
-			p = NULL;
 		}
 	}
 	free_percpu(pool);
@@ -2682,7 +2846,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
 
 void tcp_free_md5sig_pool(void)
 {
-	struct tcp_md5sig_pool **pool = NULL;
+	struct tcp_md5sig_pool * __percpu *pool = NULL;
 
 	spin_lock_bh(&tcp_md5sig_pool_lock);
 	if (--tcp_md5sig_users == 0) {
@@ -2696,10 +2860,11 @@ void tcp_free_md5sig_pool(void)
 
 EXPORT_SYMBOL(tcp_free_md5sig_pool);
 
-static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk)
+static struct tcp_md5sig_pool * __percpu *
+__tcp_alloc_md5sig_pool(struct sock *sk)
 {
 	int cpu;
-	struct tcp_md5sig_pool **pool;
+	struct tcp_md5sig_pool * __percpu *pool;
 
 	pool = alloc_percpu(struct tcp_md5sig_pool *);
 	if (!pool)
@@ -2726,9 +2891,9 @@ out_free:
 	return NULL;
 }
 
-struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk)
+struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
 {
-	struct tcp_md5sig_pool **pool;
+	struct tcp_md5sig_pool * __percpu *pool;
 	int alloc = 0;
 
 retry:
@@ -2747,7 +2912,9 @@ retry:
 
 	if (alloc) {
 		/* we cannot hold spinlock here because this may sleep. */
-		struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk);
+		struct tcp_md5sig_pool * __percpu *p;
+
+		p = __tcp_alloc_md5sig_pool(sk);
 		spin_lock_bh(&tcp_md5sig_pool_lock);
 		if (!p) {
 			tcp_md5sig_users--;
@@ -2769,25 +2936,40 @@ retry:
 
 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
 
-struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
+
+/**
+ *	tcp_get_md5sig_pool - get md5sig_pool for this user
+ *
+ *	We use percpu structure, so if we succeed, we exit with preemption
+ *	and BH disabled, to make sure another thread or softirq handling
+ *	wont try to get same context.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 {
-	struct tcp_md5sig_pool **p;
-	spin_lock_bh(&tcp_md5sig_pool_lock);
+	struct tcp_md5sig_pool * __percpu *p;
+
+	local_bh_disable();
+
+	spin_lock(&tcp_md5sig_pool_lock);
 	p = tcp_md5sig_pool;
 	if (p)
 		tcp_md5sig_users++;
-	spin_unlock_bh(&tcp_md5sig_pool_lock);
-	return (p ? *per_cpu_ptr(p, cpu) : NULL);
-}
+	spin_unlock(&tcp_md5sig_pool_lock);
 
-EXPORT_SYMBOL(__tcp_get_md5sig_pool);
+	if (p)
+		return *per_cpu_ptr(p, smp_processor_id());
+
+	local_bh_enable();
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
 
-void __tcp_put_md5sig_pool(void)
+void tcp_put_md5sig_pool(void)
 {
+	local_bh_enable();
 	tcp_free_md5sig_pool();
 }
-
-EXPORT_SYMBOL(__tcp_put_md5sig_pool);
+EXPORT_SYMBOL(tcp_put_md5sig_pool);
 
 int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
 			struct tcphdr *th)
@@ -2847,6 +3029,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
 
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover.  Each secret value has 4 states:
+ *
+ * Generating.  (tcp_secret_generating != tcp_secret_primary)
+ *    Generates new Responder-Cookies, but not yet used for primary
+ *    verification.  This is a short-term state, typically lasting only
+ *    one round trip time (RTT).
+ *
+ * Primary.  (tcp_secret_generating == tcp_secret_primary)
+ *    Used both for generation and primary verification.
+ *
+ * Retiring.  (tcp_secret_retiring != tcp_secret_secondary)
+ *    Used for verification, until the first failure that can be
+ *    verified by the newer Generating secret.  At that time, this
+ *    cookie's state is changed to Secondary, and the Generating
+ *    cookie's state is changed to Primary.  This is a short-term state,
+ *    typically lasting only one round trip time (RTT).
+ *
+ * Secondary.  (tcp_secret_retiring == tcp_secret_secondary)
+ *    Used for secondary verification, after primary verification
+ *    failures.  This state lasts no more than twice the Maximum Segment
+ *    Lifetime (2MSL).  Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+	/* The secret is divided into two parts.  The digest part is the
+	 * equivalent of previously hashing a secret and saving the state,
+	 * and serves as an initialization vector (IV).  The message part
+	 * serves as the trailing secret.
+	 */
+	u32				secrets[COOKIE_WORKSPACE_WORDS];
+	unsigned long			expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Select a pseudo-random word in the cookie workspace.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+	return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+	unsigned long jiffy = jiffies;
+
+	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+		spin_lock_bh(&tcp_secret_locker);
+		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+			/* refreshed by another */
+			memcpy(bakery,
+			       &tcp_secret_generating->secrets[0],
+			       COOKIE_WORKSPACE_WORDS);
+		} else {
+			/* still needs refreshing */
+			get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
+
+			/* The first time, paranoia assumes that the
+			 * randomization function isn't as strong.  But,
+			 * this secret initialization is delayed until
+			 * the last possible moment (packet arrival).
+			 * Although that time is observable, it is
+			 * unpredictably variable.  Mash in the most
+			 * volatile clock bits available, and expire the
+			 * secret extra quickly.
+			 */
+			if (unlikely(tcp_secret_primary->expires ==
+				     tcp_secret_secondary->expires)) {
+				struct timespec tv;
+
+				getnstimeofday(&tv);
+				bakery[COOKIE_DIGEST_WORDS+0] ^=
+					(u32)tv.tv_nsec;
+
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_1MSL
+					+ (0x0f & tcp_cookie_work(bakery, 0));
+			} else {
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_LIFE
+					+ (0xff & tcp_cookie_work(bakery, 1));
+				tcp_secret_primary->expires = jiffy
+					+ TCP_SECRET_2MSL
+					+ (0x1f & tcp_cookie_work(bakery, 2));
+			}
+			memcpy(&tcp_secret_secondary->secrets[0],
+			       bakery, COOKIE_WORKSPACE_WORDS);
+
+			rcu_assign_pointer(tcp_secret_generating,
+					   tcp_secret_secondary);
+			rcu_assign_pointer(tcp_secret_retiring,
+					   tcp_secret_primary);
+			/*
+			 * Neither call_rcu() nor synchronize_rcu() needed.
+			 * Retiring data is not freed.  It is replaced after
+			 * further (locked) pointer updates, and a quiet time
+			 * (minimum 1MSL, maximum LIFE - 2MSL).
+			 */
+		}
+		spin_unlock_bh(&tcp_secret_locker);
+	} else {
+		rcu_read_lock_bh();
+		memcpy(bakery,
+		       &rcu_dereference(tcp_secret_generating)->secrets[0],
+		       COOKIE_WORKSPACE_WORDS);
+		rcu_read_unlock_bh();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
 void tcp_done(struct sock *sk)
 {
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -2881,6 +3192,7 @@ void __init tcp_init(void)
 	struct sk_buff *skb = NULL;
 	unsigned long nr_pages, limit;
 	int order, i, max_share;
+	unsigned long jiffy = jiffies;
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
@@ -2903,11 +3215,10 @@ void __init tcp_init(void)
 					(totalram_pages >= 128 * 1024) ?
 					13 : 15,
 					0,
-					&tcp_hashinfo.ehash_size,
 					NULL,
+					&tcp_hashinfo.ehash_mask,
 					thash_entries ? 0 : 512 * 1024);
-	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
-	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
 		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
 		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
 	}
@@ -2916,7 +3227,7 @@ void __init tcp_init(void)
 	tcp_hashinfo.bhash =
 		alloc_large_system_hash("TCP bind",
 					sizeof(struct inet_bind_hashbucket),
-					tcp_hashinfo.ehash_size,
+					tcp_hashinfo.ehash_mask + 1,
 					(totalram_pages >= 128 * 1024) ?
 					13 : 15,
 					0,
@@ -2971,10 +3282,19 @@ void __init tcp_init(void)
 	sysctl_tcp_rmem[2] = max(87380, max_share);
 
 	printk(KERN_INFO "TCP: Hash tables configured "
-	       "(established %d bind %d)\n",
-	       tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
+	       "(established %u bind %u)\n",
+	       tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
 	tcp_register_congestion_control(&tcp_reno);
+
+	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+	tcp_secret_one.expires = jiffy; /* past due */
+	tcp_secret_two.expires = jiffy; /* past due */
+	tcp_secret_generating = &tcp_secret_one;
+	tcp_secret_primary = &tcp_secret_one;
+	tcp_secret_retiring = &tcp_secret_two;
+	tcp_secret_secondary = &tcp_secret_two;
 }
 
 EXPORT_SYMBOL(tcp_close);