path: root/net/ipv4/tcp.c
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	305
1 file changed, 180 insertions(+), 125 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b0a26bb25e2e..f115ea68a4ef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -265,6 +265,7 @@
 #include <linux/err.h>
 #include <linux/crypto.h>
 #include <linux/time.h>
+#include <linux/slab.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -314,7 +315,6 @@ struct tcp_splice_state {
  * is strict, actions are advisory and have some latency.
  */
 int tcp_memory_pressure __read_mostly;
-
 EXPORT_SYMBOL(tcp_memory_pressure);
 
 void tcp_enter_memory_pressure(struct sock *sk)
@@ -324,7 +324,6 @@ void tcp_enter_memory_pressure(struct sock *sk)
 		tcp_memory_pressure = 1;
 	}
 }
-
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 
 /* Convert seconds to retransmits based on initial and max timeout */
@@ -377,7 +376,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct sock *sk = sock->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	if (sk->sk_state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -387,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	 */
 
 	mask = 0;
-	if (sk->sk_err)
-		mask = POLLERR;
 
 	/*
 	 * POLLHUP is certainly not done right. But poll() doesn't
@@ -429,7 +426,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		if (tp->urg_seq == tp->copied_seq &&
 		    !sock_flag(sk, SOCK_URGINLINE) &&
 		    tp->urg_data)
-			target--;
+			target++;
 
 		/* Potential race condition. If read of tp below will
 		 * escape above sk->sk_state, we can be illegally awaken
@@ -452,13 +449,20 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 					mask |= POLLOUT | POLLWRNORM;
 			}
-		}
+		} else
+			mask |= POLLOUT | POLLWRNORM;
 
 		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
 	}
+	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	smp_rmb();
+	if (sk->sk_err)
+		mask |= POLLERR;
+
 	return mask;
 }
+EXPORT_SYMBOL(tcp_poll);
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
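The reordering above is the heart of this hunk: the sk_err test moves from the top of tcp_poll() to after all the other readiness checks, behind a read barrier. The comment names the pairing write side in tcp_reset(), which is not part of this diff; a hedged sketch of the assumed pattern:

	/* Writer side (tcp_reset()-like path; assumed, not in this diff): */
	sk->sk_err = ECONNRESET;	/* publish the error ...         */
	smp_wmb();			/* ... before waking any poller  */
	sk->sk_error_report(sk);

	/* Reader side (tcp_poll(), as added above): */
	/* ... compute mask from sk_state and the queues ... */
	smp_rmb();			/* keep the sk_err read from floating
					 * above the state reads */
	if (sk->sk_err)
		mask |= POLLERR;

Without the barriers, a poller could observe the wakeup yet still read a stale zero sk_err and miss reporting POLLERR.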
@@ -507,10 +511,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 	return put_user(answ, (int __user *)arg);
 }
+EXPORT_SYMBOL(tcp_ioctl);
 
 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+	TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
 	tp->pushed_seq = tp->write_seq;
 }
 
@@ -526,7 +531,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 
 	skb->csum = 0;
 	tcb->seq = tcb->end_seq = tp->write_seq;
-	tcb->flags = TCPCB_FLAG_ACK;
+	tcb->flags = TCPHDR_ACK;
 	tcb->sacked = 0;
 	skb_header_release(skb);
 	tcp_add_write_queue_tail(sk, skb);
@@ -536,8 +541,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 		tp->nonagle &= ~TCP_NAGLE_PUSH;
 }
 
-static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
-				struct sk_buff *skb)
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 {
 	if (flags & MSG_OOB)
 		tp->snd_up = tp->write_seq;
@@ -546,13 +550,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
 			    int nonagle)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
 	if (tcp_send_head(sk)) {
-		struct sk_buff *skb = tcp_write_queue_tail(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+
 		if (!(flags & MSG_MORE) || forced_push(tp))
-			tcp_mark_push(tp, skb);
-		tcp_mark_urg(tp, flags, skb);
+			tcp_mark_push(tp, tcp_write_queue_tail(sk));
+
+		tcp_mark_urg(tp, flags);
 		__tcp_push_pending_frames(sk, mss_now,
 					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 	}
@@ -608,6 +612,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 	ssize_t spliced;
 	int ret;
 
+	sock_rps_record_flow(sk);
 	/*
 	 * We can't seek on a socket input
 	 */
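sock_rps_record_flow() is the Receive Flow Steering hook from this kernel generation: it records that this socket's flow was last consumed on the current CPU so incoming packets can be steered there; splice needs the explicit call here because it bypasses the common recvmsg path. A simplified illustration of the idea only, not the kernel's actual rps_sock_flow_table code:

	/* Illustration: record "flow rxhash last ran on this CPU". */
	static inline void record_flow_cpu(u32 rxhash, u16 *table, u32 mask)
	{
		if (rxhash)
			table[rxhash & mask] = raw_smp_processor_id();
	}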
@@ -675,6 +680,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 
 	return ret;
 }
+EXPORT_SYMBOL(tcp_splice_read);
 
 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 {
@@ -815,7 +821,7 @@ new_segment:
 		skb_shinfo(skb)->gso_segs = 0;
 
 		if (!copied)
-			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+			TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
 
 		copied += copy;
 		poffset += copy;
@@ -856,15 +862,15 @@ out_err:
 	return sk_stream_error(sk, flags, err);
 }
 
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
-		     size_t size, int flags)
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+		 size_t size, int flags)
 {
 	ssize_t res;
-	struct sock *sk = sock->sk;
 
 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
 	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
-		return sock_no_sendpage(sock, page, offset, size, flags);
+		return sock_no_sendpage(sk->sk_socket, page, offset, size,
+					flags);
 
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
@@ -873,16 +879,17 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 	release_sock(sk);
 	return res;
 }
+EXPORT_SYMBOL(tcp_sendpage);
 
 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
 
-static inline int select_size(struct sock *sk)
+static inline int select_size(struct sock *sk, int sg)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int tmp = tp->mss_cache;
 
-	if (sk->sk_route_caps & NETIF_F_SG) {
+	if (sg) {
 		if (sk_can_gso(sk))
 			tmp = 0;
 		else {
@@ -897,16 +904,15 @@ static inline int select_size(struct sock *sk)
 	return tmp;
 }
 
-int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
-	struct sock *sk = sock->sk;
 	struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int iovlen, flags;
 	int mss_now, size_goal;
-	int err, copied;
+	int sg, err, copied;
 	long timeo;
 
 	lock_sock(sk);
@@ -934,8 +940,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 		goto out_err;
 
+	sg = sk->sk_route_caps & NETIF_F_SG;
+
 	while (--iovlen >= 0) {
-		int seglen = iov->iov_len;
+		size_t seglen = iov->iov_len;
 		unsigned char __user *from = iov->iov_base;
 
 		iov++;
@@ -959,8 +967,9 @@ new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
-			skb = sk_stream_alloc_skb(sk, select_size(sk),
-					sk->sk_allocation);
+			skb = sk_stream_alloc_skb(sk,
+						  select_size(sk, sg),
+						  sk->sk_allocation);
 			if (!skb)
 				goto wait_for_memory;
 
@@ -997,9 +1006,7 @@ new_segment:
 					/* We can extend the last page
 					 * fragment. */
 					merge = 1;
-				} else if (i == MAX_SKB_FRAGS ||
-					   (!i &&
-					   !(sk->sk_route_caps & NETIF_F_SG))) {
+				} else if (i == MAX_SKB_FRAGS || !sg) {
 					/* Need to add new fragment and cannot
 					 * do this because interface is non-SG,
 					 * or because all the page slots are
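With sg computed once per tcp_sendmsg() call, the copy loop tests a local instead of re-reading sk->sk_route_caps for every chunk, and the three-line condition collapses to `i == MAX_SKB_FRAGS || !sg`. The resulting fragment decision, simplified from the code around this hunk:

	if (skb_can_coalesce(skb, i, page, off)) {
		merge = 1;		/* extend the last page fragment */
	} else if (i == MAX_SKB_FRAGS || !sg) {
		/* all frag slots busy, or the device cannot do
		 * scatter-gather: push and start a new segment */
		tcp_mark_push(tp, skb);
		goto new_segment;
	}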
@@ -1060,7 +1067,7 @@ new_segment:
 			}
 
 			if (!copied)
-				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+				TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
 
 			tp->write_seq += copy;
 			TCP_SKB_CB(skb)->end_seq += copy;
@@ -1120,6 +1127,7 @@ out_err:
 	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL(tcp_sendmsg);
 
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
@@ -1254,6 +1262,39 @@ static void tcp_prequeue_process(struct sock *sk)
 	tp->ucopy.memory = 0;
 }
 
+#ifdef CONFIG_NET_DMA
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+	dma_cookie_t done, used;
+	dma_cookie_t last_issued;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->ucopy.dma_chan)
+		return;
+
+	last_issued = tp->ucopy.dma_cookie;
+	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+	do {
+		if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+					      last_issued, &done,
+					      &used) == DMA_SUCCESS) {
+			/* Safe to free early-copied skbs now */
+			__skb_queue_purge(&sk->sk_async_wait_queue);
+			break;
+		} else {
+			struct sk_buff *skb;
+			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+			       (dma_async_is_complete(skb->dma_cookie, done,
+						      used) == DMA_SUCCESS)) {
+				__skb_dequeue(&sk->sk_async_wait_queue);
+				kfree_skb(skb);
+			}
+		}
+	} while (wait);
+}
+#endif
+
 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
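The new helper centralizes what used to be open-coded NET_DMA completion handling at the end of tcp_recvmsg(). The wait flag selects between a single opportunistic pass and a full drain loop; both call sites are added further down in this diff:

	tcp_service_net_dma(sk, false);	/* recv loop: reap finished copies,
					 * never block the reader */
	tcp_service_net_dma(sk, true);	/* recvmsg exit: loop until every
					 * issued copy has completed */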
@@ -1335,6 +1376,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
+		tp->copied_seq = seq;
 	}
 	tp->copied_seq = seq;
 
@@ -1345,6 +1387,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		tcp_cleanup_rbuf(sk, copied);
 	return copied;
 }
+EXPORT_SYMBOL(tcp_read_sock);
 
 /*
  *	This routine copies from a sock struct into the user buffer.
@@ -1546,6 +1589,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			/* __ Set realtime policy in scheduler __ */
 		}
 
+#ifdef CONFIG_NET_DMA
+		if (tp->ucopy.dma_chan)
+			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
@@ -1554,6 +1601,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			sk_wait_data(sk, &timeo);
 
 #ifdef CONFIG_NET_DMA
+		tcp_service_net_dma(sk, false);  /* Don't block */
 		tp->ucopy.wakeup = 0;
 #endif
 
@@ -1633,6 +1681,9 @@ do_prequeue:
 					copied = -EFAULT;
 					break;
 				}
+
+				dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
 				if ((offset + used) == skb->len)
 					copied_early = 1;
 
@@ -1702,27 +1753,9 @@ skip_copy:
 	}
 
 #ifdef CONFIG_NET_DMA
-	if (tp->ucopy.dma_chan) {
-		dma_cookie_t done, used;
-
-		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
-		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
-						 tp->ucopy.dma_cookie, &done,
-						 &used) == DMA_IN_PROGRESS) {
-			/* do partial cleanup of sk_async_wait_queue */
-			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
-			       (dma_async_is_complete(skb->dma_cookie, done,
-						      used) == DMA_SUCCESS)) {
-				__skb_dequeue(&sk->sk_async_wait_queue);
-				kfree_skb(skb);
-			}
-		}
+	tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
+	tp->ucopy.dma_chan = NULL;
 
-		/* Safe to free early-copied skbs now */
-		__skb_queue_purge(&sk->sk_async_wait_queue);
-		tp->ucopy.dma_chan = NULL;
-	}
 	if (tp->ucopy.pinned_list) {
 		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
 		tp->ucopy.pinned_list = NULL;
@@ -1749,6 +1782,7 @@ recv_urg:
 	err = tcp_recv_urg(sk, msg, len, flags);
 	goto out;
 }
+EXPORT_SYMBOL(tcp_recvmsg);
 
 void tcp_set_state(struct sock *sk, int state)
 {
@@ -1841,6 +1875,7 @@ void tcp_shutdown(struct sock *sk, int how)
 		tcp_send_fin(sk);
 	}
 }
+EXPORT_SYMBOL(tcp_shutdown);
 
 void tcp_close(struct sock *sk, long timeout)
 {
@@ -1873,6 +1908,10 @@ void tcp_close(struct sock *sk, long timeout)
 
 	sk_mem_reclaim(sk);
 
+	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+	if (sk->sk_state == TCP_CLOSE)
+		goto adjudge_to_death;
+
 	/* As outlined in RFC 2525, section 2.17, we send a RST here because
 	 * data was lost. To witness the awful effects of the old behavior of
 	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
@@ -1976,11 +2015,8 @@ adjudge_to_death:
 		}
 	}
 	if (sk->sk_state != TCP_CLOSE) {
-		int orphan_count = percpu_counter_read_positive(
-						sk->sk_prot->orphan_count);
-
 		sk_mem_reclaim(sk);
-		if (tcp_too_many_orphans(sk, orphan_count)) {
+		if (tcp_too_many_orphans(sk, 0)) {
 			if (net_ratelimit())
 				printk(KERN_INFO "TCP: too many of orphaned "
 				       "sockets\n");
@@ -2000,6 +2036,7 @@ out:
 	local_bh_enable();
 	sock_put(sk);
 }
+EXPORT_SYMBOL(tcp_close);
 
 /* These states need RST on ABORT according to RFC793 */
 
@@ -2073,6 +2110,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	sk->sk_error_report(sk);
 	return err;
 }
+EXPORT_SYMBOL(tcp_disconnect);
 
 /*
  *	Socket option code for TCP.
@@ -2150,6 +2188,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 					    GFP_KERNEL);
 			if (cvp == NULL)
 				return -ENOMEM;
+
+			kref_init(&cvp->kref);
 		}
 		lock_sock(sk);
 		tp->rx_opt.cookie_in_always =
@@ -2164,12 +2204,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 				 */
 				kref_put(&tp->cookie_values->kref,
 					 tcp_cookie_values_release);
-				kref_init(&cvp->kref);
-				tp->cookie_values = cvp;
 			} else {
 				cvp = tp->cookie_values;
 			}
 		}
+
 		if (cvp != NULL) {
 			cvp->cookie_desired = ctd.tcpct_cookie_desired;
 
@@ -2183,6 +2222,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 				cvp->s_data_desired = ctd.tcpct_s_data_desired;
 				cvp->s_data_constant = 0; /* false */
 			}
+
+			tp->cookie_values = cvp;
 		}
 		release_sock(sk);
 		return err;
@@ -2190,7 +2231,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	default:
 		/* fallthru */
 		break;
-	};
+	}
 
 	if (optlen < sizeof(int))
 		return -EINVAL;
@@ -2229,6 +2270,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		}
 		break;
 
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_lto = val;
+		break;
+
+	case TCP_THIN_DUPACK:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_dupack = val;
+		break;
+
 	case TCP_CORK:
 		/* When set indicates to always queue non-full frames.
 		 * Later the user clears this option and we transmit
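TCP_THIN_LINEAR_TIMEOUTS and TCP_THIN_DUPACK are new boolean options for thin streams (connections with so few packets in flight that fast retransmit rarely triggers). A minimal userspace sketch; the fallback constants match this patch series but are assumptions if your installed headers predate it:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	#ifndef TCP_THIN_LINEAR_TIMEOUTS
	#define TCP_THIN_LINEAR_TIMEOUTS 16	/* assumed value */
	#endif
	#ifndef TCP_THIN_DUPACK
	#define TCP_THIN_DUPACK 17		/* assumed value */
	#endif

	static int enable_thin_stream(int fd)
	{
		int one = 1;

		if (setsockopt(fd, IPPROTO_TCP, TCP_THIN_LINEAR_TIMEOUTS,
			       &one, sizeof(one)) < 0)
			return -1;
		return setsockopt(fd, IPPROTO_TCP, TCP_THIN_DUPACK,
				  &one, sizeof(one));
	}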
@@ -2259,7 +2314,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		if (sock_flag(sk, SOCK_KEEPOPEN) &&
 		    !((1 << sk->sk_state) &
 		      (TCPF_CLOSE | TCPF_LISTEN))) {
-			__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+			u32 elapsed = keepalive_time_elapsed(tp);
 			if (tp->keepalive_time > elapsed)
 				elapsed = tp->keepalive_time - elapsed;
 			else
@@ -2357,6 +2412,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 				     optval, optlen);
 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
 }
+EXPORT_SYMBOL(tcp_setsockopt);
 
 #ifdef CONFIG_COMPAT
 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
@@ -2367,7 +2423,6 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
 					   optval, optlen);
 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
 }
-
 EXPORT_SYMBOL(compat_tcp_setsockopt);
 #endif
 
@@ -2433,7 +2488,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 }
-
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
 static int do_tcp_getsockopt(struct sock *sk, int level,
@@ -2551,6 +2605,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		return 0;
 	}
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		val = tp->thin_lto;
+		break;
+	case TCP_THIN_DUPACK:
+		val = tp->thin_dupack;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2572,6 +2632,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 				     optval, optlen);
 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
 }
+EXPORT_SYMBOL(tcp_getsockopt);
 
 #ifdef CONFIG_COMPAT
 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -2582,7 +2643,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
 					   optval, optlen);
 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
 }
-
 EXPORT_SYMBOL(compat_tcp_getsockopt);
 #endif
 
@@ -2682,7 +2742,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	struct tcphdr *th2;
 	unsigned int len;
 	unsigned int thlen;
-	unsigned int flags;
+	__be32 flags;
 	unsigned int mss = 1;
 	unsigned int hlen;
 	unsigned int off;
@@ -2732,10 +2792,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 found:
 	flush = NAPI_GRO_CB(p)->flush;
-	flush |= flags & TCP_FLAG_CWR;
-	flush |= (flags ^ tcp_flag_word(th2)) &
-		 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
-	flush |= th->ack_seq ^ th2->ack_seq;
+	flush |= (__force int)(flags & TCP_FLAG_CWR);
+	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
 	for (i = sizeof(*th); i < thlen; i += 4)
 		flush |= *(u32 *)((u8 *)th + i) ^
 			 *(u32 *)((u8 *)th2 + i);
@@ -2756,8 +2816,9 @@ found:
 
 out_check_final:
 	flush = len < mss;
-	flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
-			  TCP_FLAG_SYN | TCP_FLAG_FIN);
+	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+					TCP_FLAG_RST | TCP_FLAG_SYN |
+					TCP_FLAG_FIN));
 
 	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
 		pp = head;
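flags now carries the raw network-byte-order word, so sparse would warn on every mixed __be32/int bit test; the (__force int) casts assert that only bit patterns are compared, never values interpreted in host order. For reference, the accessor involved (assumed from include/net/tcp.h of this era, not part of this diff):

	union tcp_word_hdr {
		struct tcphdr hdr;
		__be32	      words[5];
	};

	#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])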
@@ -2788,10 +2849,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
 
 #ifdef CONFIG_TCP_MD5SIG
 static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool **tcp_md5sig_pool;
+static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
 
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
 {
 	int cpu;
 	for_each_possible_cpu(cpu) {
@@ -2800,7 +2861,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
 			if (p->md5_desc.tfm)
 				crypto_free_hash(p->md5_desc.tfm);
 			kfree(p);
-			p = NULL;
 		}
 	}
 	free_percpu(pool);
@@ -2808,7 +2868,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
 
 void tcp_free_md5sig_pool(void)
 {
-	struct tcp_md5sig_pool **pool = NULL;
+	struct tcp_md5sig_pool * __percpu *pool = NULL;
 
 	spin_lock_bh(&tcp_md5sig_pool_lock);
 	if (--tcp_md5sig_users == 0) {
@@ -2819,13 +2879,13 @@ void tcp_free_md5sig_pool(void)
 	if (pool)
 		__tcp_free_md5sig_pool(pool);
 }
-
 EXPORT_SYMBOL(tcp_free_md5sig_pool);
 
-static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk)
+static struct tcp_md5sig_pool * __percpu *
+__tcp_alloc_md5sig_pool(struct sock *sk)
 {
 	int cpu;
-	struct tcp_md5sig_pool **pool;
+	struct tcp_md5sig_pool * __percpu *pool;
 
 	pool = alloc_percpu(struct tcp_md5sig_pool *);
 	if (!pool)
@@ -2852,9 +2912,9 @@ out_free:
 	return NULL;
 }
 
-struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk)
+struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
 {
-	struct tcp_md5sig_pool **pool;
+	struct tcp_md5sig_pool * __percpu *pool;
 	int alloc = 0;
 
 retry:
@@ -2873,7 +2933,9 @@ retry:
 
 	if (alloc) {
 		/* we cannot hold spinlock here because this may sleep. */
-		struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk);
+		struct tcp_md5sig_pool * __percpu *p;
+
+		p = __tcp_alloc_md5sig_pool(sk);
 		spin_lock_bh(&tcp_md5sig_pool_lock);
 		if (!p) {
 			tcp_md5sig_users--;
@@ -2892,28 +2954,42 @@ retry:
 	}
 	return pool;
 }
-
 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
 
-struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
+
+/**
+ *	tcp_get_md5sig_pool - get md5sig_pool for this user
+ *
+ *	We use percpu structure, so if we succeed, we exit with preemption
+ *	and BH disabled, to make sure another thread or softirq handling
+ *	wont try to get same context.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 {
-	struct tcp_md5sig_pool **p;
-	spin_lock_bh(&tcp_md5sig_pool_lock);
+	struct tcp_md5sig_pool * __percpu *p;
+
+	local_bh_disable();
+
+	spin_lock(&tcp_md5sig_pool_lock);
 	p = tcp_md5sig_pool;
 	if (p)
 		tcp_md5sig_users++;
-	spin_unlock_bh(&tcp_md5sig_pool_lock);
-	return (p ? *per_cpu_ptr(p, cpu) : NULL);
-}
+	spin_unlock(&tcp_md5sig_pool_lock);
+
+	if (p)
+		return *this_cpu_ptr(p);
 
-EXPORT_SYMBOL(__tcp_get_md5sig_pool);
+	local_bh_enable();
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
 
-void __tcp_put_md5sig_pool(void)
+void tcp_put_md5sig_pool(void)
 {
+	local_bh_enable();
 	tcp_free_md5sig_pool();
 }
-
-EXPORT_SYMBOL(__tcp_put_md5sig_pool);
+EXPORT_SYMBOL(tcp_put_md5sig_pool);
 
 int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
 			struct tcphdr *th)
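The old __tcp_get_md5sig_pool(cpu)/__tcp_put_md5sig_pool() pair required callers to disable BHs and pick a CPU themselves; the new tcp_get_md5sig_pool()/tcp_put_md5sig_pool() do both internally. A hedged sketch of the resulting caller pattern (the actual callers, e.g. tcp_v4_md5_hash_skb(), are updated in the per-protocol files of the same series):

	struct tcp_md5sig_pool *hp;

	hp = tcp_get_md5sig_pool();	/* returns with BH + preempt off */
	if (!hp)
		goto clear_hash_noput;

	/* ... crypto_hash_init() / tcp_md5_hash_*() on hp->md5_desc ... */

	tcp_put_md5sig_pool();		/* re-enables BH, drops user ref */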
@@ -2929,7 +3005,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
 	th->check = old_checksum;
 	return err;
 }
-
 EXPORT_SYMBOL(tcp_md5_hash_header);
 
 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
@@ -2942,6 +3017,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 	const unsigned head_data_len = skb_headlen(skb) > header_len ?
 				       skb_headlen(skb) - header_len : 0;
 	const struct skb_shared_info *shi = skb_shinfo(skb);
+	struct sk_buff *frag_iter;
 
 	sg_init_table(&sg, 1);
 
@@ -2956,9 +3032,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 			return 1;
 	}
 
+	skb_walk_frags(skb, frag_iter)
+		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
+			return 1;
+
 	return 0;
 }
-
 EXPORT_SYMBOL(tcp_md5_hash_skb_data);
 
 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
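Previously only the linear area and the page fragments were hashed; the skb_walk_frags() loop recurses into chained frag_list skbs so segments assembled from multiple buffers hash completely. For reference, the iterator (assumed from include/linux/skbuff.h, not part of this diff):

	#define skb_walk_frags(skb, iter)	\
		for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)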
@@ -2968,7 +3047,6 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
 	sg_init_one(&sg, key->key, key->keylen);
 	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
 }
-
 EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
@@ -3135,7 +3213,7 @@ void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
 	unsigned long nr_pages, limit;
-	int order, i, max_share;
+	int i, max_share, cnt;
 	unsigned long jiffy = jiffies;
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3184,22 +3262,12 @@ void __init tcp_init(void)
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
 
-	/* Try to be a bit smarter and adjust defaults depending
-	 * on available memory.
-	 */
-	for (order = 0; ((1 << order) << PAGE_SHIFT) <
-			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
-			order++)
-		;
-	if (order >= 4) {
-		tcp_death_row.sysctl_max_tw_buckets = 180000;
-		sysctl_tcp_max_orphans = 4096 << (order - 4);
-		sysctl_max_syn_backlog = 1024;
-	} else if (order < 3) {
-		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
-		sysctl_tcp_max_orphans >>= (3 - order);
-		sysctl_max_syn_backlog = 128;
-	}
+
+	cnt = tcp_hashinfo.ehash_mask + 1;
+
+	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+	sysctl_tcp_max_orphans = cnt / 2;
+	sysctl_max_syn_backlog = max(128, cnt / 256);
 
 	/* Set the pressure threshold to be a fraction of global memory that
 	 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
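The defaults now scale with the established-connection hash size instead of the bind-hash allocation order. Worked example: with ehash_mask = 524287, cnt = 524288, so max_tw_buckets and tcp_max_orphans both become 262144 and max_syn_backlog becomes max(128, 2048) = 2048; a small 16384-bucket ehash yields 8192, 8192 and max(128, 64) = 128.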
@@ -3240,16 +3308,3 @@ void __init tcp_init(void)
 	tcp_secret_retiring = &tcp_secret_two;
 	tcp_secret_secondary = &tcp_secret_two;
 }
-
-EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_disconnect);
-EXPORT_SYMBOL(tcp_getsockopt);
-EXPORT_SYMBOL(tcp_ioctl);
-EXPORT_SYMBOL(tcp_poll);
-EXPORT_SYMBOL(tcp_read_sock);
-EXPORT_SYMBOL(tcp_recvmsg);
-EXPORT_SYMBOL(tcp_sendmsg);
-EXPORT_SYMBOL(tcp_splice_read);
-EXPORT_SYMBOL(tcp_sendpage);
-EXPORT_SYMBOL(tcp_setsockopt);
-EXPORT_SYMBOL(tcp_shutdown);