path: root/net/ipv4/tcp.c
author: Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
committer: Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
commit: 8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree: a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/ipv4/tcp.c
parent: 406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--  net/ipv4/tcp.c  891
1 file changed, 328 insertions(+), 563 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2aa69c8ae60..09ced58e6a5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -245,8 +245,6 @@
245 * TCP_CLOSE socket is finished 245 * TCP_CLOSE socket is finished
246 */ 246 */
247 247
248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h> 248#include <linux/kernel.h>
251#include <linux/module.h> 249#include <linux/module.h>
252#include <linux/types.h> 250#include <linux/types.h>
@@ -268,12 +266,15 @@
268#include <linux/crypto.h> 266#include <linux/crypto.h>
269#include <linux/time.h> 267#include <linux/time.h>
270#include <linux/slab.h> 268#include <linux/slab.h>
269#include <linux/uid_stat.h>
271 270
272#include <net/icmp.h> 271#include <net/icmp.h>
273#include <net/inet_common.h>
274#include <net/tcp.h> 272#include <net/tcp.h>
275#include <net/xfrm.h> 273#include <net/xfrm.h>
276#include <net/ip.h> 274#include <net/ip.h>
275#include <net/ip6_route.h>
276#include <net/ipv6.h>
277#include <net/transp_v6.h>
277#include <net/netdma.h> 278#include <net/netdma.h>
278#include <net/sock.h> 279#include <net/sock.h>
279 280
@@ -285,9 +286,11 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285struct percpu_counter tcp_orphan_count; 286struct percpu_counter tcp_orphan_count;
286EXPORT_SYMBOL_GPL(tcp_orphan_count); 287EXPORT_SYMBOL_GPL(tcp_orphan_count);
287 288
289long sysctl_tcp_mem[3] __read_mostly;
288int sysctl_tcp_wmem[3] __read_mostly; 290int sysctl_tcp_wmem[3] __read_mostly;
289int sysctl_tcp_rmem[3] __read_mostly; 291int sysctl_tcp_rmem[3] __read_mostly;
290 292
293EXPORT_SYMBOL(sysctl_tcp_mem);
291EXPORT_SYMBOL(sysctl_tcp_rmem); 294EXPORT_SYMBOL(sysctl_tcp_rmem);
292EXPORT_SYMBOL(sysctl_tcp_wmem); 295EXPORT_SYMBOL(sysctl_tcp_wmem);
293 296
@@ -364,72 +367,6 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
364 return period; 367 return period;
365} 368}
366 369
367/* Address-family independent initialization for a tcp_sock.
368 *
369 * NOTE: A lot of things set to zero explicitly by call to
370 * sk_alloc() so need not be done here.
371 */
372void tcp_init_sock(struct sock *sk)
373{
374 struct inet_connection_sock *icsk = inet_csk(sk);
375 struct tcp_sock *tp = tcp_sk(sk);
376
377 skb_queue_head_init(&tp->out_of_order_queue);
378 tcp_init_xmit_timers(sk);
379 tcp_prequeue_init(tp);
380 INIT_LIST_HEAD(&tp->tsq_node);
381
382 icsk->icsk_rto = TCP_TIMEOUT_INIT;
383 tp->mdev = TCP_TIMEOUT_INIT;
384
385 /* So many TCP implementations out there (incorrectly) count the
386 * initial SYN frame in their delayed-ACK and congestion control
387 * algorithms that we must have the following bandaid to talk
388 * efficiently to them. -DaveM
389 */
390 tp->snd_cwnd = TCP_INIT_CWND;
391
392 /* See draft-stevens-tcpca-spec-01 for discussion of the
393 * initialization of these values.
394 */
395 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
396 tp->snd_cwnd_clamp = ~0;
397 tp->mss_cache = TCP_MSS_DEFAULT;
398
399 tp->reordering = sysctl_tcp_reordering;
400 tcp_enable_early_retrans(tp);
401 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
402
403 sk->sk_state = TCP_CLOSE;
404
405 sk->sk_write_space = sk_stream_write_space;
406 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
407
408 icsk->icsk_sync_mss = tcp_sync_mss;
409
410 /* TCP Cookie Transactions */
411 if (sysctl_tcp_cookie_size > 0) {
412 /* Default, cookies without s_data_payload. */
413 tp->cookie_values =
414 kzalloc(sizeof(*tp->cookie_values),
415 sk->sk_allocation);
416 if (tp->cookie_values != NULL)
417 kref_init(&tp->cookie_values->kref);
418 }
419 /* Presumed zeroed, in order of appearance:
420 * cookie_in_always, cookie_out_never,
421 * s_data_constant, s_data_in, s_data_out
422 */
423 sk->sk_sndbuf = sysctl_tcp_wmem[1];
424 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
425
426 local_bh_disable();
427 sock_update_memcg(sk);
428 sk_sockets_allocated_inc(sk);
429 local_bh_enable();
430}
431EXPORT_SYMBOL(tcp_init_sock);
432
433/* 370/*
434 * Wait for a TCP event. 371 * Wait for a TCP event.
435 * 372 *
@@ -441,7 +378,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
441{ 378{
442 unsigned int mask; 379 unsigned int mask;
443 struct sock *sk = sock->sk; 380 struct sock *sk = sock->sk;
444 const struct tcp_sock *tp = tcp_sk(sk); 381 struct tcp_sock *tp = tcp_sk(sk);
445 382
446 sock_poll_wait(file, sk_sleep(sk), wait); 383 sock_poll_wait(file, sk_sleep(sk), wait);
447 if (sk->sk_state == TCP_LISTEN) 384 if (sk->sk_state == TCP_LISTEN)
@@ -486,9 +423,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
486 if (sk->sk_shutdown & RCV_SHUTDOWN) 423 if (sk->sk_shutdown & RCV_SHUTDOWN)
487 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 424 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
488 425
489 /* Connected or passive Fast Open socket? */ 426 /* Connected? */
490 if (sk->sk_state != TCP_SYN_SENT && 427 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
491 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
492 int target = sock_rcvlowat(sk, 0, INT_MAX); 428 int target = sock_rcvlowat(sk, 0, INT_MAX);
493 429
494 if (tp->urg_seq == tp->copied_seq && 430 if (tp->urg_seq == tp->copied_seq &&
@@ -536,29 +472,30 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
536{ 472{
537 struct tcp_sock *tp = tcp_sk(sk); 473 struct tcp_sock *tp = tcp_sk(sk);
538 int answ; 474 int answ;
539 bool slow;
540 475
541 switch (cmd) { 476 switch (cmd) {
542 case SIOCINQ: 477 case SIOCINQ:
543 if (sk->sk_state == TCP_LISTEN) 478 if (sk->sk_state == TCP_LISTEN)
544 return -EINVAL; 479 return -EINVAL;
545 480
546 slow = lock_sock_fast(sk); 481 lock_sock(sk);
547 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 482 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
548 answ = 0; 483 answ = 0;
549 else if (sock_flag(sk, SOCK_URGINLINE) || 484 else if (sock_flag(sk, SOCK_URGINLINE) ||
550 !tp->urg_data || 485 !tp->urg_data ||
551 before(tp->urg_seq, tp->copied_seq) || 486 before(tp->urg_seq, tp->copied_seq) ||
552 !before(tp->urg_seq, tp->rcv_nxt)) { 487 !before(tp->urg_seq, tp->rcv_nxt)) {
488 struct sk_buff *skb;
553 489
554 answ = tp->rcv_nxt - tp->copied_seq; 490 answ = tp->rcv_nxt - tp->copied_seq;
555 491
556 /* Subtract 1, if FIN was received */ 492 /* Subtract 1, if FIN is in queue. */
557 if (answ && sock_flag(sk, SOCK_DONE)) 493 skb = skb_peek_tail(&sk->sk_receive_queue);
558 answ--; 494 if (answ && skb)
495 answ -= tcp_hdr(skb)->fin;
559 } else 496 } else
560 answ = tp->urg_seq - tp->copied_seq; 497 answ = tp->urg_seq - tp->copied_seq;
561 unlock_sock_fast(sk, slow); 498 release_sock(sk);
562 break; 499 break;
563 case SIOCATMARK: 500 case SIOCATMARK:
564 answ = tp->urg_data && tp->urg_seq == tp->copied_seq; 501 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
@@ -591,11 +528,11 @@ EXPORT_SYMBOL(tcp_ioctl);
591 528
592static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 529static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
593{ 530{
594 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 531 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
595 tp->pushed_seq = tp->write_seq; 532 tp->pushed_seq = tp->write_seq;
596} 533}
597 534
598static inline bool forced_push(const struct tcp_sock *tp) 535static inline int forced_push(struct tcp_sock *tp)
599{ 536{
600 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 537 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
601} 538}
@@ -607,7 +544,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
607 544
608 skb->csum = 0; 545 skb->csum = 0;
609 tcb->seq = tcb->end_seq = tp->write_seq; 546 tcb->seq = tcb->end_seq = tp->write_seq;
610 tcb->tcp_flags = TCPHDR_ACK; 547 tcb->flags = TCPHDR_ACK;
611 tcb->sacked = 0; 548 tcb->sacked = 0;
612 skb_header_release(skb); 549 skb_header_release(skb);
613 tcp_add_write_queue_tail(sk, skb); 550 tcp_add_write_queue_tail(sk, skb);
@@ -768,12 +705,11 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
768 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); 705 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
769 if (skb) { 706 if (skb) {
770 if (sk_wmem_schedule(sk, skb->truesize)) { 707 if (sk_wmem_schedule(sk, skb->truesize)) {
771 skb_reserve(skb, sk->sk_prot->max_header);
772 /* 708 /*
773 * Make sure that we have exactly size bytes 709 * Make sure that we have exactly size bytes
774 * available to the caller, no more, no less. 710 * available to the caller, no more, no less.
775 */ 711 */
776 skb->avail_size = size; 712 skb_reserve(skb, skb_tailroom(skb) - size);
777 return skb; 713 return skb;
778 } 714 }
779 __kfree_skb(skb); 715 __kfree_skb(skb);
@@ -798,10 +734,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
798 inet_csk(sk)->icsk_ext_hdr_len - 734 inet_csk(sk)->icsk_ext_hdr_len -
799 tp->tcp_header_len); 735 tp->tcp_header_len);
800 736
801 /* TSQ : try to have two TSO segments in flight */
802 xmit_size_goal = min_t(u32, xmit_size_goal,
803 sysctl_tcp_limit_output_bytes >> 1);
804
805 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); 737 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
806 738
807 /* We try hard to avoid divides here */ 739 /* We try hard to avoid divides here */
@@ -811,9 +743,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
811 old_size_goal + mss_now > xmit_size_goal)) { 743 old_size_goal + mss_now > xmit_size_goal)) {
812 xmit_size_goal = old_size_goal; 744 xmit_size_goal = old_size_goal;
813 } else { 745 } else {
814 tp->xmit_size_goal_segs = 746 tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
815 min_t(u16, xmit_size_goal / mss_now,
816 sk->sk_gso_max_segs);
817 xmit_size_goal = tp->xmit_size_goal_segs * mss_now; 747 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
818 } 748 }
819 } 749 }
@@ -831,8 +761,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
831 return mss_now; 761 return mss_now;
832} 762}
833 763
834static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, 764static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
835 size_t size, int flags) 765 size_t psize, int flags)
836{ 766{
837 struct tcp_sock *tp = tcp_sk(sk); 767 struct tcp_sock *tp = tcp_sk(sk);
838 int mss_now, size_goal; 768 int mss_now, size_goal;
@@ -840,15 +770,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
840 ssize_t copied; 770 ssize_t copied;
841 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 771 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
842 772
843 /* Wait for a connection to finish. One exception is TCP Fast Open 773 /* Wait for a connection to finish. */
844 * (passive side) where data is allowed to be sent before a connection 774 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
845 * is fully established.
846 */
847 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
848 !tcp_passive_fastopen(sk)) {
849 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 775 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
850 goto out_err; 776 goto out_err;
851 }
852 777
853 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 778 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
854 779
@@ -859,10 +784,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
859 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 784 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
860 goto out_err; 785 goto out_err;
861 786
862 while (size > 0) { 787 while (psize > 0) {
863 struct sk_buff *skb = tcp_write_queue_tail(sk); 788 struct sk_buff *skb = tcp_write_queue_tail(sk);
864 int copy, i; 789 struct page *page = pages[poffset / PAGE_SIZE];
865 bool can_coalesce; 790 int copy, i, can_coalesce;
791 int offset = poffset % PAGE_SIZE;
792 int size = min_t(size_t, psize, PAGE_SIZE - offset);
866 793
867 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { 794 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
868new_segment: 795new_segment:
@@ -890,7 +817,7 @@ new_segment:
890 goto wait_for_memory; 817 goto wait_for_memory;
891 818
892 if (can_coalesce) { 819 if (can_coalesce) {
893 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 820 skb_shinfo(skb)->frags[i - 1].size += copy;
894 } else { 821 } else {
895 get_page(page); 822 get_page(page);
896 skb_fill_page_desc(skb, i, page, offset, copy); 823 skb_fill_page_desc(skb, i, page, offset, copy);
@@ -907,11 +834,11 @@ new_segment:
907 skb_shinfo(skb)->gso_segs = 0; 834 skb_shinfo(skb)->gso_segs = 0;
908 835
909 if (!copied) 836 if (!copied)
910 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 837 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
911 838
912 copied += copy; 839 copied += copy;
913 offset += copy; 840 poffset += copy;
914 if (!(size -= copy)) 841 if (!(psize -= copy))
915 goto out; 842 goto out;
916 843
917 if (skb->len < size_goal || (flags & MSG_OOB)) 844 if (skb->len < size_goal || (flags & MSG_OOB))
@@ -927,7 +854,8 @@ new_segment:
927wait_for_sndbuf: 854wait_for_sndbuf:
928 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 855 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
929wait_for_memory: 856wait_for_memory:
930 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 857 if (copied)
858 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
931 859
932 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 860 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
933 goto do_error; 861 goto do_error;
@@ -936,7 +864,7 @@ wait_for_memory:
936 } 864 }
937 865
938out: 866out:
939 if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) 867 if (copied)
940 tcp_push(sk, flags, mss_now, tp->nonagle); 868 tcp_push(sk, flags, mss_now, tp->nonagle);
941 return copied; 869 return copied;
942 870
@@ -958,24 +886,24 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
958 flags); 886 flags);
959 887
960 lock_sock(sk); 888 lock_sock(sk);
961 res = do_tcp_sendpages(sk, page, offset, size, flags); 889 res = do_tcp_sendpages(sk, &page, offset, size, flags);
962 release_sock(sk); 890 release_sock(sk);
963 return res; 891 return res;
964} 892}
965EXPORT_SYMBOL(tcp_sendpage); 893EXPORT_SYMBOL(tcp_sendpage);
966 894
967static inline int select_size(const struct sock *sk, bool sg) 895#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
896#define TCP_OFF(sk) (sk->sk_sndmsg_off)
897
898static inline int select_size(struct sock *sk, int sg)
968{ 899{
969 const struct tcp_sock *tp = tcp_sk(sk); 900 struct tcp_sock *tp = tcp_sk(sk);
970 int tmp = tp->mss_cache; 901 int tmp = tp->mss_cache;
971 902
972 if (sg) { 903 if (sg) {
973 if (sk_can_gso(sk)) { 904 if (sk_can_gso(sk))
974 /* Small frames wont use a full page: 905 tmp = 0;
975 * Payload will immediately follow tcp header. 906 else {
976 */
977 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
978 } else {
979 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 907 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
980 908
981 if (tmp >= pgbreak && 909 if (tmp >= pgbreak &&
@@ -987,86 +915,27 @@ static inline int select_size(const struct sock *sk, bool sg)
987 return tmp; 915 return tmp;
988} 916}
989 917
990void tcp_free_fastopen_req(struct tcp_sock *tp)
991{
992 if (tp->fastopen_req != NULL) {
993 kfree(tp->fastopen_req);
994 tp->fastopen_req = NULL;
995 }
996}
997
998static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
999{
1000 struct tcp_sock *tp = tcp_sk(sk);
1001 int err, flags;
1002
1003 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1004 return -EOPNOTSUPP;
1005 if (tp->fastopen_req != NULL)
1006 return -EALREADY; /* Another Fast Open is in progress */
1007
1008 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1009 sk->sk_allocation);
1010 if (unlikely(tp->fastopen_req == NULL))
1011 return -ENOBUFS;
1012 tp->fastopen_req->data = msg;
1013
1014 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1015 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1016 msg->msg_namelen, flags);
1017 *size = tp->fastopen_req->copied;
1018 tcp_free_fastopen_req(tp);
1019 return err;
1020}
1021
1022int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 918int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1023 size_t size) 919 size_t size)
1024{ 920{
1025 struct iovec *iov; 921 struct iovec *iov;
1026 struct tcp_sock *tp = tcp_sk(sk); 922 struct tcp_sock *tp = tcp_sk(sk);
1027 struct sk_buff *skb; 923 struct sk_buff *skb;
1028 int iovlen, flags, err, copied = 0; 924 int iovlen, flags;
1029 int mss_now = 0, size_goal, copied_syn = 0, offset = 0; 925 int mss_now, size_goal;
1030 bool sg; 926 int sg, err, copied;
1031 long timeo; 927 long timeo;
1032 928
1033 lock_sock(sk); 929 lock_sock(sk);
1034 930
1035 flags = msg->msg_flags; 931 flags = msg->msg_flags;
1036 if (flags & MSG_FASTOPEN) {
1037 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1038 if (err == -EINPROGRESS && copied_syn > 0)
1039 goto out;
1040 else if (err)
1041 goto out_err;
1042 offset = copied_syn;
1043 }
1044
1045 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 932 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1046 933
1047 /* Wait for a connection to finish. One exception is TCP Fast Open 934 /* Wait for a connection to finish. */
1048 * (passive side) where data is allowed to be sent before a connection 935 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1049 * is fully established.
1050 */
1051 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1052 !tcp_passive_fastopen(sk)) {
1053 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 936 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1054 goto do_error;
1055 }
1056
1057 if (unlikely(tp->repair)) {
1058 if (tp->repair_queue == TCP_RECV_QUEUE) {
1059 copied = tcp_send_rcvq(sk, msg, size);
1060 goto out;
1061 }
1062
1063 err = -EINVAL;
1064 if (tp->repair_queue == TCP_NO_QUEUE)
1065 goto out_err; 937 goto out_err;
1066 938
1067 /* 'common' sending to sendq */
1068 }
1069
1070 /* This should be in poll */ 939 /* This should be in poll */
1071 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 940 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1072 941
@@ -1081,22 +950,13 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1081 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 950 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1082 goto out_err; 951 goto out_err;
1083 952
1084 sg = !!(sk->sk_route_caps & NETIF_F_SG); 953 sg = sk->sk_route_caps & NETIF_F_SG;
1085 954
1086 while (--iovlen >= 0) { 955 while (--iovlen >= 0) {
1087 size_t seglen = iov->iov_len; 956 size_t seglen = iov->iov_len;
1088 unsigned char __user *from = iov->iov_base; 957 unsigned char __user *from = iov->iov_base;
1089 958
1090 iov++; 959 iov++;
1091 if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
1092 if (offset >= seglen) {
1093 offset -= seglen;
1094 continue;
1095 }
1096 seglen -= offset;
1097 from += offset;
1098 offset = 0;
1099 }
1100 960
1101 while (seglen > 0) { 961 while (seglen > 0) {
1102 int copy = 0; 962 int copy = 0;
@@ -1139,54 +999,86 @@ new_segment:
1139 copy = seglen; 999 copy = seglen;
1140 1000
1141 /* Where to copy to? */ 1001 /* Where to copy to? */
1142 if (skb_availroom(skb) > 0) { 1002 if (skb_tailroom(skb) > 0) {
1143 /* We have some space in skb head. Superb! */ 1003 /* We have some space in skb head. Superb! */
1144 copy = min_t(int, copy, skb_availroom(skb)); 1004 if (copy > skb_tailroom(skb))
1005 copy = skb_tailroom(skb);
1145 err = skb_add_data_nocache(sk, skb, from, copy); 1006 err = skb_add_data_nocache(sk, skb, from, copy);
1146 if (err) 1007 if (err)
1147 goto do_fault; 1008 goto do_fault;
1148 } else { 1009 } else {
1149 bool merge = true; 1010 int merge = 0;
1150 int i = skb_shinfo(skb)->nr_frags; 1011 int i = skb_shinfo(skb)->nr_frags;
1151 struct page_frag *pfrag = sk_page_frag(sk); 1012 struct page *page = TCP_PAGE(sk);
1152 1013 int off = TCP_OFF(sk);
1153 if (!sk_page_frag_refill(sk, pfrag)) 1014
1154 goto wait_for_memory; 1015 if (skb_can_coalesce(skb, i, page, off) &&
1155 1016 off != PAGE_SIZE) {
1156 if (!skb_can_coalesce(skb, i, pfrag->page, 1017 /* We can extend the last page
1157 pfrag->offset)) { 1018 * fragment. */
1158 if (i == MAX_SKB_FRAGS || !sg) { 1019 merge = 1;
1159 tcp_mark_push(tp, skb); 1020 } else if (i == MAX_SKB_FRAGS || !sg) {
1160 goto new_segment; 1021 /* Need to add new fragment and cannot
1022 * do this because interface is non-SG,
1023 * or because all the page slots are
1024 * busy. */
1025 tcp_mark_push(tp, skb);
1026 goto new_segment;
1027 } else if (page) {
1028 if (off == PAGE_SIZE) {
1029 put_page(page);
1030 TCP_PAGE(sk) = page = NULL;
1031 off = 0;
1161 } 1032 }
1162 merge = false; 1033 } else
1163 } 1034 off = 0;
1164 1035
1165 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1036 if (copy > PAGE_SIZE - off)
1037 copy = PAGE_SIZE - off;
1166 1038
1167 if (!sk_wmem_schedule(sk, copy)) 1039 if (!sk_wmem_schedule(sk, copy))
1168 goto wait_for_memory; 1040 goto wait_for_memory;
1169 1041
1042 if (!page) {
1043 /* Allocate new cache page. */
1044 if (!(page = sk_stream_alloc_page(sk)))
1045 goto wait_for_memory;
1046 }
1047
1048 /* Time to copy data. We are close to
1049 * the end! */
1170 err = skb_copy_to_page_nocache(sk, from, skb, 1050 err = skb_copy_to_page_nocache(sk, from, skb,
1171 pfrag->page, 1051 page, off, copy);
1172 pfrag->offset, 1052 if (err) {
1173 copy); 1053 /* If this page was new, give it to the
1174 if (err) 1054 * socket so it does not get leaked.
1055 */
1056 if (!TCP_PAGE(sk)) {
1057 TCP_PAGE(sk) = page;
1058 TCP_OFF(sk) = 0;
1059 }
1175 goto do_error; 1060 goto do_error;
1061 }
1176 1062
1177 /* Update the skb. */ 1063 /* Update the skb. */
1178 if (merge) { 1064 if (merge) {
1179 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1065 skb_shinfo(skb)->frags[i - 1].size +=
1066 copy;
1180 } else { 1067 } else {
1181 skb_fill_page_desc(skb, i, pfrag->page, 1068 skb_fill_page_desc(skb, i, page, off, copy);
1182 pfrag->offset, copy); 1069 if (TCP_PAGE(sk)) {
1183 get_page(pfrag->page); 1070 get_page(page);
1071 } else if (off + copy < PAGE_SIZE) {
1072 get_page(page);
1073 TCP_PAGE(sk) = page;
1074 }
1184 } 1075 }
1185 pfrag->offset += copy; 1076
1077 TCP_OFF(sk) = off + copy;
1186 } 1078 }
1187 1079
1188 if (!copied) 1080 if (!copied)
1189 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 1081 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
1190 1082
1191 tp->write_seq += copy; 1083 tp->write_seq += copy;
1192 TCP_SKB_CB(skb)->end_seq += copy; 1084 TCP_SKB_CB(skb)->end_seq += copy;
@@ -1197,7 +1089,7 @@ new_segment:
1197 if ((seglen -= copy) == 0 && iovlen == 0) 1089 if ((seglen -= copy) == 0 && iovlen == 0)
1198 goto out; 1090 goto out;
1199 1091
1200 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) 1092 if (skb->len < max || (flags & MSG_OOB))
1201 continue; 1093 continue;
1202 1094
1203 if (forced_push(tp)) { 1095 if (forced_push(tp)) {
@@ -1224,7 +1116,10 @@ out:
1224 if (copied) 1116 if (copied)
1225 tcp_push(sk, flags, mss_now, tp->nonagle); 1117 tcp_push(sk, flags, mss_now, tp->nonagle);
1226 release_sock(sk); 1118 release_sock(sk);
1227 return copied + copied_syn; 1119
1120 if (copied > 0)
1121 uid_stat_tcp_snd(current_uid(), copied);
1122 return copied;
1228 1123
1229do_fault: 1124do_fault:
1230 if (!skb->len) { 1125 if (!skb->len) {
@@ -1237,7 +1132,7 @@ do_fault:
1237 } 1132 }
1238 1133
1239do_error: 1134do_error:
1240 if (copied + copied_syn) 1135 if (copied)
1241 goto out; 1136 goto out;
1242out_err: 1137out_err:
1243 err = sk_stream_error(sk, flags, err); 1138 err = sk_stream_error(sk, flags, err);
@@ -1295,24 +1190,6 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1295 return -EAGAIN; 1190 return -EAGAIN;
1296} 1191}
1297 1192
1298static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1299{
1300 struct sk_buff *skb;
1301 int copied = 0, err = 0;
1302
1303 /* XXX -- need to support SO_PEEK_OFF */
1304
1305 skb_queue_walk(&sk->sk_write_queue, skb) {
1306 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1307 if (err)
1308 break;
1309
1310 copied += skb->len;
1311 }
1312
1313 return err ?: copied;
1314}
1315
1316/* Clean up the receive buffer for full frames taken by the user, 1193/* Clean up the receive buffer for full frames taken by the user,
1317 * then send an ACK if necessary. COPIED is the number of bytes 1194 * then send an ACK if necessary. COPIED is the number of bytes
1318 * tcp_recvmsg has given to the user so far, it speeds up the 1195 * tcp_recvmsg has given to the user so far, it speeds up the
@@ -1322,13 +1199,15 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1322void tcp_cleanup_rbuf(struct sock *sk, int copied) 1199void tcp_cleanup_rbuf(struct sock *sk, int copied)
1323{ 1200{
1324 struct tcp_sock *tp = tcp_sk(sk); 1201 struct tcp_sock *tp = tcp_sk(sk);
1325 bool time_to_ack = false; 1202 int time_to_ack = 0;
1326 1203
1204#if TCP_DEBUG
1327 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1205 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1328 1206
1329 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), 1207 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1330 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", 1208 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1331 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); 1209 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1210#endif
1332 1211
1333 if (inet_csk_ack_scheduled(sk)) { 1212 if (inet_csk_ack_scheduled(sk)) {
1334 const struct inet_connection_sock *icsk = inet_csk(sk); 1213 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1348,7 +1227,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1348 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && 1227 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1349 !icsk->icsk_ack.pingpong)) && 1228 !icsk->icsk_ack.pingpong)) &&
1350 !atomic_read(&sk->sk_rmem_alloc))) 1229 !atomic_read(&sk->sk_rmem_alloc)))
1351 time_to_ack = true; 1230 time_to_ack = 1;
1352 } 1231 }
1353 1232
1354 /* We send an ACK if we can now advertise a non-zero window 1233 /* We send an ACK if we can now advertise a non-zero window
@@ -1370,7 +1249,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1370 * "Lots" means "at least twice" here. 1249 * "Lots" means "at least twice" here.
1371 */ 1250 */
1372 if (new_window && new_window >= 2 * rcv_window_now) 1251 if (new_window && new_window >= 2 * rcv_window_now)
1373 time_to_ack = true; 1252 time_to_ack = 1;
1374 } 1253 }
1375 } 1254 }
1376 if (time_to_ack) 1255 if (time_to_ack)
@@ -1428,12 +1307,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
1428} 1307}
1429#endif 1308#endif
1430 1309
1431static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 1310static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1432{ 1311{
1433 struct sk_buff *skb; 1312 struct sk_buff *skb;
1434 u32 offset; 1313 u32 offset;
1435 1314
1436 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 1315 skb_queue_walk(&sk->sk_receive_queue, skb) {
1437 offset = seq - TCP_SKB_CB(skb)->seq; 1316 offset = seq - TCP_SKB_CB(skb)->seq;
1438 if (tcp_hdr(skb)->syn) 1317 if (tcp_hdr(skb)->syn)
1439 offset--; 1318 offset--;
@@ -1441,11 +1320,6 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1441 *off = offset; 1320 *off = offset;
1442 return skb; 1321 return skb;
1443 } 1322 }
1444 /* This looks weird, but this can happen if TCP collapsing
1445 * splitted a fat GRO packet, while we released socket lock
1446 * in skb_splice_bits()
1447 */
1448 sk_eat_skb(sk, skb, false);
1449 } 1323 }
1450 return NULL; 1324 return NULL;
1451} 1325}
@@ -1487,7 +1361,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1487 break; 1361 break;
1488 } 1362 }
1489 used = recv_actor(desc, skb, offset, len); 1363 used = recv_actor(desc, skb, offset, len);
1490 if (used <= 0) { 1364 if (used < 0) {
1491 if (!copied) 1365 if (!copied)
1492 copied = used; 1366 copied = used;
1493 break; 1367 break;
@@ -1496,26 +1370,22 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1496 copied += used; 1370 copied += used;
1497 offset += used; 1371 offset += used;
1498 } 1372 }
1499 /* If recv_actor drops the lock (e.g. TCP splice 1373 /*
1374 * If recv_actor drops the lock (e.g. TCP splice
1500 * receive) the skb pointer might be invalid when 1375 * receive) the skb pointer might be invalid when
1501 * getting here: tcp_collapse might have deleted it 1376 * getting here: tcp_collapse might have deleted it
1502 * while aggregating skbs from the socket queue. 1377 * while aggregating skbs from the socket queue.
1503 */ 1378 */
1504 skb = tcp_recv_skb(sk, seq - 1, &offset); 1379 skb = tcp_recv_skb(sk, seq-1, &offset);
1505 if (!skb) 1380 if (!skb || (offset+1 != skb->len))
1506 break; 1381 break;
1507 /* TCP coalescing might have appended data to the skb.
1508 * Try to splice more frags
1509 */
1510 if (offset + 1 != skb->len)
1511 continue;
1512 } 1382 }
1513 if (tcp_hdr(skb)->fin) { 1383 if (tcp_hdr(skb)->fin) {
1514 sk_eat_skb(sk, skb, false); 1384 sk_eat_skb(sk, skb, 0);
1515 ++seq; 1385 ++seq;
1516 break; 1386 break;
1517 } 1387 }
1518 sk_eat_skb(sk, skb, false); 1388 sk_eat_skb(sk, skb, 0);
1519 if (!desc->count) 1389 if (!desc->count)
1520 break; 1390 break;
1521 tp->copied_seq = seq; 1391 tp->copied_seq = seq;
@@ -1526,9 +1396,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1526 1396
1527 /* Clean up data we have read: This will do ACK frames. */ 1397 /* Clean up data we have read: This will do ACK frames. */
1528 if (copied > 0) { 1398 if (copied > 0) {
1529 tcp_recv_skb(sk, seq, &offset);
1530 tcp_cleanup_rbuf(sk, copied); 1399 tcp_cleanup_rbuf(sk, copied);
1400 uid_stat_tcp_rcv(current_uid(), copied);
1531 } 1401 }
1402
1532 return copied; 1403 return copied;
1533} 1404}
1534EXPORT_SYMBOL(tcp_read_sock); 1405EXPORT_SYMBOL(tcp_read_sock);
@@ -1553,7 +1424,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1553 int target; /* Read at least this many bytes */ 1424 int target; /* Read at least this many bytes */
1554 long timeo; 1425 long timeo;
1555 struct task_struct *user_recv = NULL; 1426 struct task_struct *user_recv = NULL;
1556 bool copied_early = false; 1427 int copied_early = 0;
1557 struct sk_buff *skb; 1428 struct sk_buff *skb;
1558 u32 urg_hole = 0; 1429 u32 urg_hole = 0;
1559 1430
@@ -1569,21 +1440,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1569 if (flags & MSG_OOB) 1440 if (flags & MSG_OOB)
1570 goto recv_urg; 1441 goto recv_urg;
1571 1442
1572 if (unlikely(tp->repair)) {
1573 err = -EPERM;
1574 if (!(flags & MSG_PEEK))
1575 goto out;
1576
1577 if (tp->repair_queue == TCP_SEND_QUEUE)
1578 goto recv_sndq;
1579
1580 err = -EINVAL;
1581 if (tp->repair_queue == TCP_NO_QUEUE)
1582 goto out;
1583
1584 /* 'common' recv queue MSG_PEEK-ing */
1585 }
1586
1587 seq = &tp->copied_seq; 1443 seq = &tp->copied_seq;
1588 if (flags & MSG_PEEK) { 1444 if (flags & MSG_PEEK) {
1589 peek_seq = tp->copied_seq; 1445 peek_seq = tp->copied_seq;
@@ -1604,7 +1460,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1604 if ((available < target) && 1460 if ((available < target) &&
1605 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 1461 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1606 !sysctl_tcp_low_latency && 1462 !sysctl_tcp_low_latency &&
1607 net_dma_find_channel()) { 1463 dma_find_channel(DMA_MEMCPY)) {
1608 preempt_enable_no_resched(); 1464 preempt_enable_no_resched();
1609 tp->ucopy.pinned_list = 1465 tp->ucopy.pinned_list =
1610 dma_pin_iovec_pages(msg->msg_iov, len); 1466 dma_pin_iovec_pages(msg->msg_iov, len);
@@ -1745,14 +1601,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1745 } 1601 }
1746 1602
1747#ifdef CONFIG_NET_DMA 1603#ifdef CONFIG_NET_DMA
1748 if (tp->ucopy.dma_chan) { 1604 if (tp->ucopy.dma_chan)
1749 if (tp->rcv_wnd == 0 && 1605 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1750 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1751 tcp_service_net_dma(sk, true);
1752 tcp_cleanup_rbuf(sk, copied);
1753 } else
1754 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1755 }
1756#endif 1606#endif
1757 if (copied >= target) { 1607 if (copied >= target) {
1758 /* Do not sleep, just process backlog. */ 1608 /* Do not sleep, just process backlog. */
@@ -1791,9 +1641,9 @@ do_prequeue:
1791 } 1641 }
1792 if ((flags & MSG_PEEK) && 1642 if ((flags & MSG_PEEK) &&
1793 (peek_seq - copied - urg_hole != tp->copied_seq)) { 1643 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1794 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", 1644 if (net_ratelimit())
1795 current->comm, 1645 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1796 task_pid_nr(current)); 1646 current->comm, task_pid_nr(current));
1797 peek_seq = tp->copied_seq; 1647 peek_seq = tp->copied_seq;
1798 } 1648 }
1799 continue; 1649 continue;
@@ -1825,7 +1675,7 @@ do_prequeue:
1825 if (!(flags & MSG_TRUNC)) { 1675 if (!(flags & MSG_TRUNC)) {
1826#ifdef CONFIG_NET_DMA 1676#ifdef CONFIG_NET_DMA
1827 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1677 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1828 tp->ucopy.dma_chan = net_dma_find_channel(); 1678 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1829 1679
1830 if (tp->ucopy.dma_chan) { 1680 if (tp->ucopy.dma_chan) {
1831 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( 1681 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
@@ -1835,8 +1685,7 @@ do_prequeue:
1835 1685
1836 if (tp->ucopy.dma_cookie < 0) { 1686 if (tp->ucopy.dma_cookie < 0) {
1837 1687
1838 pr_alert("%s: dma_cookie < 0\n", 1688 printk(KERN_ALERT "dma_cookie < 0\n");
1839 __func__);
1840 1689
1841 /* Exception. Bailout! */ 1690 /* Exception. Bailout! */
1842 if (!copied) 1691 if (!copied)
@@ -1847,7 +1696,7 @@ do_prequeue:
1847 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); 1696 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1848 1697
1849 if ((offset + used) == skb->len) 1698 if ((offset + used) == skb->len)
1850 copied_early = true; 1699 copied_early = 1;
1851 1700
1852 } else 1701 } else
1853#endif 1702#endif
@@ -1881,7 +1730,7 @@ skip_copy:
1881 goto found_fin_ok; 1730 goto found_fin_ok;
1882 if (!(flags & MSG_PEEK)) { 1731 if (!(flags & MSG_PEEK)) {
1883 sk_eat_skb(sk, skb, copied_early); 1732 sk_eat_skb(sk, skb, copied_early);
1884 copied_early = false; 1733 copied_early = 0;
1885 } 1734 }
1886 continue; 1735 continue;
1887 1736
@@ -1890,7 +1739,7 @@ skip_copy:
1890 ++*seq; 1739 ++*seq;
1891 if (!(flags & MSG_PEEK)) { 1740 if (!(flags & MSG_PEEK)) {
1892 sk_eat_skb(sk, skb, copied_early); 1741 sk_eat_skb(sk, skb, copied_early);
1893 copied_early = false; 1742 copied_early = 0;
1894 } 1743 }
1895 break; 1744 break;
1896 } while (len > 0); 1745 } while (len > 0);
@@ -1932,6 +1781,9 @@ skip_copy:
1932 tcp_cleanup_rbuf(sk, copied); 1781 tcp_cleanup_rbuf(sk, copied);
1933 1782
1934 release_sock(sk); 1783 release_sock(sk);
1784
1785 if (copied > 0)
1786 uid_stat_tcp_rcv(current_uid(), copied);
1935 return copied; 1787 return copied;
1936 1788
1937out: 1789out:
@@ -1940,10 +1792,8 @@ out:
1940 1792
1941recv_urg: 1793recv_urg:
1942 err = tcp_recv_urg(sk, msg, len, flags); 1794 err = tcp_recv_urg(sk, msg, len, flags);
1943 goto out; 1795 if (err > 0)
1944 1796 uid_stat_tcp_rcv(current_uid(), err);
1945recv_sndq:
1946 err = tcp_peek_sndq(sk, msg, len);
1947 goto out; 1797 goto out;
1948} 1798}
1949EXPORT_SYMBOL(tcp_recvmsg); 1799EXPORT_SYMBOL(tcp_recvmsg);
@@ -2041,20 +1891,6 @@ void tcp_shutdown(struct sock *sk, int how)
2041} 1891}
2042EXPORT_SYMBOL(tcp_shutdown); 1892EXPORT_SYMBOL(tcp_shutdown);
2043 1893
2044bool tcp_check_oom(struct sock *sk, int shift)
2045{
2046 bool too_many_orphans, out_of_socket_memory;
2047
2048 too_many_orphans = tcp_too_many_orphans(sk, shift);
2049 out_of_socket_memory = tcp_out_of_memory(sk);
2050
2051 if (too_many_orphans)
2052 net_info_ratelimited("too many orphaned sockets\n");
2053 if (out_of_socket_memory)
2054 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2055 return too_many_orphans || out_of_socket_memory;
2056}
2057
2058void tcp_close(struct sock *sk, long timeout) 1894void tcp_close(struct sock *sk, long timeout)
2059{ 1895{
2060 struct sk_buff *skb; 1896 struct sk_buff *skb;
@@ -2097,9 +1933,7 @@ void tcp_close(struct sock *sk, long timeout)
2097 * advertise a zero window, then kill -9 the FTP client, wheee... 1933 * advertise a zero window, then kill -9 the FTP client, wheee...
2098 * Note: timeout is always zero in such a case. 1934 * Note: timeout is always zero in such a case.
2099 */ 1935 */
2100 if (unlikely(tcp_sk(sk)->repair)) { 1936 if (data_was_unread) {
2101 sk->sk_prot->disconnect(sk, 0);
2102 } else if (data_was_unread) {
2103 /* Unread data was tossed, zap the connection. */ 1937 /* Unread data was tossed, zap the connection. */
2104 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 1938 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2105 tcp_set_state(sk, TCP_CLOSE); 1939 tcp_set_state(sk, TCP_CLOSE);
@@ -2133,10 +1967,6 @@ void tcp_close(struct sock *sk, long timeout)
2133 * they look as CLOSING or LAST_ACK for Linux) 1967 * they look as CLOSING or LAST_ACK for Linux)
2134 * Probably, I missed some more holelets. 1968 * Probably, I missed some more holelets.
2135 * --ANK 1969 * --ANK
2136 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2137 * in a single packet! (May consider it later but will
2138 * probably need API support or TCP_CORK SYN-ACK until
2139 * data is written and socket is closed.)
2140 */ 1970 */
2141 tcp_send_fin(sk); 1971 tcp_send_fin(sk);
2142 } 1972 }
@@ -2200,7 +2030,10 @@ adjudge_to_death:
2200 } 2030 }
2201 if (sk->sk_state != TCP_CLOSE) { 2031 if (sk->sk_state != TCP_CLOSE) {
2202 sk_mem_reclaim(sk); 2032 sk_mem_reclaim(sk);
2203 if (tcp_check_oom(sk, 0)) { 2033 if (tcp_too_many_orphans(sk, 0)) {
2034 if (net_ratelimit())
2035 printk(KERN_INFO "TCP: too many of orphaned "
2036 "sockets\n");
2204 tcp_set_state(sk, TCP_CLOSE); 2037 tcp_set_state(sk, TCP_CLOSE);
2205 tcp_send_active_reset(sk, GFP_ATOMIC); 2038 tcp_send_active_reset(sk, GFP_ATOMIC);
2206 NET_INC_STATS_BH(sock_net(sk), 2039 NET_INC_STATS_BH(sock_net(sk),
@@ -2208,16 +2041,8 @@ adjudge_to_death:
2208 } 2041 }
2209 } 2042 }
2210 2043
2211 if (sk->sk_state == TCP_CLOSE) { 2044 if (sk->sk_state == TCP_CLOSE)
2212 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2213 /* We could get here with a non-NULL req if the socket is
2214 * aborted (e.g., closed with unread data) before 3WHS
2215 * finishes.
2216 */
2217 if (req != NULL)
2218 reqsk_fastopen_remove(sk, req, false);
2219 inet_csk_destroy_sock(sk); 2045 inet_csk_destroy_sock(sk);
2220 }
2221 /* Otherwise, socket is reprieved until protocol close. */ 2046 /* Otherwise, socket is reprieved until protocol close. */
2222 2047
2223out: 2048out:
@@ -2229,7 +2054,7 @@ EXPORT_SYMBOL(tcp_close);
2229 2054
2230/* These states need RST on ABORT according to RFC793 */ 2055/* These states need RST on ABORT according to RFC793 */
2231 2056
2232static inline bool tcp_need_reset(int state) 2057static inline int tcp_need_reset(int state)
2233{ 2058{
2234 return (1 << state) & 2059 return (1 << state) &
2235 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | 2060 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
@@ -2250,8 +2075,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2250 /* ABORT function of RFC793 */ 2075 /* ABORT function of RFC793 */
2251 if (old_state == TCP_LISTEN) { 2076 if (old_state == TCP_LISTEN) {
2252 inet_csk_listen_stop(sk); 2077 inet_csk_listen_stop(sk);
2253 } else if (unlikely(tp->repair)) {
2254 sk->sk_err = ECONNABORTED;
2255 } else if (tcp_need_reset(old_state) || 2078 } else if (tcp_need_reset(old_state) ||
2256 (tp->snd_nxt != tp->write_seq && 2079 (tp->snd_nxt != tp->write_seq &&
2257 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 2080 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2303,68 +2126,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2303} 2126}
2304EXPORT_SYMBOL(tcp_disconnect); 2127EXPORT_SYMBOL(tcp_disconnect);
2305 2128
2306void tcp_sock_destruct(struct sock *sk)
2307{
2308 inet_sock_destruct(sk);
2309
2310 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2311}
2312
2313static inline bool tcp_can_repair_sock(const struct sock *sk)
2314{
2315 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2316 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2317}
2318
2319static int tcp_repair_options_est(struct tcp_sock *tp,
2320 struct tcp_repair_opt __user *optbuf, unsigned int len)
2321{
2322 struct tcp_repair_opt opt;
2323
2324 while (len >= sizeof(opt)) {
2325 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2326 return -EFAULT;
2327
2328 optbuf++;
2329 len -= sizeof(opt);
2330
2331 switch (opt.opt_code) {
2332 case TCPOPT_MSS:
2333 tp->rx_opt.mss_clamp = opt.opt_val;
2334 break;
2335 case TCPOPT_WINDOW:
2336 {
2337 u16 snd_wscale = opt.opt_val & 0xFFFF;
2338 u16 rcv_wscale = opt.opt_val >> 16;
2339
2340 if (snd_wscale > 14 || rcv_wscale > 14)
2341 return -EFBIG;
2342
2343 tp->rx_opt.snd_wscale = snd_wscale;
2344 tp->rx_opt.rcv_wscale = rcv_wscale;
2345 tp->rx_opt.wscale_ok = 1;
2346 }
2347 break;
2348 case TCPOPT_SACK_PERM:
2349 if (opt.opt_val != 0)
2350 return -EINVAL;
2351
2352 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2353 if (sysctl_tcp_fack)
2354 tcp_enable_fack(tp);
2355 break;
2356 case TCPOPT_TIMESTAMP:
2357 if (opt.opt_val != 0)
2358 return -EINVAL;
2359
2360 tp->rx_opt.tstamp_ok = 1;
2361 break;
2362 }
2363 }
2364
2365 return 0;
2366}
2367
2368/* 2129/*
2369 * Socket option code for TCP. 2130 * Socket option code for TCP.
2370 */ 2131 */
@@ -2535,55 +2296,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2535 err = -EINVAL; 2296 err = -EINVAL;
2536 else 2297 else
2537 tp->thin_dupack = val; 2298 tp->thin_dupack = val;
2538 if (tp->thin_dupack)
2539 tcp_disable_early_retrans(tp);
2540 break;
2541
2542 case TCP_REPAIR:
2543 if (!tcp_can_repair_sock(sk))
2544 err = -EPERM;
2545 else if (val == 1) {
2546 tp->repair = 1;
2547 sk->sk_reuse = SK_FORCE_REUSE;
2548 tp->repair_queue = TCP_NO_QUEUE;
2549 } else if (val == 0) {
2550 tp->repair = 0;
2551 sk->sk_reuse = SK_NO_REUSE;
2552 tcp_send_window_probe(sk);
2553 } else
2554 err = -EINVAL;
2555
2556 break;
2557
2558 case TCP_REPAIR_QUEUE:
2559 if (!tp->repair)
2560 err = -EPERM;
2561 else if (val < TCP_QUEUES_NR)
2562 tp->repair_queue = val;
2563 else
2564 err = -EINVAL;
2565 break;
2566
2567 case TCP_QUEUE_SEQ:
2568 if (sk->sk_state != TCP_CLOSE)
2569 err = -EPERM;
2570 else if (tp->repair_queue == TCP_SEND_QUEUE)
2571 tp->write_seq = val;
2572 else if (tp->repair_queue == TCP_RECV_QUEUE)
2573 tp->rcv_nxt = val;
2574 else
2575 err = -EINVAL;
2576 break;
2577
2578 case TCP_REPAIR_OPTIONS:
2579 if (!tp->repair)
2580 err = -EINVAL;
2581 else if (sk->sk_state == TCP_ESTABLISHED)
2582 err = tcp_repair_options_est(tp,
2583 (struct tcp_repair_opt __user *)optval,
2584 optlen);
2585 else
2586 err = -EPERM;
2587 break; 2299 break;
2588 2300
2589 case TCP_CORK: 2301 case TCP_CORK:
@@ -2698,18 +2410,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2698 /* Cap the max timeout in ms TCP will retry/retrans 2410 /* Cap the max timeout in ms TCP will retry/retrans
2699 * before giving up and aborting (ETIMEDOUT) a connection. 2411 * before giving up and aborting (ETIMEDOUT) a connection.
2700 */ 2412 */
2701 if (val < 0) 2413 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2702 err = -EINVAL;
2703 else
2704 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2705 break;
2706
2707 case TCP_FASTOPEN:
2708 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2709 TCPF_LISTEN)))
2710 err = fastopen_init_queue(sk, val);
2711 else
2712 err = -EINVAL;
2713 break; 2414 break;
2714 default: 2415 default:
2715 err = -ENOPROTOOPT; 2416 err = -ENOPROTOOPT;
@@ -2723,7 +2424,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2723int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, 2424int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2724 unsigned int optlen) 2425 unsigned int optlen)
2725{ 2426{
2726 const struct inet_connection_sock *icsk = inet_csk(sk); 2427 struct inet_connection_sock *icsk = inet_csk(sk);
2727 2428
2728 if (level != SOL_TCP) 2429 if (level != SOL_TCP)
2729 return icsk->icsk_af_ops->setsockopt(sk, level, optname, 2430 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
@@ -2745,9 +2446,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
2745#endif 2446#endif
2746 2447
2747/* Return information about state of tcp endpoint in API format. */ 2448/* Return information about state of tcp endpoint in API format. */
2748void tcp_get_info(const struct sock *sk, struct tcp_info *info) 2449void tcp_get_info(struct sock *sk, struct tcp_info *info)
2749{ 2450{
2750 const struct tcp_sock *tp = tcp_sk(sk); 2451 struct tcp_sock *tp = tcp_sk(sk);
2751 const struct inet_connection_sock *icsk = inet_csk(sk); 2452 const struct inet_connection_sock *icsk = inet_csk(sk);
2752 u32 now = tcp_time_stamp; 2453 u32 now = tcp_time_stamp;
2753 2454
@@ -2769,12 +2470,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2769 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; 2470 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2770 } 2471 }
2771 2472
2772 if (tp->ecn_flags & TCP_ECN_OK) 2473 if (tp->ecn_flags&TCP_ECN_OK)
2773 info->tcpi_options |= TCPI_OPT_ECN; 2474 info->tcpi_options |= TCPI_OPT_ECN;
2774 if (tp->ecn_flags & TCP_ECN_SEEN)
2775 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2776 if (tp->syn_data_acked)
2777 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2778 2475
2779 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); 2476 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2780 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); 2477 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
@@ -2832,8 +2529,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2832 val = tp->mss_cache; 2529 val = tp->mss_cache;
2833 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2530 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2834 val = tp->rx_opt.user_mss; 2531 val = tp->rx_opt.user_mss;
2835 if (tp->repair)
2836 val = tp->rx_opt.mss_clamp;
2837 break; 2532 break;
2838 case TCP_NODELAY: 2533 case TCP_NODELAY:
2839 val = !!(tp->nonagle&TCP_NAGLE_OFF); 2534 val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2936,26 +2631,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2936 val = tp->thin_dupack; 2631 val = tp->thin_dupack;
2937 break; 2632 break;
2938 2633
2939 case TCP_REPAIR:
2940 val = tp->repair;
2941 break;
2942
2943 case TCP_REPAIR_QUEUE:
2944 if (tp->repair)
2945 val = tp->repair_queue;
2946 else
2947 return -EINVAL;
2948 break;
2949
2950 case TCP_QUEUE_SEQ:
2951 if (tp->repair_queue == TCP_SEND_QUEUE)
2952 val = tp->write_seq;
2953 else if (tp->repair_queue == TCP_RECV_QUEUE)
2954 val = tp->rcv_nxt;
2955 else
2956 return -EINVAL;
2957 break;
2958
2959 case TCP_USER_TIMEOUT: 2634 case TCP_USER_TIMEOUT:
2960 val = jiffies_to_msecs(icsk->icsk_user_timeout); 2635 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2961 break; 2636 break;
@@ -2994,12 +2669,11 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2994EXPORT_SYMBOL(compat_tcp_getsockopt); 2669EXPORT_SYMBOL(compat_tcp_getsockopt);
2995#endif 2670#endif
2996 2671
2997struct sk_buff *tcp_tso_segment(struct sk_buff *skb, 2672struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2998 netdev_features_t features)
2999{ 2673{
3000 struct sk_buff *segs = ERR_PTR(-EINVAL); 2674 struct sk_buff *segs = ERR_PTR(-EINVAL);
3001 struct tcphdr *th; 2675 struct tcphdr *th;
3002 unsigned int thlen; 2676 unsigned thlen;
3003 unsigned int seq; 2677 unsigned int seq;
3004 __be32 delta; 2678 __be32 delta;
3005 unsigned int oldlen; 2679 unsigned int oldlen;
@@ -3198,25 +2872,26 @@ EXPORT_SYMBOL(tcp_gro_complete);
3198 2872
3199#ifdef CONFIG_TCP_MD5SIG 2873#ifdef CONFIG_TCP_MD5SIG
3200static unsigned long tcp_md5sig_users; 2874static unsigned long tcp_md5sig_users;
3201static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; 2875static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
3202static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); 2876static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
3203 2877
3204static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) 2878static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
3205{ 2879{
3206 int cpu; 2880 int cpu;
3207
3208 for_each_possible_cpu(cpu) { 2881 for_each_possible_cpu(cpu) {
3209 struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); 2882 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
3210 2883 if (p) {
3211 if (p->md5_desc.tfm) 2884 if (p->md5_desc.tfm)
3212 crypto_free_hash(p->md5_desc.tfm); 2885 crypto_free_hash(p->md5_desc.tfm);
2886 kfree(p);
2887 }
3213 } 2888 }
3214 free_percpu(pool); 2889 free_percpu(pool);
3215} 2890}
3216 2891
3217void tcp_free_md5sig_pool(void) 2892void tcp_free_md5sig_pool(void)
3218{ 2893{
3219 struct tcp_md5sig_pool __percpu *pool = NULL; 2894 struct tcp_md5sig_pool * __percpu *pool = NULL;
3220 2895
3221 spin_lock_bh(&tcp_md5sig_pool_lock); 2896 spin_lock_bh(&tcp_md5sig_pool_lock);
3222 if (--tcp_md5sig_users == 0) { 2897 if (--tcp_md5sig_users == 0) {
@@ -3229,24 +2904,30 @@ void tcp_free_md5sig_pool(void)
3229} 2904}
3230EXPORT_SYMBOL(tcp_free_md5sig_pool); 2905EXPORT_SYMBOL(tcp_free_md5sig_pool);
3231 2906
3232static struct tcp_md5sig_pool __percpu * 2907static struct tcp_md5sig_pool * __percpu *
3233__tcp_alloc_md5sig_pool(struct sock *sk) 2908__tcp_alloc_md5sig_pool(struct sock *sk)
3234{ 2909{
3235 int cpu; 2910 int cpu;
3236 struct tcp_md5sig_pool __percpu *pool; 2911 struct tcp_md5sig_pool * __percpu *pool;
3237 2912
3238 pool = alloc_percpu(struct tcp_md5sig_pool); 2913 pool = alloc_percpu(struct tcp_md5sig_pool *);
3239 if (!pool) 2914 if (!pool)
3240 return NULL; 2915 return NULL;
3241 2916
3242 for_each_possible_cpu(cpu) { 2917 for_each_possible_cpu(cpu) {
2918 struct tcp_md5sig_pool *p;
3243 struct crypto_hash *hash; 2919 struct crypto_hash *hash;
3244 2920
2921 p = kzalloc(sizeof(*p), sk->sk_allocation);
2922 if (!p)
2923 goto out_free;
2924 *per_cpu_ptr(pool, cpu) = p;
2925
3245 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 2926 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
3246 if (!hash || IS_ERR(hash)) 2927 if (!hash || IS_ERR(hash))
3247 goto out_free; 2928 goto out_free;
3248 2929
3249 per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; 2930 p->md5_desc.tfm = hash;
3250 } 2931 }
3251 return pool; 2932 return pool;
3252out_free: 2933out_free:
@@ -3254,16 +2935,16 @@ out_free:
3254 return NULL; 2935 return NULL;
3255} 2936}
3256 2937
3257struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) 2938struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
3258{ 2939{
3259 struct tcp_md5sig_pool __percpu *pool; 2940 struct tcp_md5sig_pool * __percpu *pool;
3260 bool alloc = false; 2941 int alloc = 0;
3261 2942
3262retry: 2943retry:
3263 spin_lock_bh(&tcp_md5sig_pool_lock); 2944 spin_lock_bh(&tcp_md5sig_pool_lock);
3264 pool = tcp_md5sig_pool; 2945 pool = tcp_md5sig_pool;
3265 if (tcp_md5sig_users++ == 0) { 2946 if (tcp_md5sig_users++ == 0) {
3266 alloc = true; 2947 alloc = 1;
3267 spin_unlock_bh(&tcp_md5sig_pool_lock); 2948 spin_unlock_bh(&tcp_md5sig_pool_lock);
3268 } else if (!pool) { 2949 } else if (!pool) {
3269 tcp_md5sig_users--; 2950 tcp_md5sig_users--;
@@ -3275,7 +2956,7 @@ retry:
3275 2956
3276 if (alloc) { 2957 if (alloc) {
3277 /* we cannot hold spinlock here because this may sleep. */ 2958 /* we cannot hold spinlock here because this may sleep. */
3278 struct tcp_md5sig_pool __percpu *p; 2959 struct tcp_md5sig_pool * __percpu *p;
3279 2960
3280 p = __tcp_alloc_md5sig_pool(sk); 2961 p = __tcp_alloc_md5sig_pool(sk);
3281 spin_lock_bh(&tcp_md5sig_pool_lock); 2962 spin_lock_bh(&tcp_md5sig_pool_lock);
@@ -3308,7 +2989,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3308 */ 2989 */
3309struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) 2990struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3310{ 2991{
3311 struct tcp_md5sig_pool __percpu *p; 2992 struct tcp_md5sig_pool * __percpu *p;
3312 2993
3313 local_bh_disable(); 2994 local_bh_disable();
3314 2995
@@ -3319,7 +3000,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3319 spin_unlock(&tcp_md5sig_pool_lock); 3000 spin_unlock(&tcp_md5sig_pool_lock);
3320 3001
3321 if (p) 3002 if (p)
3322 return this_cpu_ptr(p); 3003 return *this_cpu_ptr(p);
3323 3004
3324 local_bh_enable(); 3005 local_bh_enable();
3325 return NULL; 3006 return NULL;
@@ -3334,32 +3015,30 @@ void tcp_put_md5sig_pool(void)
3334EXPORT_SYMBOL(tcp_put_md5sig_pool); 3015EXPORT_SYMBOL(tcp_put_md5sig_pool);
3335 3016
3336int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, 3017int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3337 const struct tcphdr *th) 3018 struct tcphdr *th)
3338{ 3019{
3339 struct scatterlist sg; 3020 struct scatterlist sg;
3340 struct tcphdr hdr;
3341 int err; 3021 int err;
3342 3022
3343 /* We are not allowed to change tcphdr, make a local copy */ 3023 __sum16 old_checksum = th->check;
3344 memcpy(&hdr, th, sizeof(hdr)); 3024 th->check = 0;
3345 hdr.check = 0;
3346
3347 /* options aren't included in the hash */ 3025 /* options aren't included in the hash */
3348 sg_init_one(&sg, &hdr, sizeof(hdr)); 3026 sg_init_one(&sg, th, sizeof(struct tcphdr));
3349 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); 3027 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
3028 th->check = old_checksum;
3350 return err; 3029 return err;
3351} 3030}
3352EXPORT_SYMBOL(tcp_md5_hash_header); 3031EXPORT_SYMBOL(tcp_md5_hash_header);
3353 3032
3354int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 3033int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3355 const struct sk_buff *skb, unsigned int header_len) 3034 struct sk_buff *skb, unsigned header_len)
3356{ 3035{
3357 struct scatterlist sg; 3036 struct scatterlist sg;
3358 const struct tcphdr *tp = tcp_hdr(skb); 3037 const struct tcphdr *tp = tcp_hdr(skb);
3359 struct hash_desc *desc = &hp->md5_desc; 3038 struct hash_desc *desc = &hp->md5_desc;
3360 unsigned int i; 3039 unsigned i;
3361 const unsigned int head_data_len = skb_headlen(skb) > header_len ? 3040 const unsigned head_data_len = skb_headlen(skb) > header_len ?
3362 skb_headlen(skb) - header_len : 0; 3041 skb_headlen(skb) - header_len : 0;
3363 const struct skb_shared_info *shi = skb_shinfo(skb); 3042 const struct skb_shared_info *shi = skb_shinfo(skb);
3364 struct sk_buff *frag_iter; 3043 struct sk_buff *frag_iter;
3365 3044
@@ -3371,9 +3050,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3371 3050
3372 for (i = 0; i < shi->nr_frags; ++i) { 3051 for (i = 0; i < shi->nr_frags; ++i) {
3373 const struct skb_frag_struct *f = &shi->frags[i]; 3052 const struct skb_frag_struct *f = &shi->frags[i];
3374 struct page *page = skb_frag_page(f); 3053 sg_set_page(&sg, f->page, f->size, f->page_offset);
3375 sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); 3054 if (crypto_hash_update(desc, &sg, f->size))
3376 if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3377 return 1; 3055 return 1;
3378 } 3056 }
3379 3057
@@ -3385,7 +3063,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3385} 3063}
3386EXPORT_SYMBOL(tcp_md5_hash_skb_data); 3064EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3387 3065
3388int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) 3066int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
3389{ 3067{
3390 struct scatterlist sg; 3068 struct scatterlist sg;
3391 3069
@@ -3396,7 +3074,8 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
3396 3074
3397#endif 3075#endif
3398 3076
3399/* Each Responder maintains up to two secret values concurrently for 3077/**
3078 * Each Responder maintains up to two secret values concurrently for
3400 * efficient secret rollover. Each secret value has 4 states: 3079 * efficient secret rollover. Each secret value has 4 states:
3401 * 3080 *
3402 * Generating. (tcp_secret_generating != tcp_secret_primary) 3081 * Generating. (tcp_secret_generating != tcp_secret_primary)
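The comment block is cut off after the first of its four states, but the pointer names it refers to (tcp_secret_generating, tcp_secret_primary, tcp_secret_retiring, tcp_secret_secondary) all appear near the end of this diff. A hedged sketch, with a hypothetical helper name, of how the quoted "Generating" condition would be tested:

	/* Hypothetical helper: a secret is still being generated while the
	 * generating pointer has not yet been promoted to primary. */
	static bool tcp_secret_still_generating(void)
	{
		return tcp_secret_generating != tcp_secret_primary;
	}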
@@ -3526,15 +3205,11 @@ EXPORT_SYMBOL(tcp_cookie_generator);
3526 3205
3527void tcp_done(struct sock *sk) 3206void tcp_done(struct sock *sk)
3528{ 3207{
3529 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3530
3531 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3208 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3532 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 3209 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3533 3210
3534 tcp_set_state(sk, TCP_CLOSE); 3211 tcp_set_state(sk, TCP_CLOSE);
3535 tcp_clear_xmit_timers(sk); 3212 tcp_clear_xmit_timers(sk);
3536 if (req != NULL)
3537 reqsk_fastopen_remove(sk, req, false);
3538 3213
3539 sk->sk_shutdown = SHUTDOWN_MASK; 3214 sk->sk_shutdown = SHUTDOWN_MASK;
3540 3215
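The left column's tcp_done() also detaches a pending TCP Fast Open request before closing; the right column predates Fast Open, so that cleanup disappears. The removed logic, single-column:

	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;

	/* ... state change and timer teardown as shown above ... */

	if (req != NULL)
		reqsk_fastopen_remove(sk, req, false);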
@@ -3550,34 +3225,18 @@ extern struct tcp_congestion_ops tcp_reno;
3550static __initdata unsigned long thash_entries; 3225static __initdata unsigned long thash_entries;
3551static int __init set_thash_entries(char *str) 3226static int __init set_thash_entries(char *str)
3552{ 3227{
3553 ssize_t ret;
3554
3555 if (!str) 3228 if (!str)
3556 return 0; 3229 return 0;
3557 3230 thash_entries = simple_strtoul(str, &str, 0);
3558 ret = kstrtoul(str, 0, &thash_entries);
3559 if (ret)
3560 return 0;
3561
3562 return 1; 3231 return 1;
3563} 3232}
3564__setup("thash_entries=", set_thash_entries); 3233__setup("thash_entries=", set_thash_entries);
3565 3234
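Both columns register the same thash_entries= boot parameter, but the left column parses it with kstrtoul(), which rejects malformed or trailing input, whereas the right column's simple_strtoul() silently accepts whatever numeric prefix it finds. The stricter variant from the left column, single-column:

	static int __init set_thash_entries(char *str)
	{
		ssize_t ret;

		if (!str)
			return 0;

		ret = kstrtoul(str, 0, &thash_entries);
		if (ret)
			return 0;

		return 1;
	}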
3566void tcp_init_mem(struct net *net)
3567{
3568 unsigned long limit = nr_free_buffer_pages() / 8;
3569 limit = max(limit, 128UL);
3570 net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3571 net->ipv4.sysctl_tcp_mem[1] = limit;
3572 net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3573}
3574
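The removed tcp_init_mem() and the inline block later in tcp_init() (right column) size the tcp_mem[] thresholds identically; the only difference is per-netns storage versus the global sysctl array. A worked example, assuming a hypothetical machine where nr_free_buffer_pages() returns 262144 pages (1 GiB of 4 KiB pages):

	unsigned long limit = 262144 / 8;	/* 32768 pages */
	limit = max(limit, 128UL);		/* still 32768 */
	tcp_mem[0] = limit / 4 * 3;		/* 24576 pages: below this, no memory pressure */
	tcp_mem[1] = limit;			/* 32768 pages: pressure threshold */
	tcp_mem[2] = tcp_mem[0] * 2;		/* 49152 pages: hard limit */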
3575void __init tcp_init(void) 3235void __init tcp_init(void)
3576{ 3236{
3577 struct sk_buff *skb = NULL; 3237 struct sk_buff *skb = NULL;
3578 unsigned long limit; 3238 unsigned long limit;
3579 int max_rshare, max_wshare, cnt; 3239 int i, max_share, cnt;
3580 unsigned int i;
3581 unsigned long jiffy = jiffies; 3240 unsigned long jiffy = jiffies;
3582 3241
3583 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3242 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3598,11 +3257,11 @@ void __init tcp_init(void)
3598 alloc_large_system_hash("TCP established", 3257 alloc_large_system_hash("TCP established",
3599 sizeof(struct inet_ehash_bucket), 3258 sizeof(struct inet_ehash_bucket),
3600 thash_entries, 3259 thash_entries,
3601 17, /* one slot per 128 KB of memory */ 3260 (totalram_pages >= 128 * 1024) ?
3261 13 : 15,
3602 0, 3262 0,
3603 NULL, 3263 NULL,
3604 &tcp_hashinfo.ehash_mask, 3264 &tcp_hashinfo.ehash_mask,
3605 0,
3606 thash_entries ? 0 : 512 * 1024); 3265 thash_entries ? 0 : 512 * 1024);
3607 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { 3266 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
3608 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 3267 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
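The two columns size the established hash differently: the left passes a fixed scale of 17, i.e. one slot per 128 KiB (1 << 17 bytes) of memory, while the right picks a shift of 13 or 15 depending on totalram_pages. A worked example for the left column's scheme, assuming 4 GiB of usable memory:

	unsigned long mem_bytes = 4UL << 30;	/* assume 4 GiB */
	unsigned long slots = mem_bytes >> 17;	/* 32768 established-hash slots */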
@@ -3614,13 +3273,13 @@ void __init tcp_init(void)
3614 alloc_large_system_hash("TCP bind", 3273 alloc_large_system_hash("TCP bind",
3615 sizeof(struct inet_bind_hashbucket), 3274 sizeof(struct inet_bind_hashbucket),
3616 tcp_hashinfo.ehash_mask + 1, 3275 tcp_hashinfo.ehash_mask + 1,
3617 17, /* one slot per 128 KB of memory */ 3276 (totalram_pages >= 128 * 1024) ?
3277 13 : 15,
3618 0, 3278 0,
3619 &tcp_hashinfo.bhash_size, 3279 &tcp_hashinfo.bhash_size,
3620 NULL, 3280 NULL,
3621 0,
3622 64 * 1024); 3281 64 * 1024);
3623 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; 3282 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
3624 for (i = 0; i < tcp_hashinfo.bhash_size; i++) { 3283 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3625 spin_lock_init(&tcp_hashinfo.bhash[i].lock); 3284 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3626 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3285 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
@@ -3633,24 +3292,27 @@ void __init tcp_init(void)
3633 sysctl_tcp_max_orphans = cnt / 2; 3292 sysctl_tcp_max_orphans = cnt / 2;
3634 sysctl_max_syn_backlog = max(128, cnt / 256); 3293 sysctl_max_syn_backlog = max(128, cnt / 256);
3635 3294
3636 tcp_init_mem(&init_net); 3295 limit = nr_free_buffer_pages() / 8;
3296 limit = max(limit, 128UL);
3297 sysctl_tcp_mem[0] = limit / 4 * 3;
3298 sysctl_tcp_mem[1] = limit;
3299 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3300
3637 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 3301 /* Set per-socket limits to no more than 1/128 the pressure threshold */
3638 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); 3302 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
3639 max_wshare = min(4UL*1024*1024, limit); 3303 max_share = min(4UL*1024*1024, limit);
3640 max_rshare = min(6UL*1024*1024, limit);
3641 3304
3642 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 3305 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3643 sysctl_tcp_wmem[1] = 16*1024; 3306 sysctl_tcp_wmem[1] = 16*1024;
3644 sysctl_tcp_wmem[2] = max(64*1024, max_wshare); 3307 sysctl_tcp_wmem[2] = max(64*1024, max_share);
3645 3308
3646 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; 3309 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3647 sysctl_tcp_rmem[1] = 87380; 3310 sysctl_tcp_rmem[1] = 87380;
3648 sysctl_tcp_rmem[2] = max(87380, max_rshare); 3311 sysctl_tcp_rmem[2] = max(87380, max_share);
3649 3312
3650 pr_info("Hash tables configured (established %u bind %u)\n", 3313 printk(KERN_INFO "TCP: Hash tables configured "
3651 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3314 "(established %u bind %u)\n",
3652 3315 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3653 tcp_metrics_init();
3654 3316
3655 tcp_register_congestion_control(&tcp_reno); 3317 tcp_register_congestion_control(&tcp_reno);
3656 3318
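Both columns clamp the per-socket buffers to 1/128 of a memory figure (the shift by PAGE_SHIFT - 7 converts pages to bytes and divides by 128): the left column starts from nr_free_buffer_pages() and keeps separate write/read shares, the right starts from sysctl_tcp_mem[1] and uses a single max_share. A worked example, assuming PAGE_SHIFT == 12 and the hypothetical 32768-page tcp_mem[1] from the earlier sketch:

	unsigned long limit = 32768UL << (12 - 7);		/* 1 MiB = 1/128 of 128 MiB */
	unsigned long max_wshare = min(4UL*1024*1024, limit);	/* 1 MiB */
	unsigned long max_rshare = min(6UL*1024*1024, limit);	/* 1 MiB */
	/* sysctl_tcp_wmem[2] = max(64*1024, max_wshare)  ->  1 MiB
	 * sysctl_tcp_rmem[2] = max(87380,  max_rshare)   ->  1 MiB */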
@@ -3662,5 +3324,108 @@ void __init tcp_init(void)
3662 tcp_secret_primary = &tcp_secret_one; 3324 tcp_secret_primary = &tcp_secret_one;
3663 tcp_secret_retiring = &tcp_secret_two; 3325 tcp_secret_retiring = &tcp_secret_two;
3664 tcp_secret_secondary = &tcp_secret_two; 3326 tcp_secret_secondary = &tcp_secret_two;
3665 tcp_tasklet_init(); 3327}
3328
3329static int tcp_is_local(struct net *net, __be32 addr) {
3330 struct rtable *rt;
3331 struct flowi4 fl4 = { .daddr = addr };
3332 rt = ip_route_output_key(net, &fl4);
3333 if (IS_ERR_OR_NULL(rt))
3334 return 0;
3335 return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK);
3336}
3337
3338#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3339static int tcp_is_local6(struct net *net, struct in6_addr *addr) {
3340 struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0);
3341 return rt6 && rt6->rt6i_dev && (rt6->rt6i_dev->flags & IFF_LOOPBACK);
3342}
3343#endif
3344
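tcp_is_local() and tcp_is_local6() simply ask the routing tables whether an address routes to a loopback device. A hypothetical caller sketch (only tcp_is_local() comes from the added code above; the printk is illustrative):

	/* True for 127.0.0.1 on any normally configured host. */
	if (tcp_is_local(net, htonl(INADDR_LOOPBACK)))
		printk(KERN_DEBUG "address is served by a loopback device\n");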
3345/*
3346 * tcp_nuke_addr - destroy all sockets on the given local address
3347 * if local address is the unspecified address (0.0.0.0 or ::), destroy all
3348 * sockets with local addresses that are not configured.
3349 */
3350int tcp_nuke_addr(struct net *net, struct sockaddr *addr)
3351{
3352 int family = addr->sa_family;
3353 unsigned int bucket;
3354
3355 struct in_addr *in;
3356#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3357 struct in6_addr *in6;
3358#endif
3359 if (family == AF_INET) {
3360 in = &((struct sockaddr_in *)addr)->sin_addr;
3361#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3362 } else if (family == AF_INET6) {
3363 in6 = &((struct sockaddr_in6 *)addr)->sin6_addr;
3364#endif
3365 } else {
3366 return -EAFNOSUPPORT;
3367 }
3368
3369 for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
3370 struct hlist_nulls_node *node;
3371 struct sock *sk;
3372 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
3373
3374restart:
3375 spin_lock_bh(lock);
3376 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
3377 struct inet_sock *inet = inet_sk(sk);
3378
3379 if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
3380 continue;
3381 if (sock_flag(sk, SOCK_DEAD))
3382 continue;
3383
3384 if (family == AF_INET) {
3385 __be32 s4 = inet->inet_rcv_saddr;
3386 if (s4 == LOOPBACK4_IPV6)
3387 continue;
3388
3389 if (in->s_addr != s4 &&
3390 !(in->s_addr == INADDR_ANY &&
3391 !tcp_is_local(net, s4)))
3392 continue;
3393 }
3394
3395#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3396 if (family == AF_INET6) {
3397 struct in6_addr *s6;
3398 if (!inet->pinet6)
3399 continue;
3400
3401 s6 = &inet->pinet6->rcv_saddr;
3402 if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED)
3403 continue;
3404
3405 if (!ipv6_addr_equal(in6, s6) &&
3406 !(ipv6_addr_equal(in6, &in6addr_any) &&
3407 !tcp_is_local6(net, s6)))
3408 continue;
3409 }
3410#endif
3411
3412 sock_hold(sk);
3413 spin_unlock_bh(lock);
3414
3415 local_bh_disable();
3416 bh_lock_sock(sk);
3417 sk->sk_err = ETIMEDOUT;
3418 sk->sk_error_report(sk);
3419
3420 tcp_done(sk);
3421 bh_unlock_sock(sk);
3422 local_bh_enable();
3423 sock_put(sk);
3424
3425 goto restart;
3426 }
3427 spin_unlock_bh(lock);
3428 }
3429
3430 return 0;
3666} 3431}
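A hypothetical caller sketch for the added tcp_nuke_addr() (on Android kernels this path is typically reached from the SIOCKILLADDR ioctl, though that handler is not part of this hunk). Passing the unspecified address asks it to reset every socket whose local address is no longer configured, per the comment above:

	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_addr = { .s_addr = htonl(INADDR_ANY) },
	};

	tcp_nuke_addr(&init_net, (struct sockaddr *)&sin);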