From 86911732d3996a9da07914b280621450111bb6da Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:50 +0000 Subject: gro: Avoid copying headers of unmerged packets Unfortunately simplicity isn't always the best. The fraginfo interface turned out to be suboptimal. The problem was quite obvious. For every packet, we have to copy the headers from the frags structure into skb->head, even though for 99% of the packets this part is immediately thrown away after the merge. LRO didn't have this problem because it directly read the headers from the frags structure. This patch attempts to address this by creating an interface that allows GRO to access the headers in the first frag without having to copy it. Because all drivers that use frags place the headers in the first frag this optimisation should be enough. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cd71b84e483..1cd608253940 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,19 +2481,19 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int mss = 1; int flush = 1; - if (!pskb_may_pull(skb, sizeof(*th))) + th = skb_gro_header(skb, sizeof(*th)); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; - if (!pskb_may_pull(skb, thlen)) + th = skb_gro_header(skb, thlen); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); - __skb_pull(skb, thlen); + skb_gro_pull(skb, thlen); flags = tcp_flag_word(th); @@ -2521,10 +2521,10 @@ found: flush |= th->ack_seq != th2->ack_seq || th->window != th2->window; flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th)); - total = p->len; + total = skb_gro_len(p); mss = skb_shinfo(p)->gso_size; - flush |= skb->len > mss || skb->len <= 0; + flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb); flush |= ntohl(th2->seq) + total != ntohl(th->seq); if (flush || skb_gro_receive(head, skb)) { @@ -2537,7 +2537,7 @@ found: tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = skb->len < mss; + flush = skb_gro_len(skb) < mss; flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN); -- cgit v1.2.2 From aa6320d336971171df1d13c1c284facf10804881 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 8 Feb 2009 18:00:40 +0000 Subject: gro: Optimise TCP packet reception gro: Optimise TCP packet reception As this function can be called more than half a million times for 10GbE, it's important to optimise it as much as we can. This patch uses bit ops to logical ops, as well as open coding memcmp to exploit alignment properties. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 73266b79c19a..90b2f3c192ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2478,9 +2478,9 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct tcphdr *th2; unsigned int thlen; unsigned int flags; - unsigned int total; unsigned int mss = 1; int flush = 1; + int i; th = skb_gro_header(skb, sizeof(*th)); if (unlikely(!th)) @@ -2504,7 +2504,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) th2 = tcp_hdr(p); - if (th->source != th2->source || th->dest != th2->dest) { + if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -2519,14 +2519,15 @@ found: flush |= flags & TCP_FLAG_CWR; flush |= (flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); - flush |= th->ack_seq != th2->ack_seq || th->window != th2->window; - flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th)); + flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window); + for (i = sizeof(*th); !flush && i < thlen; i += 4) + flush |= *(u32 *)((u8 *)th + i) ^ + *(u32 *)((u8 *)th2 + i); - total = skb_gro_len(p); mss = skb_shinfo(p)->gso_size; - flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb); - flush |= ntohl(th2->seq) + total != ntohl(th->seq); + flush |= (skb_gro_len(skb) > mss) | !skb_gro_len(skb); + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); if (flush || skb_gro_receive(head, skb)) { mss = 1; -- cgit v1.2.2 From 0d6a775e27d975e5f9ea8e2911216d84face50ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sat, 28 Feb 2009 04:44:41 +0000 Subject: tcp: in sendmsg/pages open code the real goto target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit copied was assigned zero right before the goto, so if (copied) cannot ever be true. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90b2f3c192ff..d3f9beee74c0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -683,7 +683,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (psize > 0) { struct sk_buff *skb = tcp_write_queue_tail(sk); @@ -854,7 +854,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (--iovlen >= 0) { int seglen = iov->iov_len; -- cgit v1.2.2 From 0c54b85f2828128274f319a1eb3ce7f604fe2a53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sat, 14 Mar 2009 14:23:05 +0000 Subject: tcp: simplify tcp_current_mss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's very little need for most of the callsites to get tp->xmit_goal_size updated. That will cost us divide as is, so slice the function in two. Also, the only users of the tp->xmit_goal_size are directly behind tcp_current_mss(), so there's no need to store that variable into tcp_sock at all! The drop of xmit_goal_size currently leaves 16-bit hole and some reorganization would again be necessary to change that (but I'm aiming to fill that hole with u16 xmit_goal_size_segs to cache the results of the remaining divide to get that tso on regression). Bring xmit_goal_size parts into tcp.c Signed-off-by: Ilpo Järvinen Cc: Evgeniy Polyakov Cc: Ingo Molnar Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d3f9beee74c0..886596ff0aae 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -661,6 +661,37 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) return NULL; } +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 xmit_size_goal; + + xmit_size_goal = mss_now; + + if (large_allowed && sk_can_gso(sk)) { + xmit_size_goal = ((sk->sk_gso_max_size - 1) - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); + + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + xmit_size_goal -= (xmit_size_goal % mss_now); + } + + return xmit_size_goal; +} + +static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + return mss_now; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -677,8 +708,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; err = -EPIPE; @@ -761,8 +791,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: @@ -844,8 +873,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -1007,8 +1035,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } } -- cgit v1.2.2 From 2a3a041c4e2c1685e668b280c121a5a40a029a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sat, 14 Mar 2009 22:45:16 +0000 Subject: tcp: cache result of earlier divides when mss-aligning things MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The results is very unlikely change every so often so we hardly need to divide again after doing that once for a connection. Yet, if divide still becomes necessary we detect that and do the right thing and again settle for non-divide state. Takes the u16 space which was previously taken by the plain xmit_size_goal. This should take care part of the tso vs non-tso difference we found earlier. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 886596ff0aae..0db9f3b984f7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -665,7 +665,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal; + u32 xmit_size_goal, old_size_goal; xmit_size_goal = mss_now; @@ -676,7 +676,17 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, tp->tcp_header_len); xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - xmit_size_goal -= (xmit_size_goal % mss_now); + + /* We try hard to avoid divides here */ + old_size_goal = tp->xmit_size_goal_segs * mss_now; + + if (likely(old_size_goal <= xmit_size_goal && + old_size_goal + mss_now > xmit_size_goal)) { + xmit_size_goal = old_size_goal; + } else { + tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + xmit_size_goal = tp->xmit_size_goal_segs * mss_now; + } } return xmit_size_goal; -- cgit v1.2.2 From afece1c6587010cc81d1a43045c855774e8234a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sat, 14 Mar 2009 14:23:07 +0000 Subject: tcp: make sure xmit goal size never becomes zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not too likely to happen, would basically require crafted packets (must hit the max guard in tcp_bound_to_half_wnd()). It seems that nothing that bad would happen as there's tcp_mems and congestion window that prevent runaway at some point from hurting all too much (I'm not that sure what all those zero sized segments we would generate do though in write queue). Preventing it regardless is certainly the best way to go. Signed-off-by: Ilpo Järvinen Cc: Evgeniy Polyakov Cc: Ingo Molnar Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0db9f3b984f7..1c4d42ff72bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -689,7 +689,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, } } - return xmit_size_goal; + return max(xmit_size_goal, mss_now); } static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) -- cgit v1.2.2 From beedad923ad6237f03265fdf86eb8a1b50d14ae9 Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Wed, 18 Mar 2009 18:50:09 -0700 Subject: tcp: remove parameter from tcp_recv_urg(). This patch removes an unused parameter (addr_len) from tcp_recv_urg() method in net/ipv4/tcp.c. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1c4d42ff72bd..2451aeb5ac23 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1082,8 +1082,7 @@ out_err: */ static int tcp_recv_urg(struct sock *sk, long timeo, - struct msghdr *msg, int len, int flags, - int *addr_len) + struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); @@ -1698,7 +1697,7 @@ out: return err; recv_urg: - err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + err = tcp_recv_urg(sk, timeo, msg, len, flags); goto out; } -- cgit v1.2.2 From 377f0a08e4cb56658d878d22c3aed4716e283c6b Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Tue, 31 Mar 2009 14:43:17 -0700 Subject: ipv4: remove unused parameter from tcp_recv_urg(). Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2451aeb5ac23..fafbec8b073e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1081,8 +1081,7 @@ out_err: * this, no blocking and very strange errors 8) */ -static int tcp_recv_urg(struct sock *sk, long timeo, - struct msghdr *msg, int len, int flags) +static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); @@ -1697,7 +1696,7 @@ out: return err; recv_urg: - err = tcp_recv_urg(sk, timeo, msg, len, flags); + err = tcp_recv_urg(sk, msg, len, flags); goto out; } -- cgit v1.2.2