From 5db92c994982ed826cf38f38d58bd09bc326aef6 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <octavian.purdila@intel.com>
Date: Wed, 25 Jun 2014 17:09:59 +0300
Subject: tcp: unify tcp_v4_rtx_synack and tcp_v6_rtx_synack

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d92bce0ea24e..f8f2a944a1ce 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3299,3 +3299,18 @@ void tcp_send_probe0(struct sock *sk)
 					  TCP_RTO_MAX);
 	}
 }
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+	struct flowi fl;
+	int res;
+
+	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+	if (!res) {
+		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+	}
+	return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);
-- 
cgit v1.2.2


From b73c3d0e4f0e1961e15bec18720e48aabebe2109 Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Tue, 1 Jul 2014 21:32:17 -0700
Subject: net: Save TX flow hash in sock and set in skbuf on xmit

For a connected socket we can precompute the flow hash for setting
in skb->hash on output. This is a performance advantage over
calculating the skb->hash for every packet on the connection. The
computation is done using the common hash algorithm to be consistent
with computations done for packets of the connection in other states
where thers is no socket (e.g. time-wait, syn-recv, syn-cookies).

This patch adds sk_txhash to the sock structure. inet_set_txhash and
ip6_set_txhash functions are added which are called from points in
TCP and UDP where socket moves to established state.

skb_set_hash_from_sk is a function which sets skb->hash from the
sock txhash value. This is called in UDP and TCP transmit path when
transmitting within the context of a socket.

Tested: ran super_netperf with 200 TCP_RR streams over a vxlan
interface (in this case skb_get_hash called on every TX packet to
create a UDP source port).

Before fix:

  95.02% CPU utilization
  154/256/505 90/95/99% latencies
  1.13042e+06 tps

  Time in functions:
    0.28% skb_flow_dissect
    0.21% __skb_get_hash

After fix:

  94.95% CPU utilization
  156/254/485 90/95/99% latencies
  1.15447e+06

  Neither __skb_get_hash nor skb_flow_dissect appear in perf

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f8f2a944a1ce..bcee13c4627c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_orphan(skb);
 	skb->sk = sk;
 	skb->destructor = tcp_wfree;
+	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
-- 
cgit v1.2.2


From 5ee2c941b5969eb1b5592f9731b3ee76a784641f Mon Sep 17 00:00:00 2001
From: Christoph Paasch <christoph.paasch@uclouvain.be>
Date: Mon, 14 Jul 2014 16:58:32 +0200
Subject: tcp: Remove unnecessary arg from tcp_enter_cwr and
 tcp_init_cwnd_reduction

Since Yuchung's 9b44190dc11 (tcp: refactor F-RTO), tcp_enter_cwr is always
called with set_ssthresh = 1. Thus, we can remove this argument from
tcp_enter_cwr. Further, as we remove this one, tcp_init_cwnd_reduction
is then always called with set_ssthresh = true, and so we can get rid of
this argument as well.

Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bcee13c4627c..306dd5dead7d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -979,7 +979,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (likely(err <= 0))
 		return err;
 
-	tcp_enter_cwr(sk, 1);
+	tcp_enter_cwr(sk);
 
 	return net_xmit_eval(err);
 }
-- 
cgit v1.2.2


From 490cc7d03c21871eef5949ba77a18b0c7ef70ef5 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 12 Aug 2014 15:08:12 -0400
Subject: net-timestamp: fix missing tcp fragmentation cases

Bytestream timestamps are correlated with a single byte in the skbuff,
recorded in skb_shinfo(skb)->tskey. When fragmenting skbuffs, ensure
that the tskey is set for the fragment in which the tskey falls
(seqno <= tskey < end_seqno).

The original implementation did not address fragmentation in
tcp_fragment or tso_fragment. Add code to inspect the sequence numbers
and move both tskey and the relevant tx_flags if necessary.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8fcfc91964ec..ef4a051de018 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1069,6 +1069,21 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	tcp_verify_left_out(tp);
 }
 
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+		shinfo->tx_flags &= ~tsflags;
+		shinfo2->tx_flags |= tsflags;
+		swap(shinfo->tskey, shinfo2->tskey);
+	}
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
@@ -1136,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
 	buff->tstamp = skb->tstamp;
+	tcp_fragment_tstamp(skb, buff);
 
 	old_factor = tcp_skb_pcount(skb);
 
@@ -1652,6 +1668,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
+	tcp_fragment_tstamp(skb, buff);
 
 	/* Fix up tso_factor for both original and new SKB.  */
 	tcp_set_skb_tso_segs(sk, skb, mss_now);
-- 
cgit v1.2.2


From 9d186cac7ffb1831e9f34cb4a3a8b22abb9dd9d4 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Wed, 13 Aug 2014 16:03:10 +0400
Subject: tcp: don't use timestamp from repaired skb-s to calculate RTT (v2)

We don't know right timestamp for repaired skb-s. Wrong RTT estimations
isn't good, because some congestion modules heavily depends on it.

This patch adds the TCPCB_REPAIRED flag, which is included in
TCPCB_RETRANS.

Thanks to Eric for the advice how to fix this issue.

This patch fixes the warning:
[  879.562947] WARNING: CPU: 0 PID: 2825 at net/ipv4/tcp_input.c:3078 tcp_ack+0x11f5/0x1380()
[  879.567253] CPU: 0 PID: 2825 Comm: socket-tcpbuf-l Not tainted 3.16.0-next-20140811 #1
[  879.567829] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  879.568177]  0000000000000000 00000000c532680c ffff880039643d00 ffffffff817aa2d2
[  879.568776]  0000000000000000 ffff880039643d38 ffffffff8109afbd ffff880039d6ba80
[  879.569386]  ffff88003a449800 000000002983d6bd 0000000000000000 000000002983d6bc
[  879.569982] Call Trace:
[  879.570264]  [<ffffffff817aa2d2>] dump_stack+0x4d/0x66
[  879.570599]  [<ffffffff8109afbd>] warn_slowpath_common+0x7d/0xa0
[  879.570935]  [<ffffffff8109b0ea>] warn_slowpath_null+0x1a/0x20
[  879.571292]  [<ffffffff816d0a05>] tcp_ack+0x11f5/0x1380
[  879.571614]  [<ffffffff816d10bd>] tcp_rcv_established+0x1ed/0x710
[  879.571958]  [<ffffffff816dc9da>] tcp_v4_do_rcv+0x10a/0x370
[  879.572315]  [<ffffffff81657459>] release_sock+0x89/0x1d0
[  879.572642]  [<ffffffff816c81a0>] do_tcp_setsockopt.isra.36+0x120/0x860
[  879.573000]  [<ffffffff8110a52e>] ? rcu_read_lock_held+0x6e/0x80
[  879.573352]  [<ffffffff816c8912>] tcp_setsockopt+0x32/0x40
[  879.573678]  [<ffffffff81654ac4>] sock_common_setsockopt+0x14/0x20
[  879.574031]  [<ffffffff816537b0>] SyS_setsockopt+0x80/0xf0
[  879.574393]  [<ffffffff817b40a9>] system_call_fastpath+0x16/0x1b
[  879.574730] ---[ end trace a17cbc38eb8c5c00 ]---

v2: moving setting of skb->when for repaired skb-s in tcp_write_xmit,
    where it's set for other skb-s.

Fixes: 431a91242d8d ("tcp: timestamp SYN+DATA messages")
Fixes: 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution")
Cc: Eric Dumazet <edumazet@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ef4a051de018..ff3f0f75cc6c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1934,8 +1934,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
-		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+			/* "when" is used as a start point for the retransmit timer */
+			TCP_SKB_CB(skb)->when = tcp_time_stamp;
 			goto repair; /* Skip network transmission */
+		}
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
-- 
cgit v1.2.2


From 4fab9071950c2021d846e18351e0f46a1cffd67b Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Thu, 14 Aug 2014 12:40:05 -0400
Subject: tcp: fix tcp_release_cb() to dispatch via address family for
 mtu_reduced()

Make sure we use the correct address-family-specific function for
handling MTU reductions from within tcp_release_cb().

Previously AF_INET6 sockets were incorrectly always using the IPv6
code path when sometimes they were handling IPv4 traffic and thus had
an IPv4 dst.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Diagnosed-by: Willem de Bruijn <willemb@google.com>
Fixes: 563d34d057862 ("tcp: dont drop MTU reduction indications")
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4/tcp_output.c')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ff3f0f75cc6c..5a7c41fbc6d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -800,7 +800,7 @@ void tcp_release_cb(struct sock *sk)
 		__sock_put(sk);
 	}
 	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
-		sk->sk_prot->mtu_reduced(sk);
+		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
 }
-- 
cgit v1.2.2