From 14869c388673e8db3348ab3706fa6485d0f0cf95 Mon Sep 17 00:00:00 2001
From: Dmitry Yusupov <dima@neterion.com>
Date: Tue, 23 Aug 2005 10:09:27 -0700
Subject: [TCP]: Do TSO deferral even if tail SKB can go out now.

If the tail SKB fits into the window, it is still
benefitical to defer until the goal percentage of
the window is available.  This give the application
time to feed more data into the send queue and thus
results in larger TSO frames going out.

Patch from Dmitry Yusupov <dima@neterion.com>.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 566045e58437..dd30dd137b74 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -925,10 +925,6 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
 
 	limit = min(send_win, cong_win);
 
-	/* If sk_send_head can be sent fully now, just do it.  */
-	if (skb->len <= limit)
-		return 0;
-
 	if (sysctl_tcp_tso_win_divisor) {
 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
-- 
cgit v1.2.2


From 1344a41637114485fac7afa1505bce2ff862807a Mon Sep 17 00:00:00 2001
From: Dave Johnson <djohnson+linux-kernel@sw.starentnetworks.com>
Date: Tue, 23 Aug 2005 10:10:15 -0700
Subject: [IPV4]: Fix negative timer loop with lots of ipv4 peers.

From: Dave Johnson <djohnson+linux-kernel@sw.starentnetworks.com>

Found this bug while doing some scaling testing that created 500K inet
peers.

peer_check_expire() in net/ipv4/inetpeer.c isn't using inet_peer_gc_mintime
correctly and will end up creating an expire timer with less than the
minimum duration, and even zero/negative if enough active peers are
present.

If >65K peers, the timer will be less than inet_peer_gc_mintime, and with
>70K peers, the timer duration will reach zero and go negative.

The timer handler will continue to schedule another zero/negative timer in
a loop until peers can be aged.  This can continue for at least a few
minutes or even longer if the peers remain active due to arriving packets
while the loop is occurring.

Bug is present in both 2.4 and 2.6.  Same patch will apply to both just
fine.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inetpeer.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 95473953c406..ab18a853d7ce 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -450,10 +450,13 @@ static void peer_check_expire(unsigned long dummy)
 	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
 	 * interval depending on the total number of entries (more entries,
 	 * less interval). */
-	peer_periodic_timer.expires = jiffies
-		+ inet_peer_gc_maxtime
-		- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
-			peer_total / inet_peer_threshold * HZ;
+	if (peer_total >= inet_peer_threshold)
+		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
+	else
+		peer_periodic_timer.expires = jiffies
+			+ inet_peer_gc_maxtime
+			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+				peer_total / inet_peer_threshold * HZ;
 	add_timer(&peer_periodic_timer);
 }
 
-- 
cgit v1.2.2


From 66a79a19a7c582efd99bb143c3a59fbda006eb39 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 23 Aug 2005 10:10:35 -0700
Subject: [NETFILTER]: Fix HW checksum handling in ip_queue/ip6_queue

The checksum needs to be filled in on output, after mangling a packet
ip_summed needs to be reset.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_queue.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index eda1fba431a4..c6baa8174389 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -214,6 +214,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
 		break;
 	
 	case IPQ_COPY_PACKET:
+		if (entry->skb->ip_summed == CHECKSUM_HW &&
+		    (*errp = skb_checksum_help(entry->skb,
+		                               entry->info->outdev == NULL))) {
+			read_unlock_bh(&queue_lock);
+			return NULL;
+		}
 		if (copy_range == 0 || copy_range > entry->skb->len)
 			data_len = entry->skb->len;
 		else
@@ -385,6 +391,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
 	if (!skb_ip_make_writable(&e->skb, v->data_len))
 		return -ENOMEM;
 	memcpy(e->skb->data, v->payload, v->data_len);
+	e->skb->ip_summed = CHECKSUM_NONE;
 	e->skb->nfcache |= NFC_ALTERED;
 
 	/*
-- 
cgit v1.2.2


From 89ebd197eb2cd31d6187db344d5117064e19fdde Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 23 Aug 2005 10:13:06 -0700
Subject: [TCP]: Unconditionally clear TCP_NAGLE_PUSH in skb_entail().

Intention of this bit is to force pushing of the existing
send queue when TCP_CORK or TCP_NODELAY state changes via
setsockopt().

But it's easy to create a situation where the bit never
clears.  For example, if the send queue starts empty:

1) set TCP_NODELAY
2) clear TCP_NODELAY
3) set TCP_CORK
4) do small write()

The current code will leave TCP_NAGLE_PUSH set after that
sequence.  Unconditionally clearing the bit when new data
is added via skb_entail() solves the problem.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ddb6ce4ecff2..69b1fcf70077 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -584,7 +584,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
 	sk_charge_skb(sk, skb);
 	if (!sk->sk_send_head)
 		sk->sk_send_head = skb;
-	else if (tp->nonagle&TCP_NAGLE_PUSH)
+	if (tp->nonagle & TCP_NAGLE_PUSH)
 		tp->nonagle &= ~TCP_NAGLE_PUSH; 
 }
 
-- 
cgit v1.2.2


From d5d283751ef3c05b6766501a46800cbee84959d6 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 23 Aug 2005 10:49:54 -0700
Subject: [TCP]: Document non-trivial locking path in tcp_v{4,6}_get_port().

This trips up a lot of folks reading this code.
Put an unlikely() around the port-exhaustion test
for good measure.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5d91213d34c0..67c670886c1f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -242,9 +242,14 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 		tcp_port_rover = rover;
 		spin_unlock(&tcp_portalloc_lock);
 
-		/* Exhausted local port range during search? */
+		/* Exhausted local port range during search?  It is not
+		 * possible for us to be holding one of the bind hash
+		 * locks if this test triggers, because if 'remaining'
+		 * drops to zero, we broke out of the do/while loop at
+		 * the top level, not from the 'break;' statement.
+		 */
 		ret = 1;
-		if (remaining <= 0)
+		if (unlikely(remaining <= 0))
 			goto fail;
 
 		/* OK, here is the one we will use.  HEAD is
-- 
cgit v1.2.2