From a21090cff296eb82b6de09304d64de466bdcaefc Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 7 Oct 2009 03:18:17 -0700
Subject: ipv4: arp_notify address list bug

This fixes a bug with arp_notify.

If arp_notify is enabled, kernel will crash if address is changed
and no IP address is assigned.
  http://bugzilla.kernel.org/show_bug.cgi?id=14330

Reported-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e92f1fd28aa5..5df2f6a0b0f0 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1077,12 +1077,16 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		ip_mc_up(in_dev);
 		/* fall through */
 	case NETDEV_CHANGEADDR:
-		if (IN_DEV_ARP_NOTIFY(in_dev))
-			arp_send(ARPOP_REQUEST, ETH_P_ARP,
-				 in_dev->ifa_list->ifa_address,
-				 dev,
-				 in_dev->ifa_list->ifa_address,
-				 NULL, dev->dev_addr, NULL);
+		/* Send gratuitous ARP to notify of link change */
+		if (IN_DEV_ARP_NOTIFY(in_dev)) {
+			struct in_ifaddr *ifa = in_dev->ifa_list;
+
+			if (ifa)
+				arp_send(ARPOP_REQUEST, ETH_P_ARP,
+					 ifa->ifa_address, dev,
+					 ifa->ifa_address, NULL,
+					 dev->dev_addr, NULL);
+		}
 		break;
 	case NETDEV_DOWN:
 		ip_mc_down(in_dev);
-- 
cgit v1.2.2


From 6d01a026b7d3009a418326bdcf313503a314f1ea Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 13 Oct 2009 00:27:40 -0700
Subject: tcp: fix tcp_defer_accept to consider the timeout

I was trying to use TCP_DEFER_ACCEPT and noticed that if the
client does not talk, the connection is never accepted and
remains in SYN_RECV state until the retransmits expire, where
it finally is deleted. This is bad when some firewall such as
netfilter sits between the client and the server because the
firewall sees the connection in ESTABLISHED state while the
server will finally silently drop it without sending an RST.

This behaviour contradicts the man page which says it should
wait only for some time :

       TCP_DEFER_ACCEPT (since Linux 2.4)
          Allows a listener to be awakened only when data arrives
          on the socket.  Takes an integer value  (seconds), this
          can  bound  the  maximum  number  of attempts TCP will
          make to complete the connection. This option should not
          be used in code intended to be portable.

Also, looking at ipv4/tcp.c, a retransmit counter is correctly
computed :

        case TCP_DEFER_ACCEPT:
                icsk->icsk_accept_queue.rskq_defer_accept = 0;
                if (val > 0) {
                        /* Translate value in seconds to number of
                         * retransmits */
                        while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
                               val > ((TCP_TIMEOUT_INIT / HZ) <<
                                       icsk->icsk_accept_queue.rskq_defer_accept))
                                icsk->icsk_accept_queue.rskq_defer_accept++;
                        icsk->icsk_accept_queue.rskq_defer_accept++;
                }
                break;

==> rskq_defer_accept is used as a counter of retransmits.

But in tcp_minisocks.c, this counter is only checked. And in
fact, I have found no location which updates it. So I think
that what was intended was to decrease it in tcp_minisocks
whenever it is checked, which the trivial patch below does.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 624c3c9b3c2b..e320afea07fc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -644,6 +644,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
 	if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+		inet_csk(sk)->icsk_accept_queue.rskq_defer_accept--;
 		inet_rsk(req)->acked = 1;
 		return NULL;
 	}
-- 
cgit v1.2.2


From 85584672012ee0c3b7b8e033a1ecf7c11878e45f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 9 Oct 2009 04:43:40 +0000
Subject: udp: Fix udp_poll() and ioctl()

udp_poll() can in some circumstances drop frames with incorrect checksums.

Problem is we now have to lock the socket while dropping frames, or risk
sk_forward corruption.

This bug is present since commit 95766fff6b9a78d1
([UDP]: Add memory accounting.)

While we are at it, we can correct ioctl(SIOCINQ) to also drop bad frames.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 73 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6ec6a8a4a224..d0d436d6216c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -841,6 +841,42 @@ out:
 	return ret;
 }
 
+
+/**
+ *	first_packet_length	- return length of first packet in receive queue
+ *	@sk: socket
+ *
+ *	Drops all bad checksum frames, until a valid one is found.
+ *	Returns the length of found skb, or 0 if none is found.
+ */
+static unsigned int first_packet_length(struct sock *sk)
+{
+	struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+	struct sk_buff *skb;
+	unsigned int res;
+
+	__skb_queue_head_init(&list_kill);
+
+	spin_lock_bh(&rcvq->lock);
+	while ((skb = skb_peek(rcvq)) != NULL &&
+		udp_lib_checksum_complete(skb)) {
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+				 IS_UDPLITE(sk));
+		__skb_unlink(skb, rcvq);
+		__skb_queue_tail(&list_kill, skb);
+	}
+	res = skb ? skb->len : 0;
+	spin_unlock_bh(&rcvq->lock);
+
+	if (!skb_queue_empty(&list_kill)) {
+		lock_sock(sk);
+		__skb_queue_purge(&list_kill);
+		sk_mem_reclaim_partial(sk);
+		release_sock(sk);
+	}
+	return res;
+}
+
 /*
  *	IOCTL requests applicable to the UDP protocol
  */
@@ -857,21 +893,16 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 	case SIOCINQ:
 	{
-		struct sk_buff *skb;
-		unsigned long amount;
+		unsigned int amount = first_packet_length(sk);
 
-		amount = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		skb = skb_peek(&sk->sk_receive_queue);
-		if (skb != NULL) {
+		if (amount)
 			/*
 			 * We will only return the amount
 			 * of this packet since that is all
 			 * that will be read.
 			 */
-			amount = skb->len - sizeof(struct udphdr);
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
+			amount -= sizeof(struct udphdr);
+
 		return put_user(amount, (int __user *)arg);
 	}
 
@@ -1540,29 +1571,11 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
 	unsigned int mask = datagram_poll(file, sock, wait);
 	struct sock *sk = sock->sk;
-	int 	is_lite = IS_UDPLITE(sk);
 
 	/* Check for false positives due to checksum errors */
-	if ((mask & POLLRDNORM) &&
-	    !(file->f_flags & O_NONBLOCK) &&
-	    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
-		struct sk_buff_head *rcvq = &sk->sk_receive_queue;
-		struct sk_buff *skb;
-
-		spin_lock_bh(&rcvq->lock);
-		while ((skb = skb_peek(rcvq)) != NULL &&
-		       udp_lib_checksum_complete(skb)) {
-			UDP_INC_STATS_BH(sock_net(sk),
-					UDP_MIB_INERRORS, is_lite);
-			__skb_unlink(skb, rcvq);
-			kfree_skb(skb);
-		}
-		spin_unlock_bh(&rcvq->lock);
-
-		/* nothing to see, move along */
-		if (skb == NULL)
-			mask &= ~(POLLIN | POLLRDNORM);
-	}
+	if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+	    !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
+		mask &= ~(POLLIN | POLLRDNORM);
 
 	return mask;
 
-- 
cgit v1.2.2


From a1a2ad9151c26d92e5c733a33d52108f5d3a5b57 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 19 Oct 2009 19:12:36 -0700
Subject: Revert "tcp: fix tcp_defer_accept to consider the timeout"

This reverts commit 6d01a026b7d3009a418326bdcf313503a314f1ea.

Julian Anastasov, Willy Tarreau and Eric Dumazet have come up
with a more correct way to deal with this.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e320afea07fc..624c3c9b3c2b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -644,7 +644,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
 	if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
-		inet_csk(sk)->icsk_accept_queue.rskq_defer_accept--;
 		inet_rsk(req)->acked = 1;
 		return NULL;
 	}
-- 
cgit v1.2.2


From d1b99ba41d6c5aa1ed2fc634323449dd656899e9 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 19 Oct 2009 10:01:56 +0000
Subject: tcp: accept socket after TCP_DEFER_ACCEPT period

Willy Tarreau and many other folks in recent years
were concerned what happens when the TCP_DEFER_ACCEPT period
expires for clients which sent ACK packet. They prefer clients
that actively resend ACK on our SYN-ACK retransmissions to be
converted from open requests to sockets and queued to the
listener for accepting after the deferring period is finished.
Then application server can decide to wait longer for data
or to properly terminate the connection with FIN if read()
returns EAGAIN which is an indication for accepting after
the deferring period. This change still can have side effects
for applications that expect always to see data on the accepted
socket. Others can be prepared to work in both modes (with or
without TCP_DEFER_ACCEPT period) and their data processing can
ignore the read=EAGAIN notification and to allocate resources for
clients which proved to have no data to send during the deferring
period. OTOH, servers that use TCP_DEFER_ACCEPT=1 as flag (not
as a timeout) to wait for data will notice clients that didn't
send data for 3 seconds but that still resend ACKs.
Thanks to Willy Tarreau for the initial idea and to
Eric Dumazet for the review and testing the change.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 624c3c9b3c2b..4c03598ed924 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -641,8 +641,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
-	/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
-	if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
+	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
 		inet_rsk(req)->acked = 1;
 		return NULL;
-- 
cgit v1.2.2


From 0c3d79bce48034018e840468ac5a642894a521a3 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 19 Oct 2009 10:03:58 +0000
Subject: tcp: reduce SYN-ACK retrans for TCP_DEFER_ACCEPT

Change SYN-ACK retransmitting code for the TCP_DEFER_ACCEPT
users to not retransmit SYN-ACKs during the deferring period if
ACK from client was received. The goal is to reduce traffic
during the deferring period. When the period is finished
we continue with sending SYN-ACKs (at least one) but this time
any traffic from client will change the request to established
socket allowing application to terminate it properly.
Also, do not drop acked request if sending of SYN-ACK fails.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4351ca2cf0b8..537731b3bcb3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -446,6 +446,28 @@ extern int sysctl_tcp_synack_retries;
 
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
 
+/* Decide when to expire the request and when to resend SYN-ACK */
+static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
+				  const int max_retries,
+				  const u8 rskq_defer_accept,
+				  int *expire, int *resend)
+{
+	if (!rskq_defer_accept) {
+		*expire = req->retrans >= thresh;
+		*resend = 1;
+		return;
+	}
+	*expire = req->retrans >= thresh &&
+		  (!inet_rsk(req)->acked || req->retrans >= max_retries);
+	/*
+	 * Do not resend while waiting for data after ACK,
+	 * start to resend on end of deferring period to give
+	 * last chance for data or ACK to create established socket.
+	 */
+	*resend = !inet_rsk(req)->acked ||
+		  req->retrans >= rskq_defer_accept - 1;
+}
+
 void inet_csk_reqsk_queue_prune(struct sock *parent,
 				const unsigned long interval,
 				const unsigned long timeout,
@@ -501,9 +523,15 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 		reqp=&lopt->syn_table[i];
 		while ((req = *reqp) != NULL) {
 			if (time_after_eq(now, req->expires)) {
-				if ((req->retrans < thresh ||
-				     (inet_rsk(req)->acked && req->retrans < max_retries))
-				    && !req->rsk_ops->rtx_syn_ack(parent, req)) {
+				int expire = 0, resend = 0;
+
+				syn_ack_recalc(req, thresh, max_retries,
+					       queue->rskq_defer_accept,
+					       &expire, &resend);
+				if (!expire &&
+				    (!resend ||
+				     !req->rsk_ops->rtx_syn_ack(parent, req) ||
+				     inet_rsk(req)->acked)) {
 					unsigned long timeo;
 
 					if (req->retrans++ == 0)
-- 
cgit v1.2.2


From b103cf34382f26ff48a87931b83f13b177b47c1a Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 19 Oct 2009 10:10:40 +0000
Subject: tcp: fix TCP_DEFER_ACCEPT retrans calculation

Fix TCP_DEFER_ACCEPT conversion between seconds and
retransmission to match the TCP SYN-ACK retransmission periods
because the time is converted to such retransmissions. The old
algorithm selects one more retransmission in some cases. Allow
up to 255 retransmissions.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 64d0af675823..9b2756fbdf9b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -326,6 +326,43 @@ void tcp_enter_memory_pressure(struct sock *sk)
 
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 
+/* Convert seconds to retransmits based on initial and max timeout */
+static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
+{
+	u8 res = 0;
+
+	if (seconds > 0) {
+		int period = timeout;
+
+		res = 1;
+		while (seconds > period && res < 255) {
+			res++;
+			timeout <<= 1;
+			if (timeout > rto_max)
+				timeout = rto_max;
+			period += timeout;
+		}
+	}
+	return res;
+}
+
+/* Convert retransmits to seconds based on initial and max timeout */
+static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+{
+	int period = 0;
+
+	if (retrans > 0) {
+		period = timeout;
+		while (--retrans) {
+			timeout <<= 1;
+			if (timeout > rto_max)
+				timeout = rto_max;
+			period += timeout;
+		}
+	}
+	return period;
+}
+
 /*
  *	Wait for a TCP event.
  *
@@ -2163,16 +2200,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_DEFER_ACCEPT:
-		icsk->icsk_accept_queue.rskq_defer_accept = 0;
-		if (val > 0) {
-			/* Translate value in seconds to number of
-			 * retransmits */
-			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
-			       val > ((TCP_TIMEOUT_INIT / HZ) <<
-				       icsk->icsk_accept_queue.rskq_defer_accept))
-				icsk->icsk_accept_queue.rskq_defer_accept++;
-			icsk->icsk_accept_queue.rskq_defer_accept++;
-		}
+		/* Translate value in seconds to number of retransmits */
+		icsk->icsk_accept_queue.rskq_defer_accept =
+			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+					TCP_RTO_MAX / HZ);
 		break;
 
 	case TCP_WINDOW_CLAMP:
@@ -2353,8 +2384,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
-		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
-			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
+		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
+				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
 		break;
 	case TCP_WINDOW_CLAMP:
 		val = tp->window_clamp;
-- 
cgit v1.2.2


From 55b8050353c4a212c94d7156e2bd5885225b869b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 19 Oct 2009 06:41:58 +0000
Subject: net: Fix IP_MULTICAST_IF

ipv4/ipv6 setsockopt(IP_MULTICAST_IF) have dubious __dev_get_by_index() calls.

This function should be called only with RTNL or dev_base_lock held, or reader
could see a corrupt hash chain and eventually enter an endless loop.

Fix is to call dev_get_by_index()/dev_put().

If this happens to be performance critical, we could define a new dev_exist_by_index()
function to avoid touching dev refcount.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0c0b6e363a20..e982b5c1ee17 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -634,17 +634,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 				break;
 			}
 			dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
-			if (dev) {
+			if (dev)
 				mreq.imr_ifindex = dev->ifindex;
-				dev_put(dev);
-			}
 		} else
-			dev = __dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
+			dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
 
 
 		err = -EADDRNOTAVAIL;
 		if (!dev)
 			break;
+		dev_put(dev);
 
 		err = -EINVAL;
 		if (sk->sk_bound_dev_if &&
-- 
cgit v1.2.2


From b6b39e8f3fbbb31001b836afec87bcaf4811a7bf Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 19 Oct 2009 19:41:06 +0000
Subject: tcp: Try to catch MSG_PEEK bug

This patch tries to print out more information when we hit the
MSG_PEEK bug in tcp_recvmsg.  It's been around since at least
2005 and it's about time that we finally fix it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9b2756fbdf9b..90b2e0649bfb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1442,7 +1442,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				goto found_ok_skb;
 			if (tcp_hdr(skb)->fin)
 				goto found_fin_ok;
-			WARN_ON(!(flags & MSG_PEEK));
+			if (WARN_ON(!(flags & MSG_PEEK)))
+				printk(KERN_INFO "recvmsg bug 2: copied %X "
+				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
 		}
 
 		/* Well, if we have backlog, try to process it now yet. */
-- 
cgit v1.2.2


From c62f4c453ab4b0240ab857bfd089da2c01ad91e7 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 22 Oct 2009 21:37:56 -0700
Subject: net: use WARN() for the WARN_ON in commit b6b39e8f3fbbb

Commit b6b39e8f3fbbb (tcp: Try to catch MSG_PEEK bug) added a printk()
to the WARN_ON() that's in tcp.c. This patch changes this combination
to WARN(); the advantage of WARN() is that the printk message shows up
inside the message, so that kerneloops.org will collect the message.

In addition, this gets rid of an extra if() statement.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 90b2e0649bfb..98440ad82558 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1442,9 +1442,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				goto found_ok_skb;
 			if (tcp_hdr(skb)->fin)
 				goto found_fin_ok;
-			if (WARN_ON(!(flags & MSG_PEEK)))
-				printk(KERN_INFO "recvmsg bug 2: copied %X "
-				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+			WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
+					"copied %X seq %X\n", *seq,
+					TCP_SKB_CB(skb)->seq);
 		}
 
 		/* Well, if we have backlog, try to process it now yet. */
-- 
cgit v1.2.2


From 55888dfb6ba7e318bb3d6a44d25009906206bf6a Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Wed, 28 Oct 2009 08:59:47 +0000
Subject: AF_RAW: Augment raw_send_hdrinc to expand skb to fit iphdr->ihl (v2)

Augment raw_send_hdrinc to correct for incorrect ip header length values

A series of oopses was reported to me recently.  Apparently when using AF_RAW
sockets to send data to peers that were reachable via ipsec encapsulation,
people could panic or BUG halt their systems.

I've tracked the problem down to user space sending an invalid ip header over an
AF_RAW socket with IP_HDRINCL set to 1.

Basically what happens is that userspace sends down an ip frame that includes
only the header (no data), but sets the ip header ihl value to a large number,
one that is larger than the total amount of data passed to the sendmsg call.  In
raw_send_hdrincl, we allocate an skb based on the size of the data in the msghdr
that was passed in, but assume the data is all valid.  Later during ipsec
encapsulation, xfrm4_tranport_output moves the entire frame back in the skbuff
to provide headroom for the ipsec headers.  During this operation, the
skb->transport_header is repointed to a spot computed by
skb->network_header + the ip header length (ihl).  Since so little data was
passed in relative to the value of ihl provided by the raw socket, we point
transport header to an unknown location, resulting in various crashes.

This fix for this is pretty straightforward, simply validate the value of of
iph->ihl when sending over a raw socket.  If (iph->ihl*4U) > user data buffer
size, drop the frame and return -EINVAL.  I just confirmed this fixes the
reported crashes.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 757c9171e7c2..ab996f9c0fe0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -352,13 +352,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	skb->ip_summed = CHECKSUM_NONE;
 
 	skb->transport_header = skb->network_header;
-	err = memcpy_fromiovecend((void *)iph, from, 0, length);
-	if (err)
-		goto error_fault;
+	err = -EFAULT;
+	if (memcpy_fromiovecend((void *)iph, from, 0, length))
+		goto error_free;
 
-	/* We don't modify invalid header */
 	iphlen = iph->ihl * 4;
-	if (iphlen >= sizeof(*iph) && iphlen <= length) {
+
+	/*
+	 * We don't want to modify the ip header, but we do need to
+	 * be sure that it won't cause problems later along the network
+	 * stack.  Specifically we want to make sure that iph->ihl is a
+	 * sane value.  If ihl points beyond the length of the buffer passed
+	 * in, reject the frame as invalid
+	 */
+	err = -EINVAL;
+	if (iphlen > length)
+		goto error_free;
+
+	if (iphlen >= sizeof(*iph)) {
 		if (!iph->saddr)
 			iph->saddr = rt->rt_src;
 		iph->check   = 0;
@@ -381,8 +392,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 out:
 	return 0;
 
-error_fault:
-	err = -EFAULT;
+error_free:
 	kfree_skb(skb);
 error:
 	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
-- 
cgit v1.2.2


From b0c110ca8e89f2c9cd52ec7fb1b98c5b7aa78496 Mon Sep 17 00:00:00 2001
From: jamal <hadi@cyberus.ca>
Date: Sun, 18 Oct 2009 02:12:33 +0000
Subject: net: Fix RPF to work with policy routing

Policy routing is not looked up by mark on reverse path filtering.
This fixes it.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 5 ++++-
 net/ipv4/route.c        | 8 ++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e2f950592566..aa00398be80e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -229,14 +229,17 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
  */
 
 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
-			struct net_device *dev, __be32 *spec_dst, u32 *itag)
+			struct net_device *dev, __be32 *spec_dst,
+			u32 *itag, u32 mark)
 {
 	struct in_device *in_dev;
 	struct flowi fl = { .nl_u = { .ip4_u =
 				      { .daddr = src,
 					.saddr = dst,
 					.tos = tos } },
+			    .mark = mark,
 			    .iif = oif };
+
 	struct fib_result res;
 	int no_addr, rpf;
 	int ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bb4199252026..5b1050a5d874 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1854,7 +1854,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			goto e_inval;
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 	} else if (fib_validate_source(saddr, 0, tos, 0,
-					dev, &spec_dst, &itag) < 0)
+					dev, &spec_dst, &itag, 0) < 0)
 		goto e_inval;
 
 	rth = dst_alloc(&ipv4_dst_ops);
@@ -1967,7 +1967,7 @@ static int __mkroute_input(struct sk_buff *skb,
 
 
 	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
-				  in_dev->dev, &spec_dst, &itag);
+				  in_dev->dev, &spec_dst, &itag, skb->mark);
 	if (err < 0) {
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
@@ -2141,7 +2141,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		int result;
 		result = fib_validate_source(saddr, daddr, tos,
 					     net->loopback_dev->ifindex,
-					     dev, &spec_dst, &itag);
+					     dev, &spec_dst, &itag, skb->mark);
 		if (result < 0)
 			goto martian_source;
 		if (result)
@@ -2170,7 +2170,7 @@ brd_input:
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 	else {
 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
-					  &itag);
+					  &itag, skb->mark);
 		if (err < 0)
 			goto martian_source;
 		if (err)
-- 
cgit v1.2.2


From 9d410c796067686b1e032d54ce475b7055537138 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 30 Oct 2009 05:03:53 +0000
Subject: net: fix sk_forward_alloc corruption

On UDP sockets, we must call skb_free_datagram() with socket locked,
or risk sk_forward_alloc corruption. This requirement is not respected
in SUNRPC.

Add a convenient helper, skb_free_datagram_locked() and use it in SUNRPC

Reported-by: Francis Moreau <francis.moro@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d0d436d6216c..0fa9f70e4b19 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -999,9 +999,7 @@ try_again:
 		err = ulen;
 
 out_free:
-	lock_sock(sk);
-	skb_free_datagram(sk, skb);
-	release_sock(sk);
+	skb_free_datagram_locked(sk, skb);
 out:
 	return err;
 
-- 
cgit v1.2.2


From 2e9526b352061ee0fd2a1580a2e3a5af960dabc4 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 30 Oct 2009 05:51:48 +0000
Subject: gre: Fix dev_addr clobbering for gretap

Nathan Neulinger noticed that gretap devices get their MAC address
from the local IP address, which results in invalid MAC addresses
half of the time.

This is because gretap is still using the tunnel netdev ops rather
than the correct tap netdev ops struct.

This patch also fixes changelink to not clobber the MAC address
for the gretap case.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Tested-by: Nathan Neulinger <nneul@mst.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 41ada9904d31..143333852624 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1464,7 +1464,7 @@ static void ipgre_tap_setup(struct net_device *dev)
 
 	ether_setup(dev);
 
-	dev->netdev_ops		= &ipgre_netdev_ops;
+	dev->netdev_ops		= &ipgre_tap_netdev_ops;
 	dev->destructor 	= free_netdev;
 
 	dev->iflink		= 0;
@@ -1525,25 +1525,29 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
 		if (t->dev != dev)
 			return -EEXIST;
 	} else {
-		unsigned nflags = 0;
-
 		t = nt;
 
-		if (ipv4_is_multicast(p.iph.daddr))
-			nflags = IFF_BROADCAST;
-		else if (p.iph.daddr)
-			nflags = IFF_POINTOPOINT;
+		if (dev->type != ARPHRD_ETHER) {
+			unsigned nflags = 0;
 
-		if ((dev->flags ^ nflags) &
-		    (IFF_POINTOPOINT | IFF_BROADCAST))
-			return -EINVAL;
+			if (ipv4_is_multicast(p.iph.daddr))
+				nflags = IFF_BROADCAST;
+			else if (p.iph.daddr)
+				nflags = IFF_POINTOPOINT;
+
+			if ((dev->flags ^ nflags) &
+			    (IFF_POINTOPOINT | IFF_BROADCAST))
+				return -EINVAL;
+		}
 
 		ipgre_tunnel_unlink(ign, t);
 		t->parms.iph.saddr = p.iph.saddr;
 		t->parms.iph.daddr = p.iph.daddr;
 		t->parms.i_key = p.i_key;
-		memcpy(dev->dev_addr, &p.iph.saddr, 4);
-		memcpy(dev->broadcast, &p.iph.daddr, 4);
+		if (dev->type != ARPHRD_ETHER) {
+			memcpy(dev->dev_addr, &p.iph.saddr, 4);
+			memcpy(dev->broadcast, &p.iph.daddr, 4);
+		}
 		ipgre_tunnel_link(ign, t);
 		netdev_state_change(dev);
 	}
-- 
cgit v1.2.2