From 98bdb0aa007ff7e8e0061936d8d0e210faf2e655 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 25 Jan 2011 08:17:48 -0800
Subject: libceph: fix socket read error handling

If we get EAGAIN when trying to read from the socket, it is not an error.
Return 0 from the helper in this case to simplify the error handling cases
in the caller (indirectly, try_read).

Fix try_read to pass any error to it's caller (con_work) instead of almost
always returning 0.  This let's us respond to things like socket
disconnects.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index dff633d62e5b..d95576d40c98 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -252,8 +252,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
 {
 	struct kvec iov = {buf, len};
 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	int r;
 
-	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+	r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+	if (r == -EAGAIN)
+		r = 0;
+	return r;
 }
 
 /*
@@ -1821,19 +1825,17 @@ more:
 			dout("try_read connecting\n");
 			ret = read_partial_banner(con);
 			if (ret <= 0)
-				goto done;
-			if (process_banner(con) < 0) {
-				ret = -1;
 				goto out;
-			}
+			ret = process_banner(con);
+			if (ret < 0)
+				goto out;
 		}
 		ret = read_partial_connect(con);
 		if (ret <= 0)
-			goto done;
-		if (process_connect(con) < 0) {
-			ret = -1;
 			goto out;
-		}
+		ret = process_connect(con);
+		if (ret < 0)
+			goto out;
 		goto more;
 	}
 
@@ -1848,7 +1850,7 @@ more:
 		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
 		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
 		if (ret <= 0)
-			goto done;
+			goto out;
 		con->in_base_pos += ret;
 		if (con->in_base_pos)
 			goto more;
@@ -1859,7 +1861,7 @@ more:
 		 */
 		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
 		if (ret <= 0)
-			goto done;
+			goto out;
 		dout("try_read got tag %d\n", (int)con->in_tag);
 		switch (con->in_tag) {
 		case CEPH_MSGR_TAG_MSG:
@@ -1870,7 +1872,7 @@ more:
 			break;
 		case CEPH_MSGR_TAG_CLOSE:
 			set_bit(CLOSED, &con->state);   /* fixme */
-			goto done;
+			goto out;
 		default:
 			goto bad_tag;
 		}
@@ -1882,13 +1884,12 @@ more:
 			case -EBADMSG:
 				con->error_msg = "bad crc";
 				ret = -EIO;
-				goto out;
+				break;
 			case -EIO:
 				con->error_msg = "io error";
-				goto out;
-			default:
-				goto done;
+				break;
 			}
+			goto out;
 		}
 		if (con->in_tag == CEPH_MSGR_TAG_READY)
 			goto more;
@@ -1898,15 +1899,13 @@ more:
 	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
 		ret = read_partial_ack(con);
 		if (ret <= 0)
-			goto done;
+			goto out;
 		process_ack(con);
 		goto more;
 	}
 
-done:
-	ret = 0;
 out:
-	dout("try_read done on %p\n", con);
+	dout("try_read done on %p ret %d\n", con, ret);
 	return ret;
 
 bad_tag:
-- 
cgit v1.2.2


From 42961d2333a1855c649fa3790e258ab4f0fa66a4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 25 Jan 2011 08:19:34 -0800
Subject: libceph: fix socket write error handling

Pass errors from writing to the socket up the stack.  If we get -EAGAIN,
return 0 from the helper to simplify the callers' checks.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d95576d40c98..35b36b86d762 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -268,13 +268,17 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
 		     size_t kvlen, size_t len, int more)
 {
 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	int r;
 
 	if (more)
 		msg.msg_flags |= MSG_MORE;
 	else
 		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
 
-	return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+	r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+	if (r == -EAGAIN)
+		r = 0;
+	return r;
 }
 
 
@@ -851,6 +855,8 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 		    (msg->pages || msg->pagelist || msg->bio || in_trail))
 			kunmap(page);
 
+		if (ret == -EAGAIN)
+			ret = 0;
 		if (ret <= 0)
 			goto out;
 
@@ -1741,16 +1747,12 @@ more_kvec:
 	if (con->out_skip) {
 		ret = write_partial_skip(con);
 		if (ret <= 0)
-			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_skip err %d\n", ret);
-			goto done;
-		}
+			goto out;
 	}
 	if (con->out_kvec_left) {
 		ret = write_partial_kvec(con);
 		if (ret <= 0)
-			goto done;
+			goto out;
 	}
 
 	/* msg pages? */
@@ -1765,11 +1767,11 @@ more_kvec:
 		if (ret == 1)
 			goto more_kvec;  /* we need to send the footer, too! */
 		if (ret == 0)
-			goto done;
+			goto out;
 		if (ret < 0) {
 			dout("try_write write_partial_msg_pages err %d\n",
 			     ret);
-			goto done;
+			goto out;
 		}
 	}
 
@@ -1793,10 +1795,9 @@ do_next:
 	/* Nothing to do! */
 	clear_bit(WRITE_PENDING, &con->state);
 	dout("try_write nothing else to write.\n");
-done:
 	ret = 0;
 out:
-	dout("try_write done on %p\n", con);
+	dout("try_write done on %p ret %d\n", con, ret);
 	return ret;
 }
 
-- 
cgit v1.2.2


From e733fb62082b3b187870dfba28d5f6730b8436c4 Mon Sep 17 00:00:00 2001
From: Bao Liang <tim.bao@gmail.com>
Date: Sat, 29 Jan 2011 21:39:37 +0800
Subject: Bluetooth: Set conn state to BT_DISCONN to avoid multiple responses

This patch fixes a minor issue that two connection responses will be sent
for one L2CAP connection request. If the L2CAP connection request is first
blocked due to security reason and responded with reason "security block",
the state of the connection remains BT_CONNECT2. If a pairing procedure
completes successfully before the ACL connection is down, local host will
send another connection complete response. See the following packets
captured by hcidump.

2010-12-07 22:21:24.928096 < ACL data: handle 12 flags 0x00 dlen 16
    0000: 0c 00 01 00 03 19 08 00  41 00 53 00 03 00 00 00  ........A.S.....
... ...

2010-12-07 22:21:35.791747 > HCI Event: Auth Complete (0x06) plen 3
    status 0x00 handle 12
... ...

2010-12-07 22:21:35.872372 > ACL data: handle 12 flags 0x02 dlen 16
    L2CAP(s): Connect rsp: dcid 0x0054 scid 0x0040 result 0 status 0
      Connection successful

Signed-off-by: Liang Bao <tim.bao@gmail.com>
Acked-by: Ville Tervo <ville.tervo@nokia.com>
Signed-off-by: Gustavo F. Padovan <padovan@profusion.mobi>
---
 net/bluetooth/l2cap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 7550abb0c96a..675614e38e14 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -859,6 +859,7 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
 				result = L2CAP_CR_SEC_BLOCK;
 			else
 				result = L2CAP_CR_BAD_PSM;
+			sk->sk_state = BT_DISCONN;
 
 			rsp.scid   = cpu_to_le16(l2cap_pi(sk)->dcid);
 			rsp.dcid   = cpu_to_le16(l2cap_pi(sk)->scid);
-- 
cgit v1.2.2


From a7b545f7fe753ca3dc1b51ca57f90cd59d974e44 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliad@wizery.com>
Date: Tue, 8 Feb 2011 18:43:19 +0200
Subject: mac80211: add missing locking in ieee80211_reconfig

When suspending an associated system, and then resuming,
the station vif is being reconfigured without taking the
sdata->u.mgd.mtx lock, which results in the following warning:

WARNING: at net/mac80211/mlme.c:101 ieee80211_ap_probereq_get+0x58/0xb8 [mac80211]()
Modules linked in: wl12xx_sdio wl12xx firmware_class crc7 mac80211 cfg80211 [last unloaded: crc7]
Backtrace:
[<c005432c>] (dump_backtrace+0x0/0x118) from [<c0376e28>] (dump_stack+0x20/0x24)
 r7:00000000 r6:bf12d6ec r5:bf154aac r4:00000065
[<c0376e08>] (dump_stack+0x0/0x24) from [<c0079104>] (warn_slowpath_common+0x5c/0x74)
[<c00790a8>] (warn_slowpath_common+0x0/0x74) from [<c0079148>] (warn_slowpath_null+0x2c/0x34)
 r9:000024ff r8:cd006460 r7:00000001 r6:00000000 r5:00000000
r4:cf1394a0
[<c007911c>] (warn_slowpath_null+0x0/0x34) from [<bf12d6ec>] (ieee80211_ap_probereq_get+0x58/0xb8 [mac80211])
[<bf12d694>] (ieee80211_ap_probereq_get+0x0/0xb8 [mac80211]) from [<bf19cd04>] (wl1271_cmd_build_ap_probe_req+0x30/0xf8 [wl12xx])
 r4:cd007440
[<bf19ccd4>] (wl1271_cmd_build_ap_probe_req+0x0/0xf8 [wl12xx]) from [<bf1995f4>] (wl1271_op_bss_info_changed+0x4c4/0x808 [wl12xx])
 r5:cd007440 r4:000003b4
[<bf199130>] (wl1271_op_bss_info_changed+0x0/0x808 [wl12xx]) from [<bf122168>] (ieee80211_bss_info_change_notify+0x1a4/0x1f8 [mac80211])
[<bf121fc4>] (ieee80211_bss_info_change_notify+0x0/0x1f8 [mac80211]) from [<bf141e80>] (ieee80211_reconfig+0x4d0/0x668 [mac80211])
 r8:cf0eeea4 r7:cd00671c r6:00000000 r5:cd006460 r4:cf1394a0
[<bf1419b0>] (ieee80211_reconfig+0x0/0x668 [mac80211]) from [<bf137dd4>] (ieee80211_resume+0x60/0x70 [mac80211])
[<bf137d74>] (ieee80211_resume+0x0/0x70 [mac80211]) from [<bf0eb930>] (wiphy_resume+0x6c/0x7c [cfg80211])
 r5:cd006248 r4:cd006110
[<bf0eb8c4>] (wiphy_resume+0x0/0x7c [cfg80211]) from [<c0241024>] (legacy_resume+0x38/0x70)
 r7:00000000 r6:00000000 r5:cd006248 r4:cd0062fc
[<c0240fec>] (legacy_resume+0x0/0x70) from [<c0241478>] (device_resume+0x168/0x1a0)
 r8:c04ca8d8 r7:cd00627c r6:00000010 r5:cd006248 r4:cd0062fc
[<c0241310>] (device_resume+0x0/0x1a0) from [<c0241600>] (dpm_resume_end+0xf8/0x3bc)
 r7:00000000 r6:00000005 r5:cd006248 r4:cd0062fc
[<c0241508>] (dpm_resume_end+0x0/0x3bc) from [<c00b2a24>] (suspend_devices_and_enter+0x1b0/0x204)
[<c00b2874>] (suspend_devices_and_enter+0x0/0x204) from [<c00b2b68>] (enter_state+0xf0/0x148)
 r7:c037e978 r6:00000003 r5:c043d807 r4:00000000
[<c00b2a78>] (enter_state+0x0/0x148) from [<c00b20a4>] (state_store+0xa4/0xcc)
 r7:c037e978 r6:00000003 r5:00000003 r4:c043d807
[<c00b2000>] (state_store+0x0/0xcc) from [<c01fc90c>] (kobj_attr_store+0x20/0x24)
[<c01fc8ec>] (kobj_attr_store+0x0/0x24) from [<c0157120>] (sysfs_write_file+0x11c/0x150)
[<c0157004>] (sysfs_write_file+0x0/0x150) from [<c0100f84>] (vfs_write+0xc0/0x14c)
[<c0100ec4>] (vfs_write+0x0/0x14c) from [<c01010e4>] (sys_write+0x4c/0x78)
 r8:40126000 r7:00000004 r6:cf1a7c80 r5:00000000 r4:00000000
[<c0101098>] (sys_write+0x0/0x78) from [<c00500c0>] (ret_fast_syscall+0x0/0x30)
 r8:c00502c8 r7:00000004 r6:403525e8 r5:40126000 r4:00000004

Signed-off-by: Eliad Peller <eliad@wizery.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/util.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index cf68700abffa..d036597aabbe 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1210,7 +1210,9 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_STATION:
 			changed |= BSS_CHANGED_ASSOC;
+			mutex_lock(&sdata->u.mgd.mtx);
 			ieee80211_bss_info_change_notify(sdata, changed);
+			mutex_unlock(&sdata->u.mgd.mtx);
 			break;
 		case NL80211_IFTYPE_ADHOC:
 			changed |= BSS_CHANGED_IBSS;
-- 
cgit v1.2.2


From 0b150932197b185ad5816932912e648116c7a96a Mon Sep 17 00:00:00 2001
From: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Date: Thu, 10 Feb 2011 23:08:33 -0800
Subject: xfrm: avoid possible oopse in xfrm_alloc_dst

Commit 80c802f3073e84 (xfrm: cache bundles instead of policies for
outgoing flows) introduced possible oopse when dst_alloc returns NULL.

Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_policy.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8b3ef404c794..6459588befc3 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1340,10 +1340,13 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 	default:
 		BUG();
 	}
-	xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS);
+	xdst = dst_alloc(dst_ops);
 	xfrm_policy_put_afinfo(afinfo);
 
-	xdst->flo.ops = &xfrm_bundle_fc_ops;
+	if (likely(xdst))
+		xdst->flo.ops = &xfrm_bundle_fc_ops;
+	else
+		xdst = ERR_PTR(-ENOBUFS);
 
 	return xdst;
 }
-- 
cgit v1.2.2


From 946bf5ee3c46f73b5cbd52aab594697b1a132d1f Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Fri, 11 Feb 2011 11:21:57 -0800
Subject: ip_gre: Add IPPROTO_GRE to flowi in ipgre_tunnel_xmit

Commit 5811662b15db018c740c57d037523683fd3e6123 ("net: use the macros
defined for the members of flowi") accidentally removed the setting of
IPPROTO_GRE from the struct flowi in ipgre_tunnel_xmit. This patch
restores it.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index eb68a0e34e49..6613edfac28c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -775,6 +775,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			.fl4_dst = dst,
 			.fl4_src = tiph->saddr,
 			.fl4_tos = RT_TOS(tos),
+			.proto = IPPROTO_GRE,
 			.fl_gre_key = tunnel->parms.o_key
 		};
 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
-- 
cgit v1.2.2


From 6b0d6a9b4296fa16a28d10d416db7a770fc03287 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 11 Feb 2011 12:36:55 +0000
Subject: bridge: Fix mglist corruption that leads to memory corruption

The list mp->mglist is used to indicate whether a multicast group
is active on the bridge interface itself as opposed to one of the
constituent interfaces in the bridge.

Unfortunately the operation that adds the mp->mglist node to the
list neglected to check whether it has already been added.  This
leads to list corruption in the form of nodes pointing to itself.

Normally this would be quite obvious as it would cause an infinite
loop when walking the list.  However, as this list is never actually
walked (which means that we don't really need it, I'll get rid of
it in a subsequent patch), this instead is hidden until we perform
a delete operation on the affected nodes.

As the same node may now be pointed to by more than one node, the
delete operations can then cause modification of freed memory.

This was observed in practice to cause corruption in 512-byte slabs,
most commonly leading to crashes in jbd2.

Thanks to Josef Bacik for pointing me in the right direction.

Reported-by: Ian Page Hands <ihands@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f701a21acb34..fdbd41c76ec4 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -719,7 +719,8 @@ static int br_multicast_add_group(struct net_bridge *br,
 		goto err;
 
 	if (!port) {
-		hlist_add_head(&mp->mglist, &br->mglist);
+		if (hlist_unhashed(&mp->mglist))
+			hlist_add_head(&mp->mglist, &br->mglist);
 		mod_timer(&mp->timer, now + br->multicast_membership_interval);
 		goto out;
 	}
-- 
cgit v1.2.2


From 24f9cdcbd743fd6adb8fb83688d8d86dcccde662 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 11 Feb 2011 12:42:07 +0000
Subject: bridge: Fix timer typo that may render snooping less effective

In a couple of spots where we are supposed to modify the port
group timer (p->timer) we instead modify the bridge interface
group timer (mp->timer).

The effect of this is mostly harmless.  However, it can cause
port subscriptions to be longer than they should be, thus making
snooping less effective.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index fdbd41c76ec4..c558274051eb 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1178,7 +1178,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 		if (timer_pending(&p->timer) ?
 		    time_after(p->timer.expires, now + max_delay) :
 		    try_to_del_timer_sync(&p->timer) >= 0)
-			mod_timer(&mp->timer, now + max_delay);
+			mod_timer(&p->timer, now + max_delay);
 	}
 
 out:
@@ -1249,7 +1249,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 		if (timer_pending(&p->timer) ?
 		    time_after(p->timer.expires, now + max_delay) :
 		    try_to_del_timer_sync(&p->timer) >= 0)
-			mod_timer(&mp->timer, now + max_delay);
+			mod_timer(&p->timer, now + max_delay);
 	}
 
 out:
-- 
cgit v1.2.2


From 8a870178c0ad1bae9994c99bd01eb10c9903e616 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 12 Feb 2011 01:05:42 -0800
Subject: bridge: Replace mp->mglist hlist with a bool

As it turns out we never need to walk through the list of multicast
groups subscribed by the bridge interface itself (the only time we'd
want to do that is when we shut down the bridge, in which case we
simply walk through all multicast groups), we don't really need to
keep an hlist for mp->mglist.

This means that we can replace it with just a single bit to indicate
whether the bridge interface is subscribed to a group.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_input.c     |  2 +-
 net/bridge/br_multicast.c | 16 +++++++---------
 net/bridge/br_private.h   |  3 +--
 3 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 6f6d8e1b776f..88e4aa9cb1f9 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -80,7 +80,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	if (is_multicast_ether_addr(dest)) {
 		mdst = br_mdb_get(br, skb);
 		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
-			if ((mdst && !hlist_unhashed(&mdst->mglist)) ||
+			if ((mdst && mdst->mglist) ||
 			    br_multicast_is_router(br))
 				skb2 = skb;
 			br_multicast_forward(mdst, skb, skb2);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index c558274051eb..09d5c0987925 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -232,8 +232,7 @@ static void br_multicast_group_expired(unsigned long data)
 	if (!netif_running(br->dev) || timer_pending(&mp->timer))
 		goto out;
 
-	if (!hlist_unhashed(&mp->mglist))
-		hlist_del_init(&mp->mglist);
+	mp->mglist = false;
 
 	if (mp->ports)
 		goto out;
@@ -276,7 +275,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
 		del_timer(&p->query_timer);
 		call_rcu_bh(&p->rcu, br_multicast_free_pg);
 
-		if (!mp->ports && hlist_unhashed(&mp->mglist) &&
+		if (!mp->ports && !mp->mglist &&
 		    netif_running(br->dev))
 			mod_timer(&mp->timer, jiffies);
 
@@ -528,7 +527,7 @@ static void br_multicast_group_query_expired(unsigned long data)
 	struct net_bridge *br = mp->br;
 
 	spin_lock(&br->multicast_lock);
-	if (!netif_running(br->dev) || hlist_unhashed(&mp->mglist) ||
+	if (!netif_running(br->dev) || !mp->mglist ||
 	    mp->queries_sent >= br->multicast_last_member_count)
 		goto out;
 
@@ -719,8 +718,7 @@ static int br_multicast_add_group(struct net_bridge *br,
 		goto err;
 
 	if (!port) {
-		if (hlist_unhashed(&mp->mglist))
-			hlist_add_head(&mp->mglist, &br->mglist);
+		mp->mglist = true;
 		mod_timer(&mp->timer, now + br->multicast_membership_interval);
 		goto out;
 	}
@@ -1166,7 +1164,7 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 
 	max_delay *= br->multicast_last_member_count;
 
-	if (!hlist_unhashed(&mp->mglist) &&
+	if (mp->mglist &&
 	    (timer_pending(&mp->timer) ?
 	     time_after(mp->timer.expires, now + max_delay) :
 	     try_to_del_timer_sync(&mp->timer) >= 0))
@@ -1237,7 +1235,7 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 		goto out;
 
 	max_delay *= br->multicast_last_member_count;
-	if (!hlist_unhashed(&mp->mglist) &&
+	if (mp->mglist &&
 	    (timer_pending(&mp->timer) ?
 	     time_after(mp->timer.expires, now + max_delay) :
 	     try_to_del_timer_sync(&mp->timer) >= 0))
@@ -1284,7 +1282,7 @@ static void br_multicast_leave_group(struct net_bridge *br,
 		     br->multicast_last_member_interval;
 
 	if (!port) {
-		if (!hlist_unhashed(&mp->mglist) &&
+		if (mp->mglist &&
 		    (timer_pending(&mp->timer) ?
 		     time_after(mp->timer.expires, time) :
 		     try_to_del_timer_sync(&mp->timer) >= 0)) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 84aac7734bfc..4e1b620b6be6 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -84,13 +84,13 @@ struct net_bridge_port_group {
 struct net_bridge_mdb_entry
 {
 	struct hlist_node		hlist[2];
-	struct hlist_node		mglist;
 	struct net_bridge		*br;
 	struct net_bridge_port_group __rcu *ports;
 	struct rcu_head			rcu;
 	struct timer_list		timer;
 	struct timer_list		query_timer;
 	struct br_ip			addr;
+	bool				mglist;
 	u32				queries_sent;
 };
 
@@ -238,7 +238,6 @@ struct net_bridge
 	spinlock_t			multicast_lock;
 	struct net_bridge_mdb_htable __rcu *mdb;
 	struct hlist_head		router_list;
-	struct hlist_head		mglist;
 
 	struct timer_list		multicast_router_timer;
 	struct timer_list		multicast_querier_timer;
-- 
cgit v1.2.2


From 7ec79270d7de0c8ca602c47cb25a9652ec28f37f Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Mon, 31 Jan 2011 12:00:59 +0000
Subject: net: dcb: application priority is per net_device

The app_data priority may not be the same for all net devices.
In order for stacks with application notifiers to identify the
specific net device dcb_app_type should be passed in the ptr.

This allows handlers to use dev_get_by_name() to pin priority
to net devices.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dcb/dcbnl.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 6b03f561caec..712ca026040a 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1613,6 +1613,10 @@ EXPORT_SYMBOL(dcb_getapp);
 u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
 {
 	struct dcb_app_type *itr;
+	struct dcb_app_type event;
+
+	memcpy(&event.name, dev->name, sizeof(event.name));
+	memcpy(&event.app, new, sizeof(event.app));
 
 	spin_lock(&dcb_lock);
 	/* Search for existing match and replace */
@@ -1644,7 +1648,7 @@ u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
 	}
 out:
 	spin_unlock(&dcb_lock);
-	call_dcbevent_notifiers(DCB_APP_EVENT, new);
+	call_dcbevent_notifiers(DCB_APP_EVENT, &event);
 	return 0;
 }
 EXPORT_SYMBOL(dcb_setapp);
-- 
cgit v1.2.2


From d3337de52af7fb0ebe605b02b740be4ee7dee9eb Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 10 Feb 2011 11:57:16 +0000
Subject: Don't potentially dereference NULL in net/dcb/dcbnl.c:dcbnl_getapp()

nla_nest_start() may return NULL. If it does then we'll blow up in
nla_nest_end() when we dereference the pointer.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dcb/dcbnl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 712ca026040a..d5074a567289 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -626,6 +626,9 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
 	dcb->cmd = DCB_CMD_GAPP;
 
 	app_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_APP);
+	if (!app_nest)
+		goto out_cancel;
+
 	ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_IDTYPE, idtype);
 	if (ret)
 		goto out_cancel;
-- 
cgit v1.2.2


From de9963f0f2dfad128b26ae7bf6005f5948416a6d Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 14 Feb 2011 17:35:07 +0100
Subject: netfilter: nf_iterate: fix incorrect RCU usage

As noticed by Eric, nf_iterate doesn't use RCU correctly by
accessing the prev pointer of a RCU protected list element when
a verdict of NF_REPEAT is issued.

Fix by jumping backwards to the hook invocation directly instead
of loading the previous list element before continuing the list
iteration.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 32fcbe290c04..4aa614b8a96a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -133,6 +133,7 @@ unsigned int nf_iterate(struct list_head *head,
 
 		/* Optimization: we don't need to hold module
 		   reference here, since function can't sleep. --RR */
+repeat:
 		verdict = elem->hook(hook, skb, indev, outdev, okfn);
 		if (verdict != NF_ACCEPT) {
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -145,7 +146,7 @@ unsigned int nf_iterate(struct list_head *head,
 #endif
 			if (verdict != NF_REPEAT)
 				return verdict;
-			*i = (*i)->prev;
+			goto repeat;
 		}
 	}
 	return NF_ACCEPT;
-- 
cgit v1.2.2


From d11327ad6695db8117c78d70611e71102ceec2ac Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Fri, 11 Feb 2011 07:44:16 +0000
Subject: arp_notify: unconditionally send gratuitous ARP for
 NETDEV_NOTIFY_PEERS.

NETDEV_NOTIFY_PEER is an explicit request by the driver to send a link
notification while NETDEV_UP/NETDEV_CHANGEADDR generate link
notifications as a sort of side effect.

In the later cases the sysctl option is present because link
notification events can have undesired effects e.g. if the link is
flapping. I don't think this applies in the case of an explicit
request from a driver.

This patch makes NETDEV_NOTIFY_PEER unconditional, if preferred we
could add a new sysctl for this case which defaults to on.

This change causes Xen post-migration ARP notifications (which cause
switches to relearn their MAC tables etc) to be sent by default.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 748cb5b337bd..df4616fce929 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1030,6 +1030,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu)
 	return mtu >= 68;
 }
 
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+					struct in_device *in_dev)
+
+{
+	struct in_ifaddr *ifa = in_dev->ifa_list;
+
+	if (!ifa)
+		return;
+
+	arp_send(ARPOP_REQUEST, ETH_P_ARP,
+		 ifa->ifa_address, dev,
+		 ifa->ifa_address, NULL,
+		 dev->dev_addr, NULL);
+}
+
 /* Called only under RTNL semaphore */
 
 static int inetdev_event(struct notifier_block *this, unsigned long event,
@@ -1082,18 +1097,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		}
 		ip_mc_up(in_dev);
 		/* fall through */
-	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_CHANGEADDR:
+		if (!IN_DEV_ARP_NOTIFY(in_dev))
+			break;
+		/* fall through */
+	case NETDEV_NOTIFY_PEERS:
 		/* Send gratuitous ARP to notify of link change */
-		if (IN_DEV_ARP_NOTIFY(in_dev)) {
-			struct in_ifaddr *ifa = in_dev->ifa_list;
-
-			if (ifa)
-				arp_send(ARPOP_REQUEST, ETH_P_ARP,
-					 ifa->ifa_address, dev,
-					 ifa->ifa_address, NULL,
-					 dev->dev_addr, NULL);
-		}
+		inetdev_send_gratuitous_arp(dev, in_dev);
 		break;
 	case NETDEV_DOWN:
 		ip_mc_down(in_dev);
-- 
cgit v1.2.2


From 840af824b2bf9194ea596e0ddc7aa05066794ca1 Mon Sep 17 00:00:00 2001
From: Vladislav P <vladisslav@inbox.ru>
Date: Mon, 14 Feb 2011 15:21:50 -0200
Subject: Bluetooth: Release BTM while sleeping to avoid deadlock

Signed-off-by: Vladislav P <vladisslav@inbox.ru>
Signed-off-by: Gustavo F. Padovan <padovan@profusion.mobi>
---
 net/bluetooth/rfcomm/tty.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 2575c2db6404..d7b9af4703d0 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -727,7 +727,9 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
 			break;
 		}
 
+		tty_unlock();
 		schedule();
+		tty_lock();
 	}
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&dev->wait, &wait);
-- 
cgit v1.2.2


From d503b30bd648b3cb4e5f50b65d27e389960cc6d9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fwestphal@astaro.com>
Date: Thu, 17 Feb 2011 11:32:38 +0100
Subject: netfilter: tproxy: do not assign timewait sockets to skb->sk

Assigning a socket in timewait state to skb->sk can trigger
kernel oops, e.g. in nfnetlink_log, which does:

if (skb->sk) {
        read_lock_bh(&skb->sk->sk_callback_lock);
        if (skb->sk->sk_socket && skb->sk->sk_socket->file) ...

in the timewait case, accessing sk->sk_callback_lock and sk->sk_socket
is invalid.

Either all of these spots will need to add a test for sk->sk_state != TCP_TIME_WAIT,
or xt_TPROXY must not assign a timewait socket to skb->sk.

This does the latter.

If a TW socket is found, assign the tproxy nfmark, but skip the skb->sk assignment,
thus mimicking behaviour of a '-m socket .. -j MARK/ACCEPT' re-routing rule.

The 'SYN to TW socket' case is left unchanged -- we try to redirect to the
listener socket.

Cc: Balazs Scheidler <bazsi@balabit.hu>
Cc: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Florian Westphal <fwestphal@astaro.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_tproxy_core.c | 27 ++++++++++++---------------
 net/netfilter/xt_TPROXY.c      | 22 ++++++++++++++++++++--
 net/netfilter/xt_socket.c      | 13 +++++++++++--
 3 files changed, 43 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 4d87befb04c0..474d621cbc2e 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -28,26 +28,23 @@ nf_tproxy_destructor(struct sk_buff *skb)
 	skb->destructor = NULL;
 
 	if (sk)
-		nf_tproxy_put_sock(sk);
+		sock_put(sk);
 }
 
 /* consumes sk */
-int
+void
 nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
 {
-	bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
-				inet_twsk(sk)->tw_transparent :
-				inet_sk(sk)->transparent;
-
-	if (transparent) {
-		skb_orphan(skb);
-		skb->sk = sk;
-		skb->destructor = nf_tproxy_destructor;
-		return 1;
-	} else
-		nf_tproxy_put_sock(sk);
-
-	return 0;
+	/* assigning tw sockets complicates things; most
+	 * skb->sk->X checks would have to test sk->sk_state first */
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(sk));
+		return;
+	}
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = nf_tproxy_destructor;
 }
 EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock);
 
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 640678f47a2a..dcfd57eb9d02 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -33,6 +33,20 @@
 #include <net/netfilter/nf_tproxy_core.h>
 #include <linux/netfilter/xt_TPROXY.h>
 
+static bool tproxy_sk_is_transparent(struct sock *sk)
+{
+	if (sk->sk_state != TCP_TIME_WAIT) {
+		if (inet_sk(sk)->transparent)
+			return true;
+		sock_put(sk);
+	} else {
+		if (inet_twsk(sk)->tw_transparent)
+			return true;
+		inet_twsk_put(inet_twsk(sk));
+	}
+	return false;
+}
+
 static inline __be32
 tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
 {
@@ -141,7 +155,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 					   skb->dev, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
-	if (sk && nf_tproxy_assign_sock(skb, sk)) {
+	if (sk && tproxy_sk_is_transparent(sk)) {
 		/* This should be in a separate target, but we don't do multiple
 		   targets on the same rule yet */
 		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -149,6 +163,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 		pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
 			 iph->protocol, &iph->daddr, ntohs(hp->dest),
 			 &laddr, ntohs(lport), skb->mark);
+
+		nf_tproxy_assign_sock(skb, sk);
 		return NF_ACCEPT;
 	}
 
@@ -306,7 +322,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 					   par->in, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
-	if (sk && nf_tproxy_assign_sock(skb, sk)) {
+	if (sk && tproxy_sk_is_transparent(sk)) {
 		/* This should be in a separate target, but we don't do multiple
 		   targets on the same rule yet */
 		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
@@ -314,6 +330,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
 			 tproto, &iph->saddr, ntohs(hp->source),
 			 laddr, ntohs(lport), skb->mark);
+
+		nf_tproxy_assign_sock(skb, sk);
 		return NF_ACCEPT;
 	}
 
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 00d6ae838303..9cc46356b577 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -35,6 +35,15 @@
 #include <net/netfilter/nf_conntrack.h>
 #endif
 
+static void
+xt_socket_put_sk(struct sock *sk)
+{
+	if (sk->sk_state == TCP_TIME_WAIT)
+		inet_twsk_put(inet_twsk(sk));
+	else
+		sock_put(sk);
+}
+
 static int
 extract_icmp4_fields(const struct sk_buff *skb,
 		    u8 *protocol,
@@ -164,7 +173,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 				       (sk->sk_state == TCP_TIME_WAIT &&
 					inet_twsk(sk)->tw_transparent));
 
-		nf_tproxy_put_sock(sk);
+		xt_socket_put_sk(sk);
 
 		if (wildcard || !transparent)
 			sk = NULL;
@@ -298,7 +307,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
 				       (sk->sk_state == TCP_TIME_WAIT &&
 					inet_twsk(sk)->tw_transparent));
 
-		nf_tproxy_put_sock(sk);
+		xt_socket_put_sk(sk);
 
 		if (wildcard || !transparent)
 			sk = NULL;
-- 
cgit v1.2.2


From 0af320fb4627033e49cbc6e8138e7aa75ab8352a Mon Sep 17 00:00:00 2001
From: Joerg Marx <joerg.marx@secunet.com>
Date: Thu, 17 Feb 2011 16:23:40 +0100
Subject: netfilter: ip6t_LOG: fix a flaw in printing the MAC

The flaw was in skipping the second byte in MAC header due to increasing
the pointer AND indexed access starting at '1'.

Signed-off-by: Joerg Marx <joerg.marx@secunet.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/ip6t_LOG.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 09c88891a753..de338037a736 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -410,7 +410,7 @@ fallback:
 		if (p != NULL) {
 			sb_add(m, "%02x", *p++);
 			for (i = 1; i < len; i++)
-				sb_add(m, ":%02x", p[i]);
+				sb_add(m, ":%02x", *p++);
 		}
 		sb_add(m, " ");
 
-- 
cgit v1.2.2


From 214f45c91bbda8321d9676f1197238e4663edcbb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 18 Feb 2011 11:39:01 -0800
Subject: net: provide default_advmss() methods to blackhole dst_ops

Commit 0dbaee3b37e118a (net: Abstract default ADVMSS behind an
accessor.) introduced a possible crash in tcp_connect_init(), when
dst->default_advmss() is called from dst_metric_advmss()

Reported-by: George Spelvin <linux@horizon.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 1 +
 net/ipv6/route.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 788a3e74834e..6ed6603c2f6d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2722,6 +2722,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.destroy		=	ipv4_dst_destroy,
 	.check			=	ipv4_blackhole_dst_check,
 	.default_mtu		=	ipv4_blackhole_default_mtu,
+	.default_advmss		=	ipv4_default_advmss,
 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
 };
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1c29f95695de..a998db6e7895 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -128,6 +128,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {
 	.destroy		=	ip6_dst_destroy,
 	.check			=	ip6_dst_check,
 	.default_mtu		=	ip6_blackhole_default_mtu,
+	.default_advmss		=	ip6_default_advmss,
 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
 };
 
-- 
cgit v1.2.2


From f87e6f47933e3ebeced9bb12615e830a72cedce4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 17 Feb 2011 22:54:38 +0000
Subject: net: dont leave active on stack LIST_HEAD

Eric W. Biderman and Michal Hocko reported various memory corruptions
that we suspected to be related to a LIST head located on stack, that
was manipulated after thread left function frame (and eventually exited,
so its stack was freed and reused).

Eric Dumazet suggested the problem was probably coming from commit
443457242beb (net: factorize
sync-rcu call in unregister_netdevice_many)

This patch fixes __dev_close() and dev_close() to properly deinit their
respective LIST_HEAD(single) before exiting.

References: https://lkml.org/lkml/2011/2/16/304
References: https://lkml.org/lkml/2011/2/14/223

Reported-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Michal Hocko <mhocko@suse.cz>
Reported-by: Eric W. Biderman <ebiderman@xmission.com>
Tested-by: Eric W. Biderman <ebiderman@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8e726cb47ed7..a18c1643ea9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1280,10 +1280,13 @@ static int __dev_close_many(struct list_head *head)
 
 static int __dev_close(struct net_device *dev)
 {
+	int retval;
 	LIST_HEAD(single);
 
 	list_add(&dev->unreg_list, &single);
-	return __dev_close_many(&single);
+	retval = __dev_close_many(&single);
+	list_del(&single);
+	return retval;
 }
 
 int dev_close_many(struct list_head *head)
@@ -1325,7 +1328,7 @@ int dev_close(struct net_device *dev)
 
 	list_add(&dev->unreg_list, &single);
 	dev_close_many(&single);
-
+	list_del(&single);
 	return 0;
 }
 EXPORT_SYMBOL(dev_close);
-- 
cgit v1.2.2


From ceaaec98ad99859ac90ac6863ad0a6cd075d8e0e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 17 Feb 2011 22:59:19 +0000
Subject: net: deinit automatic LIST_HEAD

commit 9b5e383c11b08784 (net: Introduce
unregister_netdevice_many()) left an active LIST_HEAD() in
rollback_registered(), with possible memory corruption.

Even if device is freed without touching its unreg_list (and therefore
touching the previous memory location holding LISTE_HEAD(single), better
close the bug for good, since its really subtle.

(Same fix for default_device_exit_batch() for completeness)

Reported-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Michal Hocko <mhocko@suse.cz>
Reported-by: Eric W. Biderman <ebiderman@xmission.com>
Tested-by: Eric W. Biderman <ebiderman@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Octavian Purdila <opurdila@ixiacom.com>
CC: stable <stable@kernel.org> [.33+]
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index a18c1643ea9f..8ae6631abcc2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5066,6 +5066,7 @@ static void rollback_registered(struct net_device *dev)
 
 	list_add(&dev->unreg_list, &single);
 	rollback_registered_many(&single);
+	list_del(&single);
 }
 
 unsigned long netdev_fix_features(unsigned long features, const char *name)
@@ -6219,6 +6220,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 		}
 	}
 	unregister_netdevice_many(&dev_kill_list);
+	list_del(&dev_kill_list);
 	rtnl_unlock();
 }
 
-- 
cgit v1.2.2


From 05e7c99136554789e4cc060a63334ccaa08ad62d Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Fri, 18 Feb 2011 09:05:08 +0100
Subject: mac80211: fix conn_mon_timer running after disassociate

Low level driver could pass rx frames to us after disassociate, what
can lead to run conn_mon_timer by ieee80211_sta_rx_notify(). That
is obviously wrong, but nothing happens until we unload modules and
resources are used after free. If kernel debugging is enabled following
warning could be observed:

WARNING: at lib/debugobjects.c:259 debug_print_object+0x65/0x70()
Hardware name: HP xw8600 Workstation
ODEBUG: free active (active state 0) object type: timer_list
Modules linked in: iwlagn(-) iwlcore mac80211 cfg80211 aes_x86_64 aes_generic fuse cpufreq_ondemand acpi_cpufreq freq_table mperf xt_physdev ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput hp_wmi sparse_keymap sg wmi arc4 microcode serio_raw ecb tg3 shpchp rfkill ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif firewire_ohci firewire_core crc_itu_t mptsas mptscsih mptbase scsi_transport_sas ahci libahci pata_acpi ata_generic ata_piix floppy nouveau ttm drm_kms_helper drm i2c_algo_bit i2c_core video [last unloaded: cfg80211]
Pid: 13827, comm: rmmod Tainted: G        W   2.6.38-rc4-wl+ #22
Call Trace:
 [<ffffffff810649cf>] ? warn_slowpath_common+0x7f/0xc0
 [<ffffffff81064ac6>] ? warn_slowpath_fmt+0x46/0x50
 [<ffffffff81226fc5>] ? debug_print_object+0x65/0x70
 [<ffffffff81227625>] ? debug_check_no_obj_freed+0x125/0x210
 [<ffffffff8109ebd7>] ? debug_check_no_locks_freed+0xf7/0x170
 [<ffffffff81156092>] ? kfree+0xc2/0x2f0
 [<ffffffff813ec5c5>] ? netdev_release+0x45/0x60
 [<ffffffff812f1067>] ? device_release+0x27/0xa0
 [<ffffffff81216ddd>] ? kobject_release+0x8d/0x1a0
 [<ffffffff81216d50>] ? kobject_release+0x0/0x1a0
 [<ffffffff812183b7>] ? kref_put+0x37/0x70
 [<ffffffff81216c57>] ? kobject_put+0x27/0x60
 [<ffffffff813d5d1b>] ? netdev_run_todo+0x1ab/0x270
 [<ffffffff813e771e>] ? rtnl_unlock+0xe/0x10
 [<ffffffffa0581188>] ? ieee80211_unregister_hw+0x58/0x120 [mac80211]
 [<ffffffffa0377ed7>] ? iwl_pci_remove+0xdb/0x22a [iwlagn]
 [<ffffffff8123cde2>] ? pci_device_remove+0x52/0x120
 [<ffffffff812f5205>] ? __device_release_driver+0x75/0xe0
 [<ffffffff812f5348>] ? driver_detach+0xd8/0xe0
 [<ffffffff812f4111>] ? bus_remove_driver+0x91/0x100
 [<ffffffff812f5b62>] ? driver_unregister+0x62/0xa0
 [<ffffffff8123d194>] ? pci_unregister_driver+0x44/0xa0
 [<ffffffffa0377df5>] ? iwl_exit+0x15/0x1c [iwlagn]
 [<ffffffff810ab492>] ? sys_delete_module+0x1a2/0x270
 [<ffffffff81498889>] ? trace_hardirqs_on_thunk+0x3a/0x3f
 [<ffffffff8100bf42>] ? system_call_fastpath+0x16/0x1b

Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mlme.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 45fbb9e33746..c9ceb4d57ab0 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1033,6 +1033,12 @@ void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
 	if (is_multicast_ether_addr(hdr->addr1))
 		return;
 
+	/*
+	 * In case we receive frames after disassociation.
+	 */
+	if (!sdata->u.mgd.associated)
+		return;
+
 	ieee80211_sta_reset_conn_monitor(sdata);
 }
 
-- 
cgit v1.2.2


From 91035f0b7d89291af728b6f3e370c3be58fcbe1b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 18 Feb 2011 22:35:56 +0000
Subject: tcp: fix inet_twsk_deschedule()

Eric W. Biederman reported a lockdep splat in inet_twsk_deschedule()

This is caused by inet_twsk_purge(), run from process context,
and commit 575f4cd5a5b6394577 (net: Use rcu lookups in inet_twsk_purge.)
removed the BH disabling that was necessary.

Add the BH disabling but fine grained, right before calling
inet_twsk_deschedule(), instead of whole function.

With help from Linus Torvalds and Eric W. Biederman

Reported-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Daniel Lezcano <daniel.lezcano@free.fr>
CC: Pavel Emelyanov <xemul@openvz.org>
CC: Arnaldo Carvalho de Melo <acme@redhat.com>
CC: stable <stable@kernel.org> (# 2.6.33+)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_timewait_sock.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c5af909cf701..3c8dfa16614d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -505,7 +505,9 @@ restart:
 			}
 
 			rcu_read_unlock();
+			local_bh_disable();
 			inet_twsk_deschedule(tw, twdr);
+			local_bh_enable();
 			inet_twsk_put(tw);
 			goto restart_rcu;
 		}
-- 
cgit v1.2.2


From 2205a6ea93fea76f88b43727fea53f3ce3790d6f Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 17 Feb 2011 13:12:08 +0000
Subject: sctp: fix reporting of unknown parameters

commit 5fa782c2f5ef6c2e4f04d3e228412c9b4a4c8809 re-worked the
handling of unknown parameters. sctp_init_cause_fixed() can now
return -ENOSPC if there is not enough tailroom in the error
chunk skb. When this happens, the error header is not appended to
the error chunk. In that case, the payload of the unknown parameter
should not be appended either.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 2cc46f0962ca..b23428f3c0dd 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2029,11 +2029,11 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
 			*errp = sctp_make_op_error_fixed(asoc, chunk);
 
 		if (*errp) {
-			sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
-					WORD_ROUND(ntohs(param.p->length)));
-			sctp_addto_chunk_fixed(*errp,
-					WORD_ROUND(ntohs(param.p->length)),
-					param.v);
+			if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+					WORD_ROUND(ntohs(param.p->length))))
+				sctp_addto_chunk_fixed(*errp,
+						WORD_ROUND(ntohs(param.p->length)),
+						param.v);
 		} else {
 			/* If there is no memory for generating the ERROR
 			 * report as specified, an ABORT will be triggered
-- 
cgit v1.2.2


From 5f04d5068a90602b93a7953e9a47c496705c6976 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 20 Feb 2011 11:49:45 -0800
Subject: net: Fix more stale on-stack list_head objects.

From: Eric W. Biederman <ebiederm@xmission.com>

In the beginning with batching unreg_list was a list that was used only
once in the lifetime of a network device (I think).  Now we have calls
using the unreg_list that can happen multiple times in the life of a
network device like dev_deactivate and dev_close that are also using the
unreg_list.  In addition in unregister_netdevice_queue we also do a
list_move because for devices like veth pairs it is possible that
unregister_netdevice_queue will be called multiple times.

So I think the change below to fix dev_deactivate which Eric D. missed
will fix this problem.  Now to go test that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mac80211/iface.c    | 1 +
 net/sched/sch_generic.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 8acba456744e..7a10a8d1b2d0 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1229,6 +1229,7 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
 	}
 	mutex_unlock(&local->iflist_mtx);
 	unregister_netdevice_many(&unreg_list);
+	list_del(&unreg_list);
 }
 
 static u32 ieee80211_idle_off(struct ieee80211_local *local,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598440a2..1bc698039ae2 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -839,6 +839,7 @@ void dev_deactivate(struct net_device *dev)
 
 	list_add(&dev->unreg_list, &single);
 	dev_deactivate_many(&single);
+	list_del(&single);
 }
 
 static void dev_init_scheduler_queue(struct net_device *dev,
-- 
cgit v1.2.2


From c24f691b56107feeba076616982093ee2d3c8fb5 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 7 Feb 2011 12:57:04 +0000
Subject: tcp: undo_retrans counter fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a bug that undo_retrans is incorrectly decremented when undo_marker is
not set or undo_retrans is already 0. This happens when sender receives
more DSACK ACKs than packets retransmitted during the current
undo phase. This may also happen when sender receives DSACK after
the undo operation is completed or cancelled.

Fix another bug that undo_retrans is incorrectly incremented when
sender retransmits an skb and tcp_skb_pcount(skb) > 1 (TSO). This case
is rare but not impossible.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c  | 5 +++--
 net/ipv4/tcp_output.c | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eb7f82ebf4a3..65f6c0406245 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1222,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
 	}
 
 	/* D-SACK for already forgotten data... Do dumb counting. */
-	if (dup_sack &&
+	if (dup_sack && tp->undo_marker && tp->undo_retrans &&
 	    !after(end_seq_0, prior_snd_una) &&
 	    after(end_seq_0, tp->undo_marker))
 		tp->undo_retrans--;
@@ -1299,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
 	/* Account D-SACK for retransmitted packet. */
 	if (dup_sack && (sacked & TCPCB_RETRANS)) {
-		if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+		if (tp->undo_marker && tp->undo_retrans &&
+		    after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
 			tp->undo_retrans--;
 		if (sacked & TCPCB_SACKED_ACKED)
 			state->reord = min(fack_count, state->reord);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 406f320336e6..dfa5beb0c1c8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2162,7 +2162,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
 
-		tp->undo_retrans++;
+		tp->undo_retrans += tcp_skb_pcount(skb);
 
 		/* snd_nxt is stored to detect loss of retransmitted segment,
 		 * see tcp_input.c tcp_sacktag_write_queue().
-- 
cgit v1.2.2


From 4f919a3bc54da01db829c520ce4b1fabfde1c3f7 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Tue, 22 Feb 2011 00:11:06 +0800
Subject: fix cfg80211_wext_siwfreq lock ordering...

I previously managed to reproduce a hang while scanning wireless
channels (reproducible with airodump-ng hopping channels); subsequent
lockdep instrumentation revealed a lock ordering issue.

Without knowing the design intent, it looks like the locks should be
taken in reverse order; please comment.

=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.38-rc5-341cd #4
-------------------------------------------------------
airodump-ng/15445 is trying to acquire lock:
 (&rdev->devlist_mtx){+.+.+.}, at: [<ffffffff816b1266>]
cfg80211_wext_siwfreq+0xc6/0x100

but task is already holding lock:
 (&wdev->mtx){+.+.+.}, at: [<ffffffff816b125c>] cfg80211_wext_siwfreq+0xbc/0x100

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&wdev->mtx){+.+.+.}:
       [<ffffffff810a79d6>] lock_acquire+0xc6/0x280
       [<ffffffff816d6bce>] mutex_lock_nested+0x6e/0x4b0
       [<ffffffff81696080>] cfg80211_netdev_notifier_call+0x430/0x5f0
       [<ffffffff8109351b>] notifier_call_chain+0x8b/0x100
       [<ffffffff810935b1>] raw_notifier_call_chain+0x11/0x20
       [<ffffffff81576d92>] call_netdevice_notifiers+0x32/0x60
       [<ffffffff815771a4>] __dev_notify_flags+0x34/0x80
       [<ffffffff81577230>] dev_change_flags+0x40/0x70
       [<ffffffff8158587c>] do_setlink+0x1fc/0x8d0
       [<ffffffff81586042>] rtnl_setlink+0xf2/0x140
       [<ffffffff81586923>] rtnetlink_rcv_msg+0x163/0x270
       [<ffffffff8159d741>] netlink_rcv_skb+0xa1/0xd0
       [<ffffffff815867b0>] rtnetlink_rcv+0x20/0x30
       [<ffffffff8159d39a>] netlink_unicast+0x2ba/0x300
       [<ffffffff8159dd57>] netlink_sendmsg+0x267/0x3e0
       [<ffffffff8155e364>] sock_sendmsg+0xe4/0x110
       [<ffffffff8155f3a3>] sys_sendmsg+0x253/0x3b0
       [<ffffffff81003192>] system_call_fastpath+0x16/0x1b

-> #0 (&rdev->devlist_mtx){+.+.+.}:
       [<ffffffff810a7222>] __lock_acquire+0x1622/0x1d10
       [<ffffffff810a79d6>] lock_acquire+0xc6/0x280
       [<ffffffff816d6bce>] mutex_lock_nested+0x6e/0x4b0
       [<ffffffff816b1266>] cfg80211_wext_siwfreq+0xc6/0x100
       [<ffffffff816b2fad>] ioctl_standard_call+0x5d/0xd0
       [<ffffffff816b3223>] T.808+0x163/0x170
       [<ffffffff816b326a>] wext_handle_ioctl+0x3a/0x90
       [<ffffffff815798d2>] dev_ioctl+0x6f2/0x830
       [<ffffffff8155cf3d>] sock_ioctl+0xfd/0x290
       [<ffffffff8117dffd>] do_vfs_ioctl+0x9d/0x590
       [<ffffffff8117e53a>] sys_ioctl+0x4a/0x80
       [<ffffffff81003192>] system_call_fastpath+0x16/0x1b

other info that might help us debug this:

2 locks held by airodump-ng/15445:
 #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff81586782>] rtnl_lock+0x12/0x20
 #1:  (&wdev->mtx){+.+.+.}, at: [<ffffffff816b125c>]
cfg80211_wext_siwfreq+0xbc/0x100

stack backtrace:
Pid: 15445, comm: airodump-ng Not tainted 2.6.38-rc5-341cd #4
Call Trace:
 [<ffffffff810a3f0a>] ? print_circular_bug+0xfa/0x100
 [<ffffffff810a7222>] ? __lock_acquire+0x1622/0x1d10
 [<ffffffff810a1f99>] ? trace_hardirqs_off_caller+0x29/0xc0
 [<ffffffff810a79d6>] ? lock_acquire+0xc6/0x280
 [<ffffffff816b1266>] ? cfg80211_wext_siwfreq+0xc6/0x100
 [<ffffffff810a31d7>] ? mark_held_locks+0x67/0x90
 [<ffffffff816d6bce>] ? mutex_lock_nested+0x6e/0x4b0
 [<ffffffff816b1266>] ? cfg80211_wext_siwfreq+0xc6/0x100
 [<ffffffff810a31d7>] ? mark_held_locks+0x67/0x90
 [<ffffffff816b1266>] ? cfg80211_wext_siwfreq+0xc6/0x100
 [<ffffffff816b1266>] ? cfg80211_wext_siwfreq+0xc6/0x100
 [<ffffffff816b2fad>] ? ioctl_standard_call+0x5d/0xd0
 [<ffffffff8157818b>] ? __dev_get_by_name+0x9b/0xc0
 [<ffffffff816b2f50>] ? ioctl_standard_call+0x0/0xd0
 [<ffffffff816b3223>] ? T.808+0x163/0x170
 [<ffffffff8112ddf2>] ? might_fault+0x72/0xd0
 [<ffffffff816b326a>] ? wext_handle_ioctl+0x3a/0x90
 [<ffffffff8112de3b>] ? might_fault+0xbb/0xd0
 [<ffffffff815798d2>] ? dev_ioctl+0x6f2/0x830
 [<ffffffff810a1bae>] ? put_lock_stats+0xe/0x40
 [<ffffffff810a1c8c>] ? lock_release_holdtime+0xac/0x150
 [<ffffffff8155cf3d>] ? sock_ioctl+0xfd/0x290
 [<ffffffff8117dffd>] ? do_vfs_ioctl+0x9d/0x590
 [<ffffffff8116c8ff>] ? fget_light+0x1df/0x3c0
 [<ffffffff8117e53a>] ? sys_ioctl+0x4a/0x80
 [<ffffffff81003192>] ? system_call_fastpath+0x16/0x1b

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/wireless/wext-compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 3e5dbd4e4cd5..d112f038edf0 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -802,11 +802,11 @@ int cfg80211_wext_siwfreq(struct net_device *dev,
 			return freq;
 		if (freq == 0)
 			return -EINVAL;
-		wdev_lock(wdev);
 		mutex_lock(&rdev->devlist_mtx);
+		wdev_lock(wdev);
 		err = cfg80211_set_freq(rdev, wdev, freq, NL80211_CHAN_NO_HT);
-		mutex_unlock(&rdev->devlist_mtx);
 		wdev_unlock(wdev);
+		mutex_unlock(&rdev->devlist_mtx);
 		return err;
 	default:
 		return -EOPNOTSUPP;
-- 
cgit v1.2.2


From 9cc6e0c4c457f84bedcfb04e7dd58a36909c4ef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Tue, 15 Feb 2011 13:19:17 +0000
Subject: bridge: Fix IPv6 multicast snooping by storing correct protocol type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The protocol type for IPv6 entries in the hash table for multicast
bridge snooping is falsely set to ETH_P_IP, marking it as an IPv4
address, instead of setting it to ETH_P_IPV6, which results in negative
look-ups in the hash table later.

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 09d5c0987925..17708fccf1ee 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -784,7 +784,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
 		return 0;
 
 	ipv6_addr_copy(&br_group.u.ip6, group);
-	br_group.proto = htons(ETH_P_IP);
+	br_group.proto = htons(ETH_P_IPV6);
 
 	return br_multicast_add_group(br, port, &br_group);
 }
-- 
cgit v1.2.2


From 649e984d00416cb1a254fdbebd6d3f9fa01c32fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Tue, 15 Feb 2011 13:19:18 +0000
Subject: bridge: Fix IPv6 multicast snooping by correcting offset in MLDv2
 report
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We actually want a pointer to the grec_nsrcr and not the following
field. Otherwise we can get very high values for *nsrcs as the first two
bytes of the IPv6 multicast address are being used instead, leading to
a failing pskb_may_pull() which results in MLDv2 reports not being
parsed.

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 17708fccf1ee..d69beaf83627 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1013,7 +1013,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 
 		nsrcs = skb_header_pointer(skb,
 					   len + offsetof(struct mld2_grec,
-							  grec_mca),
+							  grec_nsrcs),
 					   sizeof(_nsrcs), &_nsrcs);
 		if (!nsrcs)
 			return -EINVAL;
-- 
cgit v1.2.2


From d41db9f3f71548f07b8b6d81a88220d0035b04f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Tue, 15 Feb 2011 13:19:19 +0000
Subject: bridge: Add missing ntohs()s for MLDv2 report parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nsrcs number is 2 Byte wide, therefore we need to call ntohs()
before using it.

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d69beaf83627..9ce2af187709 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1020,11 +1020,12 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 
 		if (!pskb_may_pull(skb,
 				   len + sizeof(*grec) +
-				   sizeof(struct in6_addr) * (*nsrcs)))
+				   sizeof(struct in6_addr) * ntohs(*nsrcs)))
 			return -EINVAL;
 
 		grec = (struct mld2_grec *)(skb->data + len);
-		len += sizeof(*grec) + sizeof(struct in6_addr) * (*nsrcs);
+		len += sizeof(*grec) +
+		       sizeof(struct in6_addr) * ntohs(*nsrcs);
 
 		/* We treat these as MLDv1 reports for now. */
 		switch (grec->grec_type) {
-- 
cgit v1.2.2


From e4de9f9e8333fbbae951c6e068f501f955123cf0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Tue, 15 Feb 2011 13:19:21 +0000
Subject: bridge: Allow mcast snooping for transient link local addresses too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the multicast bridge snooping support is not active for
link local multicast. I assume this has been done to leave
important multicast data untouched, like IPv6 Neighborhood Discovery.

In larger, bridged, local networks it could however be desirable to
optimize for instance local multicast audio/video streaming too.

With the transient flag in IPv6 multicast addresses we have an easy
way to optimize such multimedia traffic without tempering with the
high priority multicast data from well-known addresses.

This patch alters the multicast bridge snooping for IPv6, to take
effect for transient multicast addresses instead of non-link-local
addresses.

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 9ce2af187709..1207a5a0688a 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -37,10 +37,9 @@
 	rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static inline int ipv6_is_local_multicast(const struct in6_addr *addr)
+static inline int ipv6_is_transient_multicast(const struct in6_addr *addr)
 {
-	if (ipv6_addr_is_multicast(addr) &&
-	    IPV6_ADDR_MC_SCOPE(addr) <= IPV6_ADDR_SCOPE_LINKLOCAL)
+	if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr))
 		return 1;
 	return 0;
 }
@@ -780,7 +779,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
 {
 	struct br_ip br_group;
 
-	if (ipv6_is_local_multicast(group))
+	if (!ipv6_is_transient_multicast(group))
 		return 0;
 
 	ipv6_addr_copy(&br_group.u.ip6, group);
@@ -1341,7 +1340,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
 {
 	struct br_ip br_group;
 
-	if (ipv6_is_local_multicast(group))
+	if (!ipv6_is_transient_multicast(group))
 		return;
 
 	ipv6_addr_copy(&br_group.u.ip6, group);
-- 
cgit v1.2.2


From 36cff5a10c6b003fa2d0464848d5664b2bf723e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Thu, 17 Feb 2011 08:17:51 +0000
Subject: bridge: Fix MLD queries' ethernet source address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Map the IPv6 header's destination multicast address to an ethernet
source address instead of the MLD queries multicast address.

For instance for a general MLD query (multicast address in the MLD query
set to ::), this would wrongly be mapped to 33:33:00:00:00:00, although
an MLD queries destination MAC should always be 33:33:00:00:00:01 which
matches the IPv6 header's multicast destination ff02::1.

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 1207a5a0688a..c1f24e4d0820 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -434,7 +434,6 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
 	eth = eth_hdr(skb);
 
 	memcpy(eth->h_source, br->dev->dev_addr, 6);
-	ipv6_eth_mc_map(group, eth->h_dest);
 	eth->h_proto = htons(ETH_P_IPV6);
 	skb_put(skb, sizeof(*eth));
 
@@ -448,6 +447,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
 	ip6h->hop_limit = 1;
 	ipv6_addr_set(&ip6h->saddr, 0, 0, 0, 0);
 	ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
+	ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
 
 	hopopt = (u8 *)(ip6h + 1);
 	hopopt[0] = IPPROTO_ICMPV6;		/* next hdr */
-- 
cgit v1.2.2


From fe29ec41aaa51902aebd63658dfb04fe6fea8be5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Thu, 17 Feb 2011 08:17:52 +0000
Subject: bridge: Use IPv6 link-local address for multicast listener queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the bridge multicast snooping feature periodically issues
IPv6 general multicast listener queries to sense the absence of a
listener.

For this, it uses :: as its source address - however RFC 2710 requires:
"To be valid, the Query message MUST come from a link-local IPv6 Source
Address". Current Linux kernel versions seem to follow this requirement
and ignore our bogus MLD queries.

With this commit a link local address from the bridge interface is being
used to issue the MLD query, resulting in other Linux devices which are
multicast listeners in the network to respond with a MLD response (which
was not the case before).

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index c1f24e4d0820..030a002ff8ee 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -445,7 +445,8 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
 	ip6h->payload_len = htons(8 + sizeof(*mldq));
 	ip6h->nexthdr = IPPROTO_HOPOPTS;
 	ip6h->hop_limit = 1;
-	ipv6_addr_set(&ip6h->saddr, 0, 0, 0, 0);
+	ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
+			   &ip6h->saddr);
 	ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
 	ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
 
-- 
cgit v1.2.2


From c486da34390846b430896a407b47f0cea3a4189c Mon Sep 17 00:00:00 2001
From: Lucian Adrian Grijincu <lucian.grijincu@gmail.com>
Date: Thu, 24 Feb 2011 19:48:03 +0000
Subject: sysctl: ipv6: use correct net in ipv6_sysctl_rtcache_flush

Before this patch issuing these commands:

  fd = open("/proc/sys/net/ipv6/route/flush")
  unshare(CLONE_NEWNET)
  write(fd, "stuff")

would flush the newly created net, not the original one.

The equivalent ipv4 code is correct (stores the net inside ->extra1).
Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a998db6e7895..904312e25a3c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2557,14 +2557,16 @@ static
 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
 			      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct net *net = current->nsproxy->net_ns;
-	int delay = net->ipv6.sysctl.flush_delay;
-	if (write) {
-		proc_dointvec(ctl, write, buffer, lenp, ppos);
-		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
-		return 0;
-	} else
+	struct net *net;
+	int delay;
+	if (!write)
 		return -EINVAL;
+
+	net = (struct net *)ctl->extra1;
+	delay = net->ipv6.sysctl.flush_delay;
+	proc_dointvec(ctl, write, buffer, lenp, ppos);
+	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
+	return 0;
 }
 
 ctl_table ipv6_route_table_template[] = {
@@ -2651,6 +2653,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
 
 	if (table) {
 		table[0].data = &net->ipv6.sysctl.flush_delay;
+		table[0].extra1 = net;
 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
-- 
cgit v1.2.2


From 0a93ea2e897bd793cc0aaaddc397eff32ac8d6fe Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Fri, 25 Feb 2011 15:33:17 +0000
Subject: RxRPC: Allocate tokens with kzalloc to avoid oops in rxrpc_destroy

With slab poisoning enabled, I see the following oops:

  Unable to handle kernel paging request for data at address 0x6b6b6b6b6b6b6b73
  ...
  NIP [c0000000006bc61c] .rxrpc_destroy+0x44/0x104
  LR [c0000000006bc618] .rxrpc_destroy+0x40/0x104
  Call Trace:
  [c0000000feb2bc00] [c0000000006bc618] .rxrpc_destroy+0x40/0x104 (unreliable)
  [c0000000feb2bc90] [c000000000349b2c] .key_cleanup+0x1a8/0x20c
  [c0000000feb2bd40] [c0000000000a2920] .process_one_work+0x2f4/0x4d0
  [c0000000feb2be00] [c0000000000a2d50] .worker_thread+0x254/0x468
  [c0000000feb2bec0] [c0000000000a868c] .kthread+0xbc/0xc8
  [c0000000feb2bf90] [c000000000020e00] .kernel_thread+0x54/0x70

We aren't initialising token->next, but the code in destroy_context relies
on the list being NULL terminated. Use kzalloc to zero out all the fields.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/rxrpc/ar-key.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 5ee16f0353fe..d763793d39de 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -89,11 +89,11 @@ static int rxrpc_instantiate_xdr_rxkad(struct key *key, const __be32 *xdr,
 		return ret;
 
 	plen -= sizeof(*token);
-	token = kmalloc(sizeof(*token), GFP_KERNEL);
+	token = kzalloc(sizeof(*token), GFP_KERNEL);
 	if (!token)
 		return -ENOMEM;
 
-	token->kad = kmalloc(plen, GFP_KERNEL);
+	token->kad = kzalloc(plen, GFP_KERNEL);
 	if (!token->kad) {
 		kfree(token);
 		return -ENOMEM;
@@ -731,10 +731,10 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen)
 		goto error;
 
 	ret = -ENOMEM;
-	token = kmalloc(sizeof(*token), GFP_KERNEL);
+	token = kzalloc(sizeof(*token), GFP_KERNEL);
 	if (!token)
 		goto error;
-	token->kad = kmalloc(plen, GFP_KERNEL);
+	token->kad = kzalloc(plen, GFP_KERNEL);
 	if (!token->kad)
 		goto error_free;
 
-- 
cgit v1.2.2


From 5aca1a9e880e06bb7e5fd553a86a330ae7e218b5 Mon Sep 17 00:00:00 2001
From: Hagen Paul Pfeifer <hagen@jauu.net>
Date: Fri, 25 Feb 2011 13:58:54 -0800
Subject: net: handle addr_type of 0 properly

addr_type of 0 means that the type should be adopted from from_dev and
not from __hw_addr_del_multiple(). Unfortunately it isn't so and
addr_type will always be considered. Fix this by implementing the
considered and documented behavior.

Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev_addr_lists.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 508f9c18992f..133fd22ea287 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -144,7 +144,7 @@ void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
 
 	list_for_each_entry(ha, &from_list->list, list) {
 		type = addr_type ? addr_type : ha->type;
-		__hw_addr_del(to_list, ha->addr, addr_len, addr_type);
+		__hw_addr_del(to_list, ha->addr, addr_len, type);
 	}
 }
 EXPORT_SYMBOL(__hw_addr_del_multiple);
-- 
cgit v1.2.2


From b44d211e166b4b0dae8ce379f9d2e3ac164b5b60 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Mon, 21 Feb 2011 02:40:47 +0000
Subject: netlink: handle errors from netlink_dump()

netlink_dump() may failed, but nobody handle its error.
It generates output data, when a previous portion has been returned to
user space. This mechanism works when all data isn't go in skb. If we
enter in netlink_recvmsg() and skb is absent in the recv queue, the
netlink_dump() will not been executed. So if netlink_dump() is failed
one time, the new data never appear and the reader will sleep forever.

netlink_dump() is called from two places:

1. from netlink_sendmsg->...->netlink_dump_start().
   In this place we can report error directly and it will be returned
   by sendmsg().

2. from netlink_recvmsg
   There we can't report error directly, because we have a portion of
   valid output data and call netlink_dump() for prepare the next portion.
   If netlink_dump() is failed, the socket will be mark as error and the
   next recvmsg will be failed.

Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 478181d53c55..1f924595bdef 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1407,7 +1407,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	int noblock = flags&MSG_DONTWAIT;
 	size_t copied;
 	struct sk_buff *skb, *data_skb;
-	int err;
+	int err, ret;
 
 	if (flags&MSG_OOB)
 		return -EOPNOTSUPP;
@@ -1470,8 +1470,13 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 
 	skb_free_datagram(sk, skb);
 
-	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
-		netlink_dump(sk);
+	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
+		ret = netlink_dump(sk);
+		if (ret) {
+			sk->sk_err = ret;
+			sk->sk_error_report(sk);
+		}
+	}
 
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
@@ -1736,6 +1741,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	struct netlink_callback *cb;
 	struct sock *sk;
 	struct netlink_sock *nlk;
+	int ret;
 
 	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
 	if (cb == NULL)
@@ -1764,9 +1770,13 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	nlk->cb = cb;
 	mutex_unlock(nlk->cb_mutex);
 
-	netlink_dump(sk);
+	ret = netlink_dump(sk);
+
 	sock_put(sk);
 
+	if (ret)
+		return ret;
+
 	/* We successfully started a dump, by returning -EINTR we
 	 * signal not to send ACK even if it was requested.
 	 */
-- 
cgit v1.2.2


From ff75f40f44ae9b79d520bf32a05d35af74a805c0 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Tue, 22 Feb 2011 10:40:25 +0200
Subject: ipvs: fix dst_lock locking on dest update

	Fix dst_lock usage in __ip_vs_update_dest. We need
_bh locking because destination is updated in user context.
Can cause lockups on frequent destination updates.
Problem reported by Simon Kirby. Bug was introduced
in 2.6.37 from the "ipvs: changes for local real server"
change.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 22f7ad5101ab..ba98e1308f3c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -808,9 +808,9 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	dest->u_threshold = udest->u_threshold;
 	dest->l_threshold = udest->l_threshold;
 
-	spin_lock(&dest->dst_lock);
+	spin_lock_bh(&dest->dst_lock);
 	ip_vs_dst_reset(dest);
-	spin_unlock(&dest->dst_lock);
+	spin_unlock_bh(&dest->dst_lock);
 
 	if (add)
 		ip_vs_new_estimator(&dest->stats);
-- 
cgit v1.2.2


From 720dc34bbbe9493c7bd48b2243058b4e447a929d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Tue, 1 Mar 2011 23:02:07 -0800
Subject: dccp: fix oops on Reset after close

This fixes a bug in the order of dccp_rcv_state_process() that still permitted
reception even after closing the socket. A Reset after close thus causes a NULL
pointer dereference by not preventing operations on an already torn-down socket.

 dccp_v4_do_rcv()
	|
	| state other than OPEN
	v
 dccp_rcv_state_process()
	|
	| DCCP_PKT_RESET
	v
 dccp_rcv_reset()
	|
	v
 dccp_time_wait()

 WARNING: at net/ipv4/inet_timewait_sock.c:141 __inet_twsk_hashdance+0x48/0x128()
 Modules linked in: arc4 ecb carl9170 rt2870sta(C) mac80211 r8712u(C) crc_ccitt ah
 [<c0038850>] (unwind_backtrace+0x0/0xec) from [<c0055364>] (warn_slowpath_common)
 [<c0055364>] (warn_slowpath_common+0x4c/0x64) from [<c0055398>] (warn_slowpath_n)
 [<c0055398>] (warn_slowpath_null+0x1c/0x24) from [<c02b72d0>] (__inet_twsk_hashd)
 [<c02b72d0>] (__inet_twsk_hashdance+0x48/0x128) from [<c031caa0>] (dccp_time_wai)
 [<c031caa0>] (dccp_time_wait+0x40/0xc8) from [<c031c15c>] (dccp_rcv_state_proces)
 [<c031c15c>] (dccp_rcv_state_process+0x120/0x538) from [<c032609c>] (dccp_v4_do_)
 [<c032609c>] (dccp_v4_do_rcv+0x11c/0x14c) from [<c0286594>] (release_sock+0xac/0)
 [<c0286594>] (release_sock+0xac/0x110) from [<c031fd34>] (dccp_close+0x28c/0x380)
 [<c031fd34>] (dccp_close+0x28c/0x380) from [<c02d9a78>] (inet_release+0x64/0x70)

The fix is by testing the socket state first. Receiving a packet in Closed state
now also produces the required "No connection" Reset reply of RFC 4340, 8.3.1.

Reported-and-tested-by: Johan Hovold <jhovold@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/input.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 8cde009e8b85..4222e7a654b0 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -614,6 +614,9 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		/* Caller (dccp_v4_do_rcv) will send Reset */
 		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
 		return 1;
+	} else if (sk->sk_state == DCCP_CLOSED) {
+		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+		return 1;
 	}
 
 	if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) {
@@ -668,10 +671,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	}
 
 	switch (sk->sk_state) {
-	case DCCP_CLOSED:
-		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
-		return 1;
-
 	case DCCP_REQUESTING:
 		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
 		if (queued >= 0)
-- 
cgit v1.2.2


From 9ef0298a8e5730d9a46d640014c727f3b4152870 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 2 Mar 2011 12:10:13 +0100
Subject: netfilter: nf_log: avoid oops in (un)bind with invalid nfproto values

Like many other places, we have to check that the array index is
within allowed limits, or otherwise, a kernel oops and other nastiness
can ensue when we access memory beyond the end of the array.

[ 5954.115381] BUG: unable to handle kernel paging request at 0000004000000000
[ 5954.120014] IP:  __find_logger+0x6f/0xa0
[ 5954.123979]  nf_log_bind_pf+0x2b/0x70
[ 5954.123979]  nfulnl_recv_config+0xc0/0x4a0 [nfnetlink_log]
[ 5954.123979]  nfnetlink_rcv_msg+0x12c/0x1b0 [nfnetlink]
...

The problem goes back to v2.6.30-rc1~1372~1342~31 where nf_log_bind
was decoupled from nf_log_register.

Reported-by: Miguel Di Ciurcio Filho <miguel.filho@gmail.com>,
  via irc.freenode.net/#netfilter
Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_log.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index b07393eab88e..91816998ed86 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -85,6 +85,8 @@ EXPORT_SYMBOL(nf_log_unregister);
 
 int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger)
 {
+	if (pf >= ARRAY_SIZE(nf_loggers))
+		return -EINVAL;
 	mutex_lock(&nf_log_mutex);
 	if (__find_logger(pf, logger->name) == NULL) {
 		mutex_unlock(&nf_log_mutex);
@@ -98,6 +100,8 @@ EXPORT_SYMBOL(nf_log_bind_pf);
 
 void nf_log_unbind_pf(u_int8_t pf)
 {
+	if (pf >= ARRAY_SIZE(nf_loggers))
+		return;
 	mutex_lock(&nf_log_mutex);
 	rcu_assign_pointer(nf_loggers[pf], NULL);
 	mutex_unlock(&nf_log_mutex);
-- 
cgit v1.2.2


From f3d7bc57c71eba3f279785111bb473b1ef68dcb6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Wed, 2 Mar 2011 10:35:33 +0000
Subject: net: dcbnl: check correct ops in dcbnl_ieee_set()

The incorrect ops routine was being tested for in
DCB_ATTR_IEEE_PFC attributes. This patch corrects
it.

Currently, every driver implementing ieee_setets also
implements ieee_setpfc so this bug is not actualized
yet.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dcb/dcbnl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index d5074a567289..c44348adba3b 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1193,7 +1193,7 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb,
 			goto err;
 	}
 
-	if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setets) {
+	if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
 		struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
 		err = ops->ieee_setpfc(netdev, pfc);
 		if (err)
-- 
cgit v1.2.2


From 10003453479ef287a73f8a39593f8f42687ea565 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 28 Feb 2011 03:27:43 +0000
Subject: AF_RXRPC: Handle receiving ACKALL packets

The OpenAFS server is now sending ACKALL packets, so we need to handle them.
Otherwise we report a protocol error and abort.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/ar-input.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 89315009bab1..1a2b0633fece 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -423,6 +423,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
 			goto protocol_error;
 		}
 
+	case RXRPC_PACKET_TYPE_ACKALL:
 	case RXRPC_PACKET_TYPE_ACK:
 		/* ACK processing is done in process context */
 		read_lock_bh(&call->state_lock);
-- 
cgit v1.2.2


From 38815b780285a4957852c5c9dbe94991c0b26c56 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 2 Mar 2011 16:55:21 -0800
Subject: libceph: fix handling of short returns from get_user_pages

get_user_pages() can return fewer pages than we ask for.  We were returning
a bogus pointer/error code in that case.  Instead, loop until we get all
the pages we want or get an error we can return to the caller.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/pagevec.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 1a040e64c69f..cd9c21df87d1 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -16,22 +16,30 @@ struct page **ceph_get_direct_page_vector(const char __user *data,
 					  int num_pages, bool write_page)
 {
 	struct page **pages;
-	int rc;
+	int got = 0;
+	int rc = 0;
 
 	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 
 	down_read(&current->mm->mmap_sem);
-	rc = get_user_pages(current, current->mm, (unsigned long)data,
-			    num_pages, write_page, 0, pages, NULL);
+	while (got < num_pages) {
+		rc = get_user_pages(current, current->mm,
+		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
+		    num_pages - got, write_page, 0, pages + got, NULL);
+		if (rc < 0)
+			break;
+		BUG_ON(rc == 0);
+		got += rc;
+	}
 	up_read(&current->mm->mmap_sem);
-	if (rc < num_pages)
+	if (rc < 0)
 		goto fail;
 	return pages;
 
 fail:
-	ceph_put_page_vector(pages, rc > 0 ? rc : 0, false);
+	ceph_put_page_vector(pages, got, false);
 	return ERR_PTR(rc);
 }
 EXPORT_SYMBOL(ceph_get_direct_page_vector);
-- 
cgit v1.2.2


From 692d20f576fb26f62c83f80dbf3ea899998391b7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 3 Mar 2011 12:14:53 -0800
Subject: libceph: retry after authorization failure

If we mark the connection CLOSED we will give up trying to reconnect to
this server instance.  That is appropriate for things like a protocol
version mismatch that won't change until the server is restarted, at which
point we'll get a new addr and reconnect.  An authorization failure like
this is probably due to the server not properly rotating it's secret keys,
however, and should be treated as transient so that the normal backoff and
retry behavior kicks in.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 35b36b86d762..6bd5025f6220 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1248,8 +1248,6 @@ static int process_connect(struct ceph_connection *con)
 		     con->auth_retry);
 		if (con->auth_retry == 2) {
 			con->error_msg = "connect authorization failure";
-			reset_connection(con);
-			set_bit(CLOSED, &con->state);
 			return -1;
 		}
 		con->auth_retry = 1;
-- 
cgit v1.2.2


From 1362fa078dae16776cd439791c6605b224ea6171 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 3 Mar 2011 11:28:58 +0000
Subject: DNS: Fix a NULL pointer deref when trying to read an error key
 [CVE-2011-1076]

When a DNS resolver key is instantiated with an error indication, attempts to
read that key will result in an oops because user_read() is expecting there to
be a payload - and there isn't one [CVE-2011-1076].

Give the DNS resolver key its own read handler that returns the error cached in
key->type_data.x[0] as an error rather than crashing.

Also make the kenter() at the beginning of dns_resolver_instantiate() limit the
amount of data it prints, since the data is not necessarily NUL-terminated.

The buggy code was added in:

	commit 4a2d789267e00b5a1175ecd2ddefcc78b83fbf09
	Author: Wang Lei <wang840925@gmail.com>
	Date:   Wed Aug 11 09:37:58 2010 +0100
	Subject: DNS: If the DNS server returns an error, allow that to be cached [ver #2]

This can trivially be reproduced by any user with the following program
compiled with -lkeyutils:

	#include <stdlib.h>
	#include <keyutils.h>
	#include <err.h>
	static char payload[] = "#dnserror=6";
	int main()
	{
		key_serial_t key;
		key = add_key("dns_resolver", "a", payload, sizeof(payload),
			      KEY_SPEC_SESSION_KEYRING);
		if (key == -1)
			err(1, "add_key");
		if (keyctl_read(key, NULL, 0) == -1)
			err(1, "read_key");
		return 0;
	}

What should happen is that keyctl_read() reports error 6 (ENXIO) to the user:

	dns-break: read_key: No such device or address

but instead the kernel oopses.

This cannot be reproduced with the 'keyutils add' or 'keyutils padd' commands
as both of those cut the data down below the NUL termination that must be
included in the data.  Without this dns_resolver_instantiate() will return
-EINVAL and the key will not be instantiated such that it can be read.

The oops looks like:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
IP: [<ffffffff811b99f7>] user_read+0x4f/0x8f
PGD 3bdf8067 PUD 385b9067 PMD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:00/0000:00:19.0/irq
CPU 0
Modules linked in:

Pid: 2150, comm: dns-break Not tainted 2.6.38-rc7-cachefs+ #468                  /DG965RY
RIP: 0010:[<ffffffff811b99f7>]  [<ffffffff811b99f7>] user_read+0x4f/0x8f
RSP: 0018:ffff88003bf47f08  EFLAGS: 00010246
RAX: 0000000000000001 RBX: ffff88003b5ea378 RCX: ffffffff81972368
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88003b5ea378
RBP: ffff88003bf47f28 R08: ffff88003be56620 R09: 0000000000000000
R10: 0000000000000395 R11: 0000000000000002 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: ffffffffffffffa1
FS:  00007feab5751700(0000) GS:ffff88003e000000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000010 CR3: 000000003de40000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process dns-break (pid: 2150, threadinfo ffff88003bf46000, task ffff88003be56090)
Stack:
 ffff88003b5ea378 ffff88003b5ea3a0 0000000000000000 0000000000000000
 ffff88003bf47f68 ffffffff811b708e ffff88003c442bc8 0000000000000000
 00000000004005a0 00007fffba368060 0000000000000000 0000000000000000
Call Trace:
 [<ffffffff811b708e>] keyctl_read_key+0xac/0xcf
 [<ffffffff811b7c07>] sys_keyctl+0x75/0xb6
 [<ffffffff81001f7b>] system_call_fastpath+0x16/0x1b
Code: 75 1f 48 83 7b 28 00 75 18 c6 05 58 2b fb 00 01 be bb 00 00 00 48 c7 c7 76 1c 75 81 e8 13 c2 e9 ff 4c 8b b3 e0 00 00 00 4d 85 ed <41> 0f b7 5e 10 74 2d 4d 85 e4 74 28 e8 98 79 ee ff 49 39 dd 48
RIP  [<ffffffff811b99f7>] user_read+0x4f/0x8f
 RSP <ffff88003bf47f08>
CR2: 0000000000000010

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
cc: Wang Lei <wang840925@gmail.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 net/dns_resolver/dns_key.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 739435a6af39..cfa7a5e1c5c9 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -67,8 +67,9 @@ dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
 	size_t result_len = 0;
 	const char *data = _data, *end, *opt;
 
-	kenter("%%%d,%s,'%s',%zu",
-	       key->serial, key->description, data, datalen);
+	kenter("%%%d,%s,'%*.*s',%zu",
+	       key->serial, key->description,
+	       (int)datalen, (int)datalen, data, datalen);
 
 	if (datalen <= 1 || !data || data[datalen - 1] != '\0')
 		return -EINVAL;
@@ -217,6 +218,19 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m)
 		seq_printf(m, ": %u", key->datalen);
 }
 
+/*
+ * read the DNS data
+ * - the key's semaphore is read-locked
+ */
+static long dns_resolver_read(const struct key *key,
+			      char __user *buffer, size_t buflen)
+{
+	if (key->type_data.x[0])
+		return key->type_data.x[0];
+
+	return user_read(key, buffer, buflen);
+}
+
 struct key_type key_type_dns_resolver = {
 	.name		= "dns_resolver",
 	.instantiate	= dns_resolver_instantiate,
@@ -224,7 +238,7 @@ struct key_type key_type_dns_resolver = {
 	.revoke		= user_revoke,
 	.destroy	= user_destroy,
 	.describe	= dns_resolver_describe,
-	.read		= user_read,
+	.read		= dns_resolver_read,
 };
 
 static int __init init_dns_resolver(void)
-- 
cgit v1.2.2


From 60bf8bf8815e6adea4c1d0423578c3b8000e2ec8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Mar 2011 12:24:28 -0800
Subject: libceph: fix msgr backoff

With commit f363e45f we replaced a bunch of hacky workqueue mutual
exclusion logic with the WQ_NON_REENTRANT flag.  One pieces of fallout is
that the exponential backoff breaks in certain cases:

 * con_work attempts to connect.
 * we get an immediate failure, and the socket state change handler queues
   immediate work.
 * con_work calls con_fault, we decide to back off, but can't queue delayed
   work.

In this case, we add a BACKOFF bit to make con_work reschedule delayed work
next time it runs (which should be immediately).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 6bd5025f6220..46fbc422ba74 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1949,6 +1949,19 @@ static void con_work(struct work_struct *work)
 						   work.work);
 
 	mutex_lock(&con->mutex);
+	if (test_and_clear_bit(BACKOFF, &con->state)) {
+		dout("con_work %p backing off\n", con);
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay))) {
+			dout("con_work %p backoff %lu\n", con, con->delay);
+			mutex_unlock(&con->mutex);
+			return;
+		} else {
+			con->ops->put(con);
+			dout("con_work %p FAILED to back off %lu\n", con,
+			     con->delay);
+		}
+	}
 
 	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
 		dout("con_work CLOSED\n");
@@ -2017,11 +2030,24 @@ static void ceph_fault(struct ceph_connection *con)
 			con->delay = BASE_DELAY_INTERVAL;
 		else if (con->delay < MAX_DELAY_INTERVAL)
 			con->delay *= 2;
-		dout("fault queueing %p delay %lu\n", con, con->delay);
 		con->ops->get(con);
 		if (queue_delayed_work(ceph_msgr_wq, &con->work,
-				       round_jiffies_relative(con->delay)) == 0)
+				       round_jiffies_relative(con->delay))) {
+			dout("fault queued %p delay %lu\n", con, con->delay);
+		} else {
 			con->ops->put(con);
+			dout("fault failed to queue %p delay %lu, backoff\n",
+			     con, con->delay);
+			/*
+			 * In many cases we see a socket state change
+			 * while con_work is running and end up
+			 * queuing (non-delayed) work, such that we
+			 * can't backoff with a delay.  Set a flag so
+			 * that when con_work restarts we schedule the
+			 * delay then.
+			 */
+			set_bit(BACKOFF, &con->state);
+		}
 	}
 
 out_unlock:
-- 
cgit v1.2.2


From e76661d0a59e53e5cc4dccbe4b755d1dc8a968ec Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 3 Mar 2011 10:10:15 -0800
Subject: libceph: fix msgr keepalive flag

There was some broken keepalive code using a dead variable.  Shift to using
the proper bit flag.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 46fbc422ba74..3252ea974e8f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -336,7 +336,6 @@ static void reset_connection(struct ceph_connection *con)
 		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
 	}
-	con->out_keepalive_pending = false;
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
 }
@@ -2019,10 +2018,10 @@ static void ceph_fault(struct ceph_connection *con)
 	/* Requeue anything that hasn't been acked */
 	list_splice_init(&con->out_sent, &con->out_queue);
 
-	/* If there are no messages in the queue, place the connection
-	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
-	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
-		dout("fault setting STANDBY\n");
+	/* If there are no messages queued or keepalive pending, place
+	 * the connection in a STANDBY state */
+	if (list_empty(&con->out_queue) &&
+	    !test_bit(KEEPALIVE_PENDING, &con->state)) {
 		set_bit(STANDBY, &con->state);
 	} else {
 		/* retry after a delay. */
-- 
cgit v1.2.2


From e00de341fdb76c955703b4438100f9933c452b7f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Mar 2011 12:25:05 -0800
Subject: libceph: fix msgr standby handling

The standby logic used to be pretty dependent on the work requeueing
behavior that changed when we switched to WQ_NON_REENTRANT.  It was also
very fragile.

Restructure things so that:
 - We clear WRITE_PENDING when we set STANDBY.  This ensures we will
   requeue work when we wake up later.
 - con_work backs off if STANDBY is set.  There is nothing to do if we are
   in standby.
 - clear_standby() helper is called by both con_send() and con_keepalive(),
   the two actions that can wake us up again.  Move the connect_seq++
   logic here.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 3252ea974e8f..05f357828a2f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1712,14 +1712,6 @@ more:
 
 	/* open the socket first? */
 	if (con->sock == NULL) {
-		/*
-		 * if we were STANDBY and are reconnecting _this_
-		 * connection, bump connect_seq now.  Always bump
-		 * global_seq.
-		 */
-		if (test_and_clear_bit(STANDBY, &con->state))
-			con->connect_seq++;
-
 		prepare_write_banner(msgr, con);
 		prepare_write_connect(msgr, con, 1);
 		prepare_read_banner(con);
@@ -1962,6 +1954,10 @@ static void con_work(struct work_struct *work)
 		}
 	}
 
+	if (test_bit(STANDBY, &con->state)) {
+		dout("con_work %p STANDBY\n", con);
+		goto done;
+	}
 	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
 		dout("con_work CLOSED\n");
 		con_close_socket(con);
@@ -2022,6 +2018,8 @@ static void ceph_fault(struct ceph_connection *con)
 	 * the connection in a STANDBY state */
 	if (list_empty(&con->out_queue) &&
 	    !test_bit(KEEPALIVE_PENDING, &con->state)) {
+		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+		clear_bit(WRITE_PENDING, &con->state);
 		set_bit(STANDBY, &con->state);
 	} else {
 		/* retry after a delay. */
@@ -2117,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
 }
 EXPORT_SYMBOL(ceph_messenger_destroy);
 
+static void clear_standby(struct ceph_connection *con)
+{
+	/* come back from STANDBY? */
+	if (test_and_clear_bit(STANDBY, &con->state)) {
+		mutex_lock(&con->mutex);
+		dout("clear_standby %p and ++connect_seq\n", con);
+		con->connect_seq++;
+		WARN_ON(test_bit(WRITE_PENDING, &con->state));
+		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
+		mutex_unlock(&con->mutex);
+	}
+}
+
 /*
  * Queue up an outgoing message on the given connection.
  */
@@ -2149,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
+	clear_standby(con);
 	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
 		queue_con(con);
 }
@@ -2214,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
  */
 void ceph_con_keepalive(struct ceph_connection *con)
 {
+	dout("con_keepalive %p\n", con);
+	clear_standby(con);
 	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
 	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
 		queue_con(con);
-- 
cgit v1.2.2