From 98bdb0aa007ff7e8e0061936d8d0e210faf2e655 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Jan 2011 08:17:48 -0800 Subject: libceph: fix socket read error handling If we get EAGAIN when trying to read from the socket, it is not an error. Return 0 from the helper in this case to simplify the error handling cases in the caller (indirectly, try_read). Fix try_read to pass any error to it's caller (con_work) instead of almost always returning 0. This let's us respond to things like socket disconnects. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index dff633d62e5b..d95576d40c98 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -252,8 +252,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) { struct kvec iov = {buf, len}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; - return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; } /* @@ -1821,19 +1825,17 @@ more: dout("try_read connecting\n"); ret = read_partial_banner(con); if (ret <= 0) - goto done; - if (process_banner(con) < 0) { - ret = -1; goto out; - } + ret = process_banner(con); + if (ret < 0) + goto out; } ret = read_partial_connect(con); if (ret <= 0) - goto done; - if (process_connect(con) < 0) { - ret = -1; goto out; - } + ret = process_connect(con); + if (ret < 0) + goto out; goto more; } @@ -1848,7 +1850,7 @@ more: dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); ret = ceph_tcp_recvmsg(con->sock, buf, skip); if (ret <= 0) - goto done; + goto out; con->in_base_pos += ret; if (con->in_base_pos) goto more; @@ -1859,7 +1861,7 @@ more: */ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); if (ret <= 0) - goto done; + goto out; dout("try_read got tag %d\n", (int)con->in_tag); switch (con->in_tag) { case CEPH_MSGR_TAG_MSG: @@ -1870,7 +1872,7 @@ more: break; case CEPH_MSGR_TAG_CLOSE: set_bit(CLOSED, &con->state); /* fixme */ - goto done; + goto out; default: goto bad_tag; } @@ -1882,13 +1884,12 @@ more: case -EBADMSG: con->error_msg = "bad crc"; ret = -EIO; - goto out; + break; case -EIO: con->error_msg = "io error"; - goto out; - default: - goto done; + break; } + goto out; } if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; @@ -1898,15 +1899,13 @@ more: if (con->in_tag == CEPH_MSGR_TAG_ACK) { ret = read_partial_ack(con); if (ret <= 0) - goto done; + goto out; process_ack(con); goto more; } -done: - ret = 0; out: - dout("try_read done on %p\n", con); + dout("try_read done on %p ret %d\n", con, ret); return ret; bad_tag: -- cgit v1.2.2 From 42961d2333a1855c649fa3790e258ab4f0fa66a4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 25 Jan 2011 08:19:34 -0800 Subject: libceph: fix socket write error handling Pass errors from writing to the socket up the stack. If we get -EAGAIN, return 0 from the helper to simplify the callers' checks. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d95576d40c98..35b36b86d762 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -268,13 +268,17 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, size_t kvlen, size_t len, int more) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; if (more) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - return kernel_sendmsg(sock, &msg, iov, kvlen, len); + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; } @@ -851,6 +855,8 @@ static int write_partial_msg_pages(struct ceph_connection *con) (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); + if (ret == -EAGAIN) + ret = 0; if (ret <= 0) goto out; @@ -1741,16 +1747,12 @@ more_kvec: if (con->out_skip) { ret = write_partial_skip(con); if (ret <= 0) - goto done; - if (ret < 0) { - dout("try_write write_partial_skip err %d\n", ret); - goto done; - } + goto out; } if (con->out_kvec_left) { ret = write_partial_kvec(con); if (ret <= 0) - goto done; + goto out; } /* msg pages? */ @@ -1765,11 +1767,11 @@ more_kvec: if (ret == 1) goto more_kvec; /* we need to send the footer, too! */ if (ret == 0) - goto done; + goto out; if (ret < 0) { dout("try_write write_partial_msg_pages err %d\n", ret); - goto done; + goto out; } } @@ -1793,10 +1795,9 @@ do_next: /* Nothing to do! */ clear_bit(WRITE_PENDING, &con->state); dout("try_write nothing else to write.\n"); -done: ret = 0; out: - dout("try_write done on %p\n", con); + dout("try_write done on %p ret %d\n", con, ret); return ret; } -- cgit v1.2.2 From e733fb62082b3b187870dfba28d5f6730b8436c4 Mon Sep 17 00:00:00 2001 From: Bao Liang Date: Sat, 29 Jan 2011 21:39:37 +0800 Subject: Bluetooth: Set conn state to BT_DISCONN to avoid multiple responses This patch fixes a minor issue that two connection responses will be sent for one L2CAP connection request. If the L2CAP connection request is first blocked due to security reason and responded with reason "security block", the state of the connection remains BT_CONNECT2. If a pairing procedure completes successfully before the ACL connection is down, local host will send another connection complete response. See the following packets captured by hcidump. 2010-12-07 22:21:24.928096 < ACL data: handle 12 flags 0x00 dlen 16 0000: 0c 00 01 00 03 19 08 00 41 00 53 00 03 00 00 00 ........A.S..... ... ... 2010-12-07 22:21:35.791747 > HCI Event: Auth Complete (0x06) plen 3 status 0x00 handle 12 ... ... 2010-12-07 22:21:35.872372 > ACL data: handle 12 flags 0x02 dlen 16 L2CAP(s): Connect rsp: dcid 0x0054 scid 0x0040 result 0 status 0 Connection successful Signed-off-by: Liang Bao Acked-by: Ville Tervo Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 7550abb0c96a..675614e38e14 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -859,6 +859,7 @@ static void __l2cap_sock_close(struct sock *sk, int reason) result = L2CAP_CR_SEC_BLOCK; else result = L2CAP_CR_BAD_PSM; + sk->sk_state = BT_DISCONN; rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); -- cgit v1.2.2 From a7b545f7fe753ca3dc1b51ca57f90cd59d974e44 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 8 Feb 2011 18:43:19 +0200 Subject: mac80211: add missing locking in ieee80211_reconfig When suspending an associated system, and then resuming, the station vif is being reconfigured without taking the sdata->u.mgd.mtx lock, which results in the following warning: WARNING: at net/mac80211/mlme.c:101 ieee80211_ap_probereq_get+0x58/0xb8 [mac80211]() Modules linked in: wl12xx_sdio wl12xx firmware_class crc7 mac80211 cfg80211 [last unloaded: crc7] Backtrace: [] (dump_backtrace+0x0/0x118) from [] (dump_stack+0x20/0x24) r7:00000000 r6:bf12d6ec r5:bf154aac r4:00000065 [] (dump_stack+0x0/0x24) from [] (warn_slowpath_common+0x5c/0x74) [] (warn_slowpath_common+0x0/0x74) from [] (warn_slowpath_null+0x2c/0x34) r9:000024ff r8:cd006460 r7:00000001 r6:00000000 r5:00000000 r4:cf1394a0 [] (warn_slowpath_null+0x0/0x34) from [] (ieee80211_ap_probereq_get+0x58/0xb8 [mac80211]) [] (ieee80211_ap_probereq_get+0x0/0xb8 [mac80211]) from [] (wl1271_cmd_build_ap_probe_req+0x30/0xf8 [wl12xx]) r4:cd007440 [] (wl1271_cmd_build_ap_probe_req+0x0/0xf8 [wl12xx]) from [] (wl1271_op_bss_info_changed+0x4c4/0x808 [wl12xx]) r5:cd007440 r4:000003b4 [] (wl1271_op_bss_info_changed+0x0/0x808 [wl12xx]) from [] (ieee80211_bss_info_change_notify+0x1a4/0x1f8 [mac80211]) [] (ieee80211_bss_info_change_notify+0x0/0x1f8 [mac80211]) from [] (ieee80211_reconfig+0x4d0/0x668 [mac80211]) r8:cf0eeea4 r7:cd00671c r6:00000000 r5:cd006460 r4:cf1394a0 [] (ieee80211_reconfig+0x0/0x668 [mac80211]) from [] (ieee80211_resume+0x60/0x70 [mac80211]) [] (ieee80211_resume+0x0/0x70 [mac80211]) from [] (wiphy_resume+0x6c/0x7c [cfg80211]) r5:cd006248 r4:cd006110 [] (wiphy_resume+0x0/0x7c [cfg80211]) from [] (legacy_resume+0x38/0x70) r7:00000000 r6:00000000 r5:cd006248 r4:cd0062fc [] (legacy_resume+0x0/0x70) from [] (device_resume+0x168/0x1a0) r8:c04ca8d8 r7:cd00627c r6:00000010 r5:cd006248 r4:cd0062fc [] (device_resume+0x0/0x1a0) from [] (dpm_resume_end+0xf8/0x3bc) r7:00000000 r6:00000005 r5:cd006248 r4:cd0062fc [] (dpm_resume_end+0x0/0x3bc) from [] (suspend_devices_and_enter+0x1b0/0x204) [] (suspend_devices_and_enter+0x0/0x204) from [] (enter_state+0xf0/0x148) r7:c037e978 r6:00000003 r5:c043d807 r4:00000000 [] (enter_state+0x0/0x148) from [] (state_store+0xa4/0xcc) r7:c037e978 r6:00000003 r5:00000003 r4:c043d807 [] (state_store+0x0/0xcc) from [] (kobj_attr_store+0x20/0x24) [] (kobj_attr_store+0x0/0x24) from [] (sysfs_write_file+0x11c/0x150) [] (sysfs_write_file+0x0/0x150) from [] (vfs_write+0xc0/0x14c) [] (vfs_write+0x0/0x14c) from [] (sys_write+0x4c/0x78) r8:40126000 r7:00000004 r6:cf1a7c80 r5:00000000 r4:00000000 [] (sys_write+0x0/0x78) from [] (ret_fast_syscall+0x0/0x30) r8:c00502c8 r7:00000004 r6:403525e8 r5:40126000 r4:00000004 Signed-off-by: Eliad Peller Signed-off-by: John W. Linville --- net/mac80211/util.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index cf68700abffa..d036597aabbe 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1210,7 +1210,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: changed |= BSS_CHANGED_ASSOC; + mutex_lock(&sdata->u.mgd.mtx); ieee80211_bss_info_change_notify(sdata, changed); + mutex_unlock(&sdata->u.mgd.mtx); break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; -- cgit v1.2.2 From 0b150932197b185ad5816932912e648116c7a96a Mon Sep 17 00:00:00 2001 From: Hiroaki SHIMODA Date: Thu, 10 Feb 2011 23:08:33 -0800 Subject: xfrm: avoid possible oopse in xfrm_alloc_dst Commit 80c802f3073e84 (xfrm: cache bundles instead of policies for outgoing flows) introduced possible oopse when dst_alloc returns NULL. Signed-off-by: Hiroaki SHIMODA Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8b3ef404c794..6459588befc3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1340,10 +1340,13 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) default: BUG(); } - xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS); + xdst = dst_alloc(dst_ops); xfrm_policy_put_afinfo(afinfo); - xdst->flo.ops = &xfrm_bundle_fc_ops; + if (likely(xdst)) + xdst->flo.ops = &xfrm_bundle_fc_ops; + else + xdst = ERR_PTR(-ENOBUFS); return xdst; } -- cgit v1.2.2 From 946bf5ee3c46f73b5cbd52aab594697b1a132d1f Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 11 Feb 2011 11:21:57 -0800 Subject: ip_gre: Add IPPROTO_GRE to flowi in ipgre_tunnel_xmit Commit 5811662b15db018c740c57d037523683fd3e6123 ("net: use the macros defined for the members of flowi") accidentally removed the setting of IPPROTO_GRE from the struct flowi in ipgre_tunnel_xmit. This patch restores it. Signed-off-by: Steffen Klassert Acked-by: Changli Gao Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index eb68a0e34e49..6613edfac28c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -775,6 +775,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev .fl4_dst = dst, .fl4_src = tiph->saddr, .fl4_tos = RT_TOS(tos), + .proto = IPPROTO_GRE, .fl_gre_key = tunnel->parms.o_key }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { -- cgit v1.2.2 From 6b0d6a9b4296fa16a28d10d416db7a770fc03287 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 11 Feb 2011 12:36:55 +0000 Subject: bridge: Fix mglist corruption that leads to memory corruption The list mp->mglist is used to indicate whether a multicast group is active on the bridge interface itself as opposed to one of the constituent interfaces in the bridge. Unfortunately the operation that adds the mp->mglist node to the list neglected to check whether it has already been added. This leads to list corruption in the form of nodes pointing to itself. Normally this would be quite obvious as it would cause an infinite loop when walking the list. However, as this list is never actually walked (which means that we don't really need it, I'll get rid of it in a subsequent patch), this instead is hidden until we perform a delete operation on the affected nodes. As the same node may now be pointed to by more than one node, the delete operations can then cause modification of freed memory. This was observed in practice to cause corruption in 512-byte slabs, most commonly leading to crashes in jbd2. Thanks to Josef Bacik for pointing me in the right direction. Reported-by: Ian Page Hands Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f701a21acb34..fdbd41c76ec4 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -719,7 +719,8 @@ static int br_multicast_add_group(struct net_bridge *br, goto err; if (!port) { - hlist_add_head(&mp->mglist, &br->mglist); + if (hlist_unhashed(&mp->mglist)) + hlist_add_head(&mp->mglist, &br->mglist); mod_timer(&mp->timer, now + br->multicast_membership_interval); goto out; } -- cgit v1.2.2 From 24f9cdcbd743fd6adb8fb83688d8d86dcccde662 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 11 Feb 2011 12:42:07 +0000 Subject: bridge: Fix timer typo that may render snooping less effective In a couple of spots where we are supposed to modify the port group timer (p->timer) we instead modify the bridge interface group timer (mp->timer). The effect of this is mostly harmless. However, it can cause port subscriptions to be longer than they should be, thus making snooping less effective. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index fdbd41c76ec4..c558274051eb 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1178,7 +1178,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0) - mod_timer(&mp->timer, now + max_delay); + mod_timer(&p->timer, now + max_delay); } out: @@ -1249,7 +1249,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0) - mod_timer(&mp->timer, now + max_delay); + mod_timer(&p->timer, now + max_delay); } out: -- cgit v1.2.2 From 8a870178c0ad1bae9994c99bd01eb10c9903e616 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 12 Feb 2011 01:05:42 -0800 Subject: bridge: Replace mp->mglist hlist with a bool As it turns out we never need to walk through the list of multicast groups subscribed by the bridge interface itself (the only time we'd want to do that is when we shut down the bridge, in which case we simply walk through all multicast groups), we don't really need to keep an hlist for mp->mglist. This means that we can replace it with just a single bit to indicate whether the bridge interface is subscribed to a group. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_input.c | 2 +- net/bridge/br_multicast.c | 16 +++++++--------- net/bridge/br_private.h | 3 +-- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6f6d8e1b776f..88e4aa9cb1f9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -80,7 +80,7 @@ int br_handle_frame_finish(struct sk_buff *skb) if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb); if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) { - if ((mdst && !hlist_unhashed(&mdst->mglist)) || + if ((mdst && mdst->mglist) || br_multicast_is_router(br)) skb2 = skb; br_multicast_forward(mdst, skb, skb2); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index c558274051eb..09d5c0987925 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -232,8 +232,7 @@ static void br_multicast_group_expired(unsigned long data) if (!netif_running(br->dev) || timer_pending(&mp->timer)) goto out; - if (!hlist_unhashed(&mp->mglist)) - hlist_del_init(&mp->mglist); + mp->mglist = false; if (mp->ports) goto out; @@ -276,7 +275,7 @@ static void br_multicast_del_pg(struct net_bridge *br, del_timer(&p->query_timer); call_rcu_bh(&p->rcu, br_multicast_free_pg); - if (!mp->ports && hlist_unhashed(&mp->mglist) && + if (!mp->ports && !mp->mglist && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); @@ -528,7 +527,7 @@ static void br_multicast_group_query_expired(unsigned long data) struct net_bridge *br = mp->br; spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || hlist_unhashed(&mp->mglist) || + if (!netif_running(br->dev) || !mp->mglist || mp->queries_sent >= br->multicast_last_member_count) goto out; @@ -719,8 +718,7 @@ static int br_multicast_add_group(struct net_bridge *br, goto err; if (!port) { - if (hlist_unhashed(&mp->mglist)) - hlist_add_head(&mp->mglist, &br->mglist); + mp->mglist = true; mod_timer(&mp->timer, now + br->multicast_membership_interval); goto out; } @@ -1166,7 +1164,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay *= br->multicast_last_member_count; - if (!hlist_unhashed(&mp->mglist) && + if (mp->mglist && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) @@ -1237,7 +1235,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, goto out; max_delay *= br->multicast_last_member_count; - if (!hlist_unhashed(&mp->mglist) && + if (mp->mglist && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) @@ -1284,7 +1282,7 @@ static void br_multicast_leave_group(struct net_bridge *br, br->multicast_last_member_interval; if (!port) { - if (!hlist_unhashed(&mp->mglist) && + if (mp->mglist && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : try_to_del_timer_sync(&mp->timer) >= 0)) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 84aac7734bfc..4e1b620b6be6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -84,13 +84,13 @@ struct net_bridge_port_group { struct net_bridge_mdb_entry { struct hlist_node hlist[2]; - struct hlist_node mglist; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; struct rcu_head rcu; struct timer_list timer; struct timer_list query_timer; struct br_ip addr; + bool mglist; u32 queries_sent; }; @@ -238,7 +238,6 @@ struct net_bridge spinlock_t multicast_lock; struct net_bridge_mdb_htable __rcu *mdb; struct hlist_head router_list; - struct hlist_head mglist; struct timer_list multicast_router_timer; struct timer_list multicast_querier_timer; -- cgit v1.2.2 From 7ec79270d7de0c8ca602c47cb25a9652ec28f37f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 31 Jan 2011 12:00:59 +0000 Subject: net: dcb: application priority is per net_device The app_data priority may not be the same for all net devices. In order for stacks with application notifiers to identify the specific net device dcb_app_type should be passed in the ptr. This allows handlers to use dev_get_by_name() to pin priority to net devices. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 6b03f561caec..712ca026040a 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1613,6 +1613,10 @@ EXPORT_SYMBOL(dcb_getapp); u8 dcb_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type *itr; + struct dcb_app_type event; + + memcpy(&event.name, dev->name, sizeof(event.name)); + memcpy(&event.app, new, sizeof(event.app)); spin_lock(&dcb_lock); /* Search for existing match and replace */ @@ -1644,7 +1648,7 @@ u8 dcb_setapp(struct net_device *dev, struct dcb_app *new) } out: spin_unlock(&dcb_lock); - call_dcbevent_notifiers(DCB_APP_EVENT, new); + call_dcbevent_notifiers(DCB_APP_EVENT, &event); return 0; } EXPORT_SYMBOL(dcb_setapp); -- cgit v1.2.2 From d3337de52af7fb0ebe605b02b740be4ee7dee9eb Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 10 Feb 2011 11:57:16 +0000 Subject: Don't potentially dereference NULL in net/dcb/dcbnl.c:dcbnl_getapp() nla_nest_start() may return NULL. If it does then we'll blow up in nla_nest_end() when we dereference the pointer. Signed-off-by: Jesper Juhl Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 712ca026040a..d5074a567289 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -626,6 +626,9 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb, dcb->cmd = DCB_CMD_GAPP; app_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_APP); + if (!app_nest) + goto out_cancel; + ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_IDTYPE, idtype); if (ret) goto out_cancel; -- cgit v1.2.2 From de9963f0f2dfad128b26ae7bf6005f5948416a6d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 14 Feb 2011 17:35:07 +0100 Subject: netfilter: nf_iterate: fix incorrect RCU usage As noticed by Eric, nf_iterate doesn't use RCU correctly by accessing the prev pointer of a RCU protected list element when a verdict of NF_REPEAT is issued. Fix by jumping backwards to the hook invocation directly instead of loading the previous list element before continuing the list iteration. Reported-by: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/netfilter/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 32fcbe290c04..4aa614b8a96a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -133,6 +133,7 @@ unsigned int nf_iterate(struct list_head *head, /* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */ +repeat: verdict = elem->hook(hook, skb, indev, outdev, okfn); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG @@ -145,7 +146,7 @@ unsigned int nf_iterate(struct list_head *head, #endif if (verdict != NF_REPEAT) return verdict; - *i = (*i)->prev; + goto repeat; } } return NF_ACCEPT; -- cgit v1.2.2 From d11327ad6695db8117c78d70611e71102ceec2ac Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 11 Feb 2011 07:44:16 +0000 Subject: arp_notify: unconditionally send gratuitous ARP for NETDEV_NOTIFY_PEERS. NETDEV_NOTIFY_PEER is an explicit request by the driver to send a link notification while NETDEV_UP/NETDEV_CHANGEADDR generate link notifications as a sort of side effect. In the later cases the sysctl option is present because link notification events can have undesired effects e.g. if the link is flapping. I don't think this applies in the case of an explicit request from a driver. This patch makes NETDEV_NOTIFY_PEER unconditional, if preferred we could add a new sysctl for this case which defaults to on. This change causes Xen post-migration ARP notifications (which cause switches to relearn their MAC tables etc) to be sent by default. Signed-off-by: Ian Campbell Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 748cb5b337bd..df4616fce929 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1030,6 +1030,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu) return mtu >= 68; } +static void inetdev_send_gratuitous_arp(struct net_device *dev, + struct in_device *in_dev) + +{ + struct in_ifaddr *ifa = in_dev->ifa_list; + + if (!ifa) + return; + + arp_send(ARPOP_REQUEST, ETH_P_ARP, + ifa->ifa_address, dev, + ifa->ifa_address, NULL, + dev->dev_addr, NULL); +} + /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, @@ -1082,18 +1097,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } ip_mc_up(in_dev); /* fall through */ - case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEADDR: + if (!IN_DEV_ARP_NOTIFY(in_dev)) + break; + /* fall through */ + case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ - if (IN_DEV_ARP_NOTIFY(in_dev)) { - struct in_ifaddr *ifa = in_dev->ifa_list; - - if (ifa) - arp_send(ARPOP_REQUEST, ETH_P_ARP, - ifa->ifa_address, dev, - ifa->ifa_address, NULL, - dev->dev_addr, NULL); - } + inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); -- cgit v1.2.2 From 840af824b2bf9194ea596e0ddc7aa05066794ca1 Mon Sep 17 00:00:00 2001 From: Vladislav P Date: Mon, 14 Feb 2011 15:21:50 -0200 Subject: Bluetooth: Release BTM while sleeping to avoid deadlock Signed-off-by: Vladislav P Signed-off-by: Gustavo F. Padovan --- net/bluetooth/rfcomm/tty.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 2575c2db6404..d7b9af4703d0 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -727,7 +727,9 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) break; } + tty_unlock(); schedule(); + tty_lock(); } set_current_state(TASK_RUNNING); remove_wait_queue(&dev->wait, &wait); -- cgit v1.2.2 From d503b30bd648b3cb4e5f50b65d27e389960cc6d9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 17 Feb 2011 11:32:38 +0100 Subject: netfilter: tproxy: do not assign timewait sockets to skb->sk Assigning a socket in timewait state to skb->sk can trigger kernel oops, e.g. in nfnetlink_log, which does: if (skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) ... in the timewait case, accessing sk->sk_callback_lock and sk->sk_socket is invalid. Either all of these spots will need to add a test for sk->sk_state != TCP_TIME_WAIT, or xt_TPROXY must not assign a timewait socket to skb->sk. This does the latter. If a TW socket is found, assign the tproxy nfmark, but skip the skb->sk assignment, thus mimicking behaviour of a '-m socket .. -j MARK/ACCEPT' re-routing rule. The 'SYN to TW socket' case is left unchanged -- we try to redirect to the listener socket. Cc: Balazs Scheidler Cc: KOVACS Krisztian Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/netfilter/nf_tproxy_core.c | 27 ++++++++++++--------------- net/netfilter/xt_TPROXY.c | 22 ++++++++++++++++++++-- net/netfilter/xt_socket.c | 13 +++++++++++-- 3 files changed, 43 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index 4d87befb04c0..474d621cbc2e 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -28,26 +28,23 @@ nf_tproxy_destructor(struct sk_buff *skb) skb->destructor = NULL; if (sk) - nf_tproxy_put_sock(sk); + sock_put(sk); } /* consumes sk */ -int +void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) { - bool transparent = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_transparent : - inet_sk(sk)->transparent; - - if (transparent) { - skb_orphan(skb); - skb->sk = sk; - skb->destructor = nf_tproxy_destructor; - return 1; - } else - nf_tproxy_put_sock(sk); - - return 0; + /* assigning tw sockets complicates things; most + * skb->sk->X checks would have to test sk->sk_state first */ + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = nf_tproxy_destructor; } EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 640678f47a2a..dcfd57eb9d02 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -33,6 +33,20 @@ #include #include +static bool tproxy_sk_is_transparent(struct sock *sk) +{ + if (sk->sk_state != TCP_TIME_WAIT) { + if (inet_sk(sk)->transparent) + return true; + sock_put(sk); + } else { + if (inet_twsk(sk)->tw_transparent) + return true; + inet_twsk_put(inet_twsk(sk)); + } + return false; +} + static inline __be32 tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) { @@ -141,7 +155,7 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, skb->dev, NFT_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ - if (sk && nf_tproxy_assign_sock(skb, sk)) { + if (sk && tproxy_sk_is_transparent(sk)) { /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~mark_mask) ^ mark_value; @@ -149,6 +163,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", iph->protocol, &iph->daddr, ntohs(hp->dest), &laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); return NF_ACCEPT; } @@ -306,7 +322,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) par->in, NFT_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ - if (sk && nf_tproxy_assign_sock(skb, sk)) { + if (sk && tproxy_sk_is_transparent(sk)) { /* This should be in a separate target, but we don't do multiple targets on the same rule yet */ skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; @@ -314,6 +330,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", tproto, &iph->saddr, ntohs(hp->source), laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); return NF_ACCEPT; } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 00d6ae838303..9cc46356b577 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -35,6 +35,15 @@ #include #endif +static void +xt_socket_put_sk(struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put(inet_twsk(sk)); + else + sock_put(sk); +} + static int extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, @@ -164,7 +173,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, (sk->sk_state == TCP_TIME_WAIT && inet_twsk(sk)->tw_transparent)); - nf_tproxy_put_sock(sk); + xt_socket_put_sk(sk); if (wildcard || !transparent) sk = NULL; @@ -298,7 +307,7 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) (sk->sk_state == TCP_TIME_WAIT && inet_twsk(sk)->tw_transparent)); - nf_tproxy_put_sock(sk); + xt_socket_put_sk(sk); if (wildcard || !transparent) sk = NULL; -- cgit v1.2.2 From 0af320fb4627033e49cbc6e8138e7aa75ab8352a Mon Sep 17 00:00:00 2001 From: Joerg Marx Date: Thu, 17 Feb 2011 16:23:40 +0100 Subject: netfilter: ip6t_LOG: fix a flaw in printing the MAC The flaw was in skipping the second byte in MAC header due to increasing the pointer AND indexed access starting at '1'. Signed-off-by: Joerg Marx Signed-off-by: Patrick McHardy --- net/ipv6/netfilter/ip6t_LOG.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 09c88891a753..de338037a736 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -410,7 +410,7 @@ fallback: if (p != NULL) { sb_add(m, "%02x", *p++); for (i = 1; i < len; i++) - sb_add(m, ":%02x", p[i]); + sb_add(m, ":%02x", *p++); } sb_add(m, " "); -- cgit v1.2.2 From 214f45c91bbda8321d9676f1197238e4663edcbb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Feb 2011 11:39:01 -0800 Subject: net: provide default_advmss() methods to blackhole dst_ops Commit 0dbaee3b37e118a (net: Abstract default ADVMSS behind an accessor.) introduced a possible crash in tcp_connect_init(), when dst->default_advmss() is called from dst_metric_advmss() Reported-by: George Spelvin Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 1 + net/ipv6/route.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788a3e74834e..6ed6603c2f6d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2722,6 +2722,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, .default_mtu = ipv4_blackhole_default_mtu, + .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1c29f95695de..a998db6e7895 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -128,6 +128,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .destroy = ip6_dst_destroy, .check = ip6_dst_check, .default_mtu = ip6_blackhole_default_mtu, + .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, }; -- cgit v1.2.2 From f87e6f47933e3ebeced9bb12615e830a72cedce4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Feb 2011 22:54:38 +0000 Subject: net: dont leave active on stack LIST_HEAD Eric W. Biderman and Michal Hocko reported various memory corruptions that we suspected to be related to a LIST head located on stack, that was manipulated after thread left function frame (and eventually exited, so its stack was freed and reused). Eric Dumazet suggested the problem was probably coming from commit 443457242beb (net: factorize sync-rcu call in unregister_netdevice_many) This patch fixes __dev_close() and dev_close() to properly deinit their respective LIST_HEAD(single) before exiting. References: https://lkml.org/lkml/2011/2/16/304 References: https://lkml.org/lkml/2011/2/14/223 Reported-by: Michal Hocko Tested-by: Michal Hocko Reported-by: Eric W. Biderman Tested-by: Eric W. Biderman Signed-off-by: Linus Torvalds Signed-off-by: Eric Dumazet CC: Ingo Molnar CC: Octavian Purdila Signed-off-by: David S. Miller --- net/core/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8e726cb47ed7..a18c1643ea9f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1280,10 +1280,13 @@ static int __dev_close_many(struct list_head *head) static int __dev_close(struct net_device *dev) { + int retval; LIST_HEAD(single); list_add(&dev->unreg_list, &single); - return __dev_close_many(&single); + retval = __dev_close_many(&single); + list_del(&single); + return retval; } int dev_close_many(struct list_head *head) @@ -1325,7 +1328,7 @@ int dev_close(struct net_device *dev) list_add(&dev->unreg_list, &single); dev_close_many(&single); - + list_del(&single); return 0; } EXPORT_SYMBOL(dev_close); -- cgit v1.2.2 From ceaaec98ad99859ac90ac6863ad0a6cd075d8e0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2011 22:59:19 +0000 Subject: net: deinit automatic LIST_HEAD commit 9b5e383c11b08784 (net: Introduce unregister_netdevice_many()) left an active LIST_HEAD() in rollback_registered(), with possible memory corruption. Even if device is freed without touching its unreg_list (and therefore touching the previous memory location holding LISTE_HEAD(single), better close the bug for good, since its really subtle. (Same fix for default_device_exit_batch() for completeness) Reported-by: Michal Hocko Tested-by: Michal Hocko Reported-by: Eric W. Biderman Tested-by: Eric W. Biderman Signed-off-by: Linus Torvalds Signed-off-by: Eric Dumazet CC: Ingo Molnar CC: Octavian Purdila CC: stable [.33+] Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a18c1643ea9f..8ae6631abcc2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5066,6 +5066,7 @@ static void rollback_registered(struct net_device *dev) list_add(&dev->unreg_list, &single); rollback_registered_many(&single); + list_del(&single); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -6219,6 +6220,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } } unregister_netdevice_many(&dev_kill_list); + list_del(&dev_kill_list); rtnl_unlock(); } -- cgit v1.2.2 From 05e7c99136554789e4cc060a63334ccaa08ad62d Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 18 Feb 2011 09:05:08 +0100 Subject: mac80211: fix conn_mon_timer running after disassociate Low level driver could pass rx frames to us after disassociate, what can lead to run conn_mon_timer by ieee80211_sta_rx_notify(). That is obviously wrong, but nothing happens until we unload modules and resources are used after free. If kernel debugging is enabled following warning could be observed: WARNING: at lib/debugobjects.c:259 debug_print_object+0x65/0x70() Hardware name: HP xw8600 Workstation ODEBUG: free active (active state 0) object type: timer_list Modules linked in: iwlagn(-) iwlcore mac80211 cfg80211 aes_x86_64 aes_generic fuse cpufreq_ondemand acpi_cpufreq freq_table mperf xt_physdev ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ipv6 ext3 jbd dm_mirror dm_region_hash dm_log dm_mod uinput hp_wmi sparse_keymap sg wmi arc4 microcode serio_raw ecb tg3 shpchp rfkill ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif firewire_ohci firewire_core crc_itu_t mptsas mptscsih mptbase scsi_transport_sas ahci libahci pata_acpi ata_generic ata_piix floppy nouveau ttm drm_kms_helper drm i2c_algo_bit i2c_core video [last unloaded: cfg80211] Pid: 13827, comm: rmmod Tainted: G W 2.6.38-rc4-wl+ #22 Call Trace: [] ? warn_slowpath_common+0x7f/0xc0 [] ? warn_slowpath_fmt+0x46/0x50 [] ? debug_print_object+0x65/0x70 [] ? debug_check_no_obj_freed+0x125/0x210 [] ? debug_check_no_locks_freed+0xf7/0x170 [] ? kfree+0xc2/0x2f0 [] ? netdev_release+0x45/0x60 [] ? device_release+0x27/0xa0 [] ? kobject_release+0x8d/0x1a0 [] ? kobject_release+0x0/0x1a0 [] ? kref_put+0x37/0x70 [] ? kobject_put+0x27/0x60 [] ? netdev_run_todo+0x1ab/0x270 [] ? rtnl_unlock+0xe/0x10 [] ? ieee80211_unregister_hw+0x58/0x120 [mac80211] [] ? iwl_pci_remove+0xdb/0x22a [iwlagn] [] ? pci_device_remove+0x52/0x120 [] ? __device_release_driver+0x75/0xe0 [] ? driver_detach+0xd8/0xe0 [] ? bus_remove_driver+0x91/0x100 [] ? driver_unregister+0x62/0xa0 [] ? pci_unregister_driver+0x44/0xa0 [] ? iwl_exit+0x15/0x1c [iwlagn] [] ? sys_delete_module+0x1a2/0x270 [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] ? system_call_fastpath+0x16/0x1b Acked-by: Johannes Berg Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 45fbb9e33746..c9ceb4d57ab0 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1033,6 +1033,12 @@ void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, if (is_multicast_ether_addr(hdr->addr1)) return; + /* + * In case we receive frames after disassociation. + */ + if (!sdata->u.mgd.associated) + return; + ieee80211_sta_reset_conn_monitor(sdata); } -- cgit v1.2.2 From 91035f0b7d89291af728b6f3e370c3be58fcbe1b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Feb 2011 22:35:56 +0000 Subject: tcp: fix inet_twsk_deschedule() Eric W. Biederman reported a lockdep splat in inet_twsk_deschedule() This is caused by inet_twsk_purge(), run from process context, and commit 575f4cd5a5b6394577 (net: Use rcu lookups in inet_twsk_purge.) removed the BH disabling that was necessary. Add the BH disabling but fine grained, right before calling inet_twsk_deschedule(), instead of whole function. With help from Linus Torvalds and Eric W. Biederman Reported-by: Eric W. Biederman Signed-off-by: Eric Dumazet CC: Daniel Lezcano CC: Pavel Emelyanov CC: Arnaldo Carvalho de Melo CC: stable (# 2.6.33+) Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c5af909cf701..3c8dfa16614d 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -505,7 +505,9 @@ restart: } rcu_read_unlock(); + local_bh_disable(); inet_twsk_deschedule(tw, twdr); + local_bh_enable(); inet_twsk_put(tw); goto restart_rcu; } -- cgit v1.2.2 From 2205a6ea93fea76f88b43727fea53f3ce3790d6f Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 17 Feb 2011 13:12:08 +0000 Subject: sctp: fix reporting of unknown parameters commit 5fa782c2f5ef6c2e4f04d3e228412c9b4a4c8809 re-worked the handling of unknown parameters. sctp_init_cause_fixed() can now return -ENOSPC if there is not enough tailroom in the error chunk skb. When this happens, the error header is not appended to the error chunk. In that case, the payload of the unknown parameter should not be appended either. Signed-off-by: Jiri Bohac Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 2cc46f0962ca..b23428f3c0dd 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2029,11 +2029,11 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, *errp = sctp_make_op_error_fixed(asoc, chunk); if (*errp) { - sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, - WORD_ROUND(ntohs(param.p->length))); - sctp_addto_chunk_fixed(*errp, - WORD_ROUND(ntohs(param.p->length)), - param.v); + if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, + WORD_ROUND(ntohs(param.p->length)))) + sctp_addto_chunk_fixed(*errp, + WORD_ROUND(ntohs(param.p->length)), + param.v); } else { /* If there is no memory for generating the ERROR * report as specified, an ABORT will be triggered -- cgit v1.2.2 From 5f04d5068a90602b93a7953e9a47c496705c6976 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 20 Feb 2011 11:49:45 -0800 Subject: net: Fix more stale on-stack list_head objects. From: Eric W. Biederman In the beginning with batching unreg_list was a list that was used only once in the lifetime of a network device (I think). Now we have calls using the unreg_list that can happen multiple times in the life of a network device like dev_deactivate and dev_close that are also using the unreg_list. In addition in unregister_netdevice_queue we also do a list_move because for devices like veth pairs it is possible that unregister_netdevice_queue will be called multiple times. So I think the change below to fix dev_deactivate which Eric D. missed will fix this problem. Now to go test that. Signed-off-by: David S. Miller --- net/mac80211/iface.c | 1 + net/sched/sch_generic.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 8acba456744e..7a10a8d1b2d0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1229,6 +1229,7 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) } mutex_unlock(&local->iflist_mtx); unregister_netdevice_many(&unreg_list); + list_del(&unreg_list); } static u32 ieee80211_idle_off(struct ieee80211_local *local, diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 34dc598440a2..1bc698039ae2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -839,6 +839,7 @@ void dev_deactivate(struct net_device *dev) list_add(&dev->unreg_list, &single); dev_deactivate_many(&single); + list_del(&single); } static void dev_init_scheduler_queue(struct net_device *dev, -- cgit v1.2.2 From c24f691b56107feeba076616982093ee2d3c8fb5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 7 Feb 2011 12:57:04 +0000 Subject: tcp: undo_retrans counter fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a bug that undo_retrans is incorrectly decremented when undo_marker is not set or undo_retrans is already 0. This happens when sender receives more DSACK ACKs than packets retransmitted during the current undo phase. This may also happen when sender receives DSACK after the undo operation is completed or cancelled. Fix another bug that undo_retrans is incorrectly incremented when sender retransmits an skb and tcp_skb_pcount(skb) > 1 (TSO). This case is rare but not impossible. Signed-off-by: Yuchung Cheng Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 +++-- net/ipv4/tcp_output.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb7f82ebf4a3..65f6c0406245 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1222,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && + if (dup_sack && tp->undo_marker && tp->undo_retrans && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1299,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + if (tp->undo_marker && tp->undo_retrans && + after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 406f320336e6..dfa5beb0c1c8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2162,7 +2162,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans++; + tp->undo_retrans += tcp_skb_pcount(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). -- cgit v1.2.2 From 4f919a3bc54da01db829c520ce4b1fabfde1c3f7 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 22 Feb 2011 00:11:06 +0800 Subject: fix cfg80211_wext_siwfreq lock ordering... I previously managed to reproduce a hang while scanning wireless channels (reproducible with airodump-ng hopping channels); subsequent lockdep instrumentation revealed a lock ordering issue. Without knowing the design intent, it looks like the locks should be taken in reverse order; please comment. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.38-rc5-341cd #4 ------------------------------------------------------- airodump-ng/15445 is trying to acquire lock: (&rdev->devlist_mtx){+.+.+.}, at: [] cfg80211_wext_siwfreq+0xc6/0x100 but task is already holding lock: (&wdev->mtx){+.+.+.}, at: [] cfg80211_wext_siwfreq+0xbc/0x100 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&wdev->mtx){+.+.+.}: [] lock_acquire+0xc6/0x280 [] mutex_lock_nested+0x6e/0x4b0 [] cfg80211_netdev_notifier_call+0x430/0x5f0 [] notifier_call_chain+0x8b/0x100 [] raw_notifier_call_chain+0x11/0x20 [] call_netdevice_notifiers+0x32/0x60 [] __dev_notify_flags+0x34/0x80 [] dev_change_flags+0x40/0x70 [] do_setlink+0x1fc/0x8d0 [] rtnl_setlink+0xf2/0x140 [] rtnetlink_rcv_msg+0x163/0x270 [] netlink_rcv_skb+0xa1/0xd0 [] rtnetlink_rcv+0x20/0x30 [] netlink_unicast+0x2ba/0x300 [] netlink_sendmsg+0x267/0x3e0 [] sock_sendmsg+0xe4/0x110 [] sys_sendmsg+0x253/0x3b0 [] system_call_fastpath+0x16/0x1b -> #0 (&rdev->devlist_mtx){+.+.+.}: [] __lock_acquire+0x1622/0x1d10 [] lock_acquire+0xc6/0x280 [] mutex_lock_nested+0x6e/0x4b0 [] cfg80211_wext_siwfreq+0xc6/0x100 [] ioctl_standard_call+0x5d/0xd0 [] T.808+0x163/0x170 [] wext_handle_ioctl+0x3a/0x90 [] dev_ioctl+0x6f2/0x830 [] sock_ioctl+0xfd/0x290 [] do_vfs_ioctl+0x9d/0x590 [] sys_ioctl+0x4a/0x80 [] system_call_fastpath+0x16/0x1b other info that might help us debug this: 2 locks held by airodump-ng/15445: #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x12/0x20 #1: (&wdev->mtx){+.+.+.}, at: [] cfg80211_wext_siwfreq+0xbc/0x100 stack backtrace: Pid: 15445, comm: airodump-ng Not tainted 2.6.38-rc5-341cd #4 Call Trace: [] ? print_circular_bug+0xfa/0x100 [] ? __lock_acquire+0x1622/0x1d10 [] ? trace_hardirqs_off_caller+0x29/0xc0 [] ? lock_acquire+0xc6/0x280 [] ? cfg80211_wext_siwfreq+0xc6/0x100 [] ? mark_held_locks+0x67/0x90 [] ? mutex_lock_nested+0x6e/0x4b0 [] ? cfg80211_wext_siwfreq+0xc6/0x100 [] ? mark_held_locks+0x67/0x90 [] ? cfg80211_wext_siwfreq+0xc6/0x100 [] ? cfg80211_wext_siwfreq+0xc6/0x100 [] ? ioctl_standard_call+0x5d/0xd0 [] ? __dev_get_by_name+0x9b/0xc0 [] ? ioctl_standard_call+0x0/0xd0 [] ? T.808+0x163/0x170 [] ? might_fault+0x72/0xd0 [] ? wext_handle_ioctl+0x3a/0x90 [] ? might_fault+0xbb/0xd0 [] ? dev_ioctl+0x6f2/0x830 [] ? put_lock_stats+0xe/0x40 [] ? lock_release_holdtime+0xac/0x150 [] ? sock_ioctl+0xfd/0x290 [] ? do_vfs_ioctl+0x9d/0x590 [] ? fget_light+0x1df/0x3c0 [] ? sys_ioctl+0x4a/0x80 [] ? system_call_fastpath+0x16/0x1b Signed-off-by: Daniel J Blueman Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/wext-compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 3e5dbd4e4cd5..d112f038edf0 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -802,11 +802,11 @@ int cfg80211_wext_siwfreq(struct net_device *dev, return freq; if (freq == 0) return -EINVAL; - wdev_lock(wdev); mutex_lock(&rdev->devlist_mtx); + wdev_lock(wdev); err = cfg80211_set_freq(rdev, wdev, freq, NL80211_CHAN_NO_HT); - mutex_unlock(&rdev->devlist_mtx); wdev_unlock(wdev); + mutex_unlock(&rdev->devlist_mtx); return err; default: return -EOPNOTSUPP; -- cgit v1.2.2 From 9cc6e0c4c457f84bedcfb04e7dd58a36909c4ef7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 15 Feb 2011 13:19:17 +0000 Subject: bridge: Fix IPv6 multicast snooping by storing correct protocol type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The protocol type for IPv6 entries in the hash table for multicast bridge snooping is falsely set to ETH_P_IP, marking it as an IPv4 address, instead of setting it to ETH_P_IPV6, which results in negative look-ups in the hash table later. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 09d5c0987925..17708fccf1ee 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -784,7 +784,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, return 0; ipv6_addr_copy(&br_group.u.ip6, group); - br_group.proto = htons(ETH_P_IP); + br_group.proto = htons(ETH_P_IPV6); return br_multicast_add_group(br, port, &br_group); } -- cgit v1.2.2 From 649e984d00416cb1a254fdbebd6d3f9fa01c32fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 15 Feb 2011 13:19:18 +0000 Subject: bridge: Fix IPv6 multicast snooping by correcting offset in MLDv2 report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We actually want a pointer to the grec_nsrcr and not the following field. Otherwise we can get very high values for *nsrcs as the first two bytes of the IPv6 multicast address are being used instead, leading to a failing pskb_may_pull() which results in MLDv2 reports not being parsed. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 17708fccf1ee..d69beaf83627 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1013,7 +1013,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, nsrcs = skb_header_pointer(skb, len + offsetof(struct mld2_grec, - grec_mca), + grec_nsrcs), sizeof(_nsrcs), &_nsrcs); if (!nsrcs) return -EINVAL; -- cgit v1.2.2 From d41db9f3f71548f07b8b6d81a88220d0035b04f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 15 Feb 2011 13:19:19 +0000 Subject: bridge: Add missing ntohs()s for MLDv2 report parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nsrcs number is 2 Byte wide, therefore we need to call ntohs() before using it. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index d69beaf83627..9ce2af187709 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1020,11 +1020,12 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, if (!pskb_may_pull(skb, len + sizeof(*grec) + - sizeof(struct in6_addr) * (*nsrcs))) + sizeof(struct in6_addr) * ntohs(*nsrcs))) return -EINVAL; grec = (struct mld2_grec *)(skb->data + len); - len += sizeof(*grec) + sizeof(struct in6_addr) * (*nsrcs); + len += sizeof(*grec) + + sizeof(struct in6_addr) * ntohs(*nsrcs); /* We treat these as MLDv1 reports for now. */ switch (grec->grec_type) { -- cgit v1.2.2 From e4de9f9e8333fbbae951c6e068f501f955123cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Tue, 15 Feb 2011 13:19:21 +0000 Subject: bridge: Allow mcast snooping for transient link local addresses too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the multicast bridge snooping support is not active for link local multicast. I assume this has been done to leave important multicast data untouched, like IPv6 Neighborhood Discovery. In larger, bridged, local networks it could however be desirable to optimize for instance local multicast audio/video streaming too. With the transient flag in IPv6 multicast addresses we have an easy way to optimize such multimedia traffic without tempering with the high priority multicast data from well-known addresses. This patch alters the multicast bridge snooping for IPv6, to take effect for transient multicast addresses instead of non-link-local addresses. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 9ce2af187709..1207a5a0688a 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -37,10 +37,9 @@ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static inline int ipv6_is_local_multicast(const struct in6_addr *addr) +static inline int ipv6_is_transient_multicast(const struct in6_addr *addr) { - if (ipv6_addr_is_multicast(addr) && - IPV6_ADDR_MC_SCOPE(addr) <= IPV6_ADDR_SCOPE_LINKLOCAL) + if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr)) return 1; return 0; } @@ -780,7 +779,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, { struct br_ip br_group; - if (ipv6_is_local_multicast(group)) + if (!ipv6_is_transient_multicast(group)) return 0; ipv6_addr_copy(&br_group.u.ip6, group); @@ -1341,7 +1340,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, { struct br_ip br_group; - if (ipv6_is_local_multicast(group)) + if (!ipv6_is_transient_multicast(group)) return; ipv6_addr_copy(&br_group.u.ip6, group); -- cgit v1.2.2 From 36cff5a10c6b003fa2d0464848d5664b2bf723e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 17 Feb 2011 08:17:51 +0000 Subject: bridge: Fix MLD queries' ethernet source address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map the IPv6 header's destination multicast address to an ethernet source address instead of the MLD queries multicast address. For instance for a general MLD query (multicast address in the MLD query set to ::), this would wrongly be mapped to 33:33:00:00:00:00, although an MLD queries destination MAC should always be 33:33:00:00:00:01 which matches the IPv6 header's multicast destination ff02::1. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 1207a5a0688a..c1f24e4d0820 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -434,7 +434,6 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, eth = eth_hdr(skb); memcpy(eth->h_source, br->dev->dev_addr, 6); - ipv6_eth_mc_map(group, eth->h_dest); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); @@ -448,6 +447,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->hop_limit = 1; ipv6_addr_set(&ip6h->saddr, 0, 0, 0, 0); ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); + ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); hopopt[0] = IPPROTO_ICMPV6; /* next hdr */ -- cgit v1.2.2 From fe29ec41aaa51902aebd63658dfb04fe6fea8be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 17 Feb 2011 08:17:52 +0000 Subject: bridge: Use IPv6 link-local address for multicast listener queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the bridge multicast snooping feature periodically issues IPv6 general multicast listener queries to sense the absence of a listener. For this, it uses :: as its source address - however RFC 2710 requires: "To be valid, the Query message MUST come from a link-local IPv6 Source Address". Current Linux kernel versions seem to follow this requirement and ignore our bogus MLD queries. With this commit a link local address from the bridge interface is being used to issue the MLD query, resulting in other Linux devices which are multicast listeners in the network to respond with a MLD response (which was not the case before). Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index c1f24e4d0820..030a002ff8ee 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -445,7 +445,8 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->payload_len = htons(8 + sizeof(*mldq)); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; - ipv6_addr_set(&ip6h->saddr, 0, 0, 0, 0); + ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, + &ip6h->saddr); ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); -- cgit v1.2.2 From c486da34390846b430896a407b47f0cea3a4189c Mon Sep 17 00:00:00 2001 From: Lucian Adrian Grijincu Date: Thu, 24 Feb 2011 19:48:03 +0000 Subject: sysctl: ipv6: use correct net in ipv6_sysctl_rtcache_flush Before this patch issuing these commands: fd = open("/proc/sys/net/ipv6/route/flush") unshare(CLONE_NEWNET) write(fd, "stuff") would flush the newly created net, not the original one. The equivalent ipv4 code is correct (stores the net inside ->extra1). Acked-by: Daniel Lezcano Signed-off-by: David S. Miller --- net/ipv6/route.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a998db6e7895..904312e25a3c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2557,14 +2557,16 @@ static int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; - int delay = net->ipv6.sysctl.flush_delay; - if (write) { - proc_dointvec(ctl, write, buffer, lenp, ppos); - fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); - return 0; - } else + struct net *net; + int delay; + if (!write) return -EINVAL; + + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; + proc_dointvec(ctl, write, buffer, lenp, ppos); + fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); + return 0; } ctl_table ipv6_route_table_template[] = { @@ -2651,6 +2653,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) if (table) { table[0].data = &net->ipv6.sysctl.flush_delay; + table[0].extra1 = net; table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; -- cgit v1.2.2 From 0a93ea2e897bd793cc0aaaddc397eff32ac8d6fe Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 25 Feb 2011 15:33:17 +0000 Subject: RxRPC: Allocate tokens with kzalloc to avoid oops in rxrpc_destroy With slab poisoning enabled, I see the following oops: Unable to handle kernel paging request for data at address 0x6b6b6b6b6b6b6b73 ... NIP [c0000000006bc61c] .rxrpc_destroy+0x44/0x104 LR [c0000000006bc618] .rxrpc_destroy+0x40/0x104 Call Trace: [c0000000feb2bc00] [c0000000006bc618] .rxrpc_destroy+0x40/0x104 (unreliable) [c0000000feb2bc90] [c000000000349b2c] .key_cleanup+0x1a8/0x20c [c0000000feb2bd40] [c0000000000a2920] .process_one_work+0x2f4/0x4d0 [c0000000feb2be00] [c0000000000a2d50] .worker_thread+0x254/0x468 [c0000000feb2bec0] [c0000000000a868c] .kthread+0xbc/0xc8 [c0000000feb2bf90] [c000000000020e00] .kernel_thread+0x54/0x70 We aren't initialising token->next, but the code in destroy_context relies on the list being NULL terminated. Use kzalloc to zero out all the fields. Signed-off-by: Anton Blanchard Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- net/rxrpc/ar-key.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 5ee16f0353fe..d763793d39de 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -89,11 +89,11 @@ static int rxrpc_instantiate_xdr_rxkad(struct key *key, const __be32 *xdr, return ret; plen -= sizeof(*token); - token = kmalloc(sizeof(*token), GFP_KERNEL); + token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) return -ENOMEM; - token->kad = kmalloc(plen, GFP_KERNEL); + token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) { kfree(token); return -ENOMEM; @@ -731,10 +731,10 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) goto error; ret = -ENOMEM; - token = kmalloc(sizeof(*token), GFP_KERNEL); + token = kzalloc(sizeof(*token), GFP_KERNEL); if (!token) goto error; - token->kad = kmalloc(plen, GFP_KERNEL); + token->kad = kzalloc(plen, GFP_KERNEL); if (!token->kad) goto error_free; -- cgit v1.2.2 From 5aca1a9e880e06bb7e5fd553a86a330ae7e218b5 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Fri, 25 Feb 2011 13:58:54 -0800 Subject: net: handle addr_type of 0 properly addr_type of 0 means that the type should be adopted from from_dev and not from __hw_addr_del_multiple(). Unfortunately it isn't so and addr_type will always be considered. Fix this by implementing the considered and documented behavior. Signed-off-by: Hagen Paul Pfeifer Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 508f9c18992f..133fd22ea287 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -144,7 +144,7 @@ void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, list_for_each_entry(ha, &from_list->list, list) { type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, addr_type); + __hw_addr_del(to_list, ha->addr, addr_len, type); } } EXPORT_SYMBOL(__hw_addr_del_multiple); -- cgit v1.2.2 From b44d211e166b4b0dae8ce379f9d2e3ac164b5b60 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 21 Feb 2011 02:40:47 +0000 Subject: netlink: handle errors from netlink_dump() netlink_dump() may failed, but nobody handle its error. It generates output data, when a previous portion has been returned to user space. This mechanism works when all data isn't go in skb. If we enter in netlink_recvmsg() and skb is absent in the recv queue, the netlink_dump() will not been executed. So if netlink_dump() is failed one time, the new data never appear and the reader will sleep forever. netlink_dump() is called from two places: 1. from netlink_sendmsg->...->netlink_dump_start(). In this place we can report error directly and it will be returned by sendmsg(). 2. from netlink_recvmsg There we can't report error directly, because we have a portion of valid output data and call netlink_dump() for prepare the next portion. If netlink_dump() is failed, the socket will be mark as error and the next recvmsg will be failed. Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 478181d53c55..1f924595bdef 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1407,7 +1407,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, int noblock = flags&MSG_DONTWAIT; size_t copied; struct sk_buff *skb, *data_skb; - int err; + int err, ret; if (flags&MSG_OOB) return -EOPNOTSUPP; @@ -1470,8 +1470,13 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, skb_free_datagram(sk, skb); - if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) - netlink_dump(sk); + if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { + ret = netlink_dump(sk); + if (ret) { + sk->sk_err = ret; + sk->sk_error_report(sk); + } + } scm_recv(sock, msg, siocb->scm, flags); out: @@ -1736,6 +1741,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, struct netlink_callback *cb; struct sock *sk; struct netlink_sock *nlk; + int ret; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (cb == NULL) @@ -1764,9 +1770,13 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, nlk->cb = cb; mutex_unlock(nlk->cb_mutex); - netlink_dump(sk); + ret = netlink_dump(sk); + sock_put(sk); + if (ret) + return ret; + /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ -- cgit v1.2.2 From ff75f40f44ae9b79d520bf32a05d35af74a805c0 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Feb 2011 10:40:25 +0200 Subject: ipvs: fix dst_lock locking on dest update Fix dst_lock usage in __ip_vs_update_dest. We need _bh locking because destination is updated in user context. Can cause lockups on frequent destination updates. Problem reported by Simon Kirby. Bug was introduced in 2.6.37 from the "ipvs: changes for local real server" change. Signed-off-by: Julian Anastasov Signed-off-by: Hans Schillstrom Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 22f7ad5101ab..ba98e1308f3c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -808,9 +808,9 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; - spin_lock(&dest->dst_lock); + spin_lock_bh(&dest->dst_lock); ip_vs_dst_reset(dest); - spin_unlock(&dest->dst_lock); + spin_unlock_bh(&dest->dst_lock); if (add) ip_vs_new_estimator(&dest->stats); -- cgit v1.2.2 From 720dc34bbbe9493c7bd48b2243058b4e447a929d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 1 Mar 2011 23:02:07 -0800 Subject: dccp: fix oops on Reset after close This fixes a bug in the order of dccp_rcv_state_process() that still permitted reception even after closing the socket. A Reset after close thus causes a NULL pointer dereference by not preventing operations on an already torn-down socket. dccp_v4_do_rcv() | | state other than OPEN v dccp_rcv_state_process() | | DCCP_PKT_RESET v dccp_rcv_reset() | v dccp_time_wait() WARNING: at net/ipv4/inet_timewait_sock.c:141 __inet_twsk_hashdance+0x48/0x128() Modules linked in: arc4 ecb carl9170 rt2870sta(C) mac80211 r8712u(C) crc_ccitt ah [] (unwind_backtrace+0x0/0xec) from [] (warn_slowpath_common) [] (warn_slowpath_common+0x4c/0x64) from [] (warn_slowpath_n) [] (warn_slowpath_null+0x1c/0x24) from [] (__inet_twsk_hashd) [] (__inet_twsk_hashdance+0x48/0x128) from [] (dccp_time_wai) [] (dccp_time_wait+0x40/0xc8) from [] (dccp_rcv_state_proces) [] (dccp_rcv_state_process+0x120/0x538) from [] (dccp_v4_do_) [] (dccp_v4_do_rcv+0x11c/0x14c) from [] (release_sock+0xac/0) [] (release_sock+0xac/0x110) from [] (dccp_close+0x28c/0x380) [] (dccp_close+0x28c/0x380) from [] (inet_release+0x64/0x70) The fix is by testing the socket state first. Receiving a packet in Closed state now also produces the required "No connection" Reset reply of RFC 4340, 8.3.1. Reported-and-tested-by: Johan Hovold Cc: stable@kernel.org Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/input.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dccp/input.c b/net/dccp/input.c index 8cde009e8b85..4222e7a654b0 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -614,6 +614,9 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; + } else if (sk->sk_state == DCCP_CLOSED) { + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; } if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { @@ -668,10 +671,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } switch (sk->sk_state) { - case DCCP_CLOSED: - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - case DCCP_REQUESTING: queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) -- cgit v1.2.2 From 9ef0298a8e5730d9a46d640014c727f3b4152870 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 2 Mar 2011 12:10:13 +0100 Subject: netfilter: nf_log: avoid oops in (un)bind with invalid nfproto values Like many other places, we have to check that the array index is within allowed limits, or otherwise, a kernel oops and other nastiness can ensue when we access memory beyond the end of the array. [ 5954.115381] BUG: unable to handle kernel paging request at 0000004000000000 [ 5954.120014] IP: __find_logger+0x6f/0xa0 [ 5954.123979] nf_log_bind_pf+0x2b/0x70 [ 5954.123979] nfulnl_recv_config+0xc0/0x4a0 [nfnetlink_log] [ 5954.123979] nfnetlink_rcv_msg+0x12c/0x1b0 [nfnetlink] ... The problem goes back to v2.6.30-rc1~1372~1342~31 where nf_log_bind was decoupled from nf_log_register. Reported-by: Miguel Di Ciurcio Filho , via irc.freenode.net/#netfilter Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/nf_log.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index b07393eab88e..91816998ed86 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -85,6 +85,8 @@ EXPORT_SYMBOL(nf_log_unregister); int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) { + if (pf >= ARRAY_SIZE(nf_loggers)) + return -EINVAL; mutex_lock(&nf_log_mutex); if (__find_logger(pf, logger->name) == NULL) { mutex_unlock(&nf_log_mutex); @@ -98,6 +100,8 @@ EXPORT_SYMBOL(nf_log_bind_pf); void nf_log_unbind_pf(u_int8_t pf) { + if (pf >= ARRAY_SIZE(nf_loggers)) + return; mutex_lock(&nf_log_mutex); rcu_assign_pointer(nf_loggers[pf], NULL); mutex_unlock(&nf_log_mutex); -- cgit v1.2.2 From f3d7bc57c71eba3f279785111bb473b1ef68dcb6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 2 Mar 2011 10:35:33 +0000 Subject: net: dcbnl: check correct ops in dcbnl_ieee_set() The incorrect ops routine was being tested for in DCB_ATTR_IEEE_PFC attributes. This patch corrects it. Currently, every driver implementing ieee_setets also implements ieee_setpfc so this bug is not actualized yet. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index d5074a567289..c44348adba3b 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1193,7 +1193,7 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb, goto err; } - if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setets) { + if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); if (err) -- cgit v1.2.2 From 10003453479ef287a73f8a39593f8f42687ea565 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 28 Feb 2011 03:27:43 +0000 Subject: AF_RXRPC: Handle receiving ACKALL packets The OpenAFS server is now sending ACKALL packets, so we need to handle them. Otherwise we report a protocol error and abort. Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/ar-input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 89315009bab1..1a2b0633fece 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -423,6 +423,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) goto protocol_error; } + case RXRPC_PACKET_TYPE_ACKALL: case RXRPC_PACKET_TYPE_ACK: /* ACK processing is done in process context */ read_lock_bh(&call->state_lock); -- cgit v1.2.2 From 38815b780285a4957852c5c9dbe94991c0b26c56 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Mar 2011 16:55:21 -0800 Subject: libceph: fix handling of short returns from get_user_pages get_user_pages() can return fewer pages than we ask for. We were returning a bogus pointer/error code in that case. Instead, loop until we get all the pages we want or get an error we can return to the caller. Signed-off-by: Sage Weil --- net/ceph/pagevec.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 1a040e64c69f..cd9c21df87d1 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -16,22 +16,30 @@ struct page **ceph_get_direct_page_vector(const char __user *data, int num_pages, bool write_page) { struct page **pages; - int rc; + int got = 0; + int rc = 0; pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); if (!pages) return ERR_PTR(-ENOMEM); down_read(¤t->mm->mmap_sem); - rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, write_page, 0, pages, NULL); + while (got < num_pages) { + rc = get_user_pages(current, current->mm, + (unsigned long)data + ((unsigned long)got * PAGE_SIZE), + num_pages - got, write_page, 0, pages + got, NULL); + if (rc < 0) + break; + BUG_ON(rc == 0); + got += rc; + } up_read(¤t->mm->mmap_sem); - if (rc < num_pages) + if (rc < 0) goto fail; return pages; fail: - ceph_put_page_vector(pages, rc > 0 ? rc : 0, false); + ceph_put_page_vector(pages, got, false); return ERR_PTR(rc); } EXPORT_SYMBOL(ceph_get_direct_page_vector); -- cgit v1.2.2 From 692d20f576fb26f62c83f80dbf3ea899998391b7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Mar 2011 12:14:53 -0800 Subject: libceph: retry after authorization failure If we mark the connection CLOSED we will give up trying to reconnect to this server instance. That is appropriate for things like a protocol version mismatch that won't change until the server is restarted, at which point we'll get a new addr and reconnect. An authorization failure like this is probably due to the server not properly rotating it's secret keys, however, and should be treated as transient so that the normal backoff and retry behavior kicks in. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 35b36b86d762..6bd5025f6220 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1248,8 +1248,6 @@ static int process_connect(struct ceph_connection *con) con->auth_retry); if (con->auth_retry == 2) { con->error_msg = "connect authorization failure"; - reset_connection(con); - set_bit(CLOSED, &con->state); return -1; } con->auth_retry = 1; -- cgit v1.2.2 From 1362fa078dae16776cd439791c6605b224ea6171 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Mar 2011 11:28:58 +0000 Subject: DNS: Fix a NULL pointer deref when trying to read an error key [CVE-2011-1076] When a DNS resolver key is instantiated with an error indication, attempts to read that key will result in an oops because user_read() is expecting there to be a payload - and there isn't one [CVE-2011-1076]. Give the DNS resolver key its own read handler that returns the error cached in key->type_data.x[0] as an error rather than crashing. Also make the kenter() at the beginning of dns_resolver_instantiate() limit the amount of data it prints, since the data is not necessarily NUL-terminated. The buggy code was added in: commit 4a2d789267e00b5a1175ecd2ddefcc78b83fbf09 Author: Wang Lei Date: Wed Aug 11 09:37:58 2010 +0100 Subject: DNS: If the DNS server returns an error, allow that to be cached [ver #2] This can trivially be reproduced by any user with the following program compiled with -lkeyutils: #include #include #include static char payload[] = "#dnserror=6"; int main() { key_serial_t key; key = add_key("dns_resolver", "a", payload, sizeof(payload), KEY_SPEC_SESSION_KEYRING); if (key == -1) err(1, "add_key"); if (keyctl_read(key, NULL, 0) == -1) err(1, "read_key"); return 0; } What should happen is that keyctl_read() reports error 6 (ENXIO) to the user: dns-break: read_key: No such device or address but instead the kernel oopses. This cannot be reproduced with the 'keyutils add' or 'keyutils padd' commands as both of those cut the data down below the NUL termination that must be included in the data. Without this dns_resolver_instantiate() will return -EINVAL and the key will not be instantiated such that it can be read. The oops looks like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] user_read+0x4f/0x8f PGD 3bdf8067 PUD 385b9067 PMD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/pci0000:00/0000:00:19.0/irq CPU 0 Modules linked in: Pid: 2150, comm: dns-break Not tainted 2.6.38-rc7-cachefs+ #468 /DG965RY RIP: 0010:[] [] user_read+0x4f/0x8f RSP: 0018:ffff88003bf47f08 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff88003b5ea378 RCX: ffffffff81972368 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88003b5ea378 RBP: ffff88003bf47f28 R08: ffff88003be56620 R09: 0000000000000000 R10: 0000000000000395 R11: 0000000000000002 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: ffffffffffffffa1 FS: 00007feab5751700(0000) GS:ffff88003e000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 000000003de40000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process dns-break (pid: 2150, threadinfo ffff88003bf46000, task ffff88003be56090) Stack: ffff88003b5ea378 ffff88003b5ea3a0 0000000000000000 0000000000000000 ffff88003bf47f68 ffffffff811b708e ffff88003c442bc8 0000000000000000 00000000004005a0 00007fffba368060 0000000000000000 0000000000000000 Call Trace: [] keyctl_read_key+0xac/0xcf [] sys_keyctl+0x75/0xb6 [] system_call_fastpath+0x16/0x1b Code: 75 1f 48 83 7b 28 00 75 18 c6 05 58 2b fb 00 01 be bb 00 00 00 48 c7 c7 76 1c 75 81 e8 13 c2 e9 ff 4c 8b b3 e0 00 00 00 4d 85 ed <41> 0f b7 5e 10 74 2d 4d 85 e4 74 28 e8 98 79 ee ff 49 39 dd 48 RIP [] user_read+0x4f/0x8f RSP CR2: 0000000000000010 Signed-off-by: David Howells Acked-by: Jeff Layton cc: Wang Lei Signed-off-by: James Morris --- net/dns_resolver/dns_key.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 739435a6af39..cfa7a5e1c5c9 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -67,8 +67,9 @@ dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen) size_t result_len = 0; const char *data = _data, *end, *opt; - kenter("%%%d,%s,'%s',%zu", - key->serial, key->description, data, datalen); + kenter("%%%d,%s,'%*.*s',%zu", + key->serial, key->description, + (int)datalen, (int)datalen, data, datalen); if (datalen <= 1 || !data || data[datalen - 1] != '\0') return -EINVAL; @@ -217,6 +218,19 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m) seq_printf(m, ": %u", key->datalen); } +/* + * read the DNS data + * - the key's semaphore is read-locked + */ +static long dns_resolver_read(const struct key *key, + char __user *buffer, size_t buflen) +{ + if (key->type_data.x[0]) + return key->type_data.x[0]; + + return user_read(key, buffer, buflen); +} + struct key_type key_type_dns_resolver = { .name = "dns_resolver", .instantiate = dns_resolver_instantiate, @@ -224,7 +238,7 @@ struct key_type key_type_dns_resolver = { .revoke = user_revoke, .destroy = user_destroy, .describe = dns_resolver_describe, - .read = user_read, + .read = dns_resolver_read, }; static int __init init_dns_resolver(void) -- cgit v1.2.2 From 60bf8bf8815e6adea4c1d0423578c3b8000e2ec8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Mar 2011 12:24:28 -0800 Subject: libceph: fix msgr backoff With commit f363e45f we replaced a bunch of hacky workqueue mutual exclusion logic with the WQ_NON_REENTRANT flag. One pieces of fallout is that the exponential backoff breaks in certain cases: * con_work attempts to connect. * we get an immediate failure, and the socket state change handler queues immediate work. * con_work calls con_fault, we decide to back off, but can't queue delayed work. In this case, we add a BACKOFF bit to make con_work reschedule delayed work next time it runs (which should be immediately). Signed-off-by: Sage Weil --- net/ceph/messenger.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6bd5025f6220..46fbc422ba74 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1949,6 +1949,19 @@ static void con_work(struct work_struct *work) work.work); mutex_lock(&con->mutex); + if (test_and_clear_bit(BACKOFF, &con->state)) { + dout("con_work %p backing off\n", con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay))) { + dout("con_work %p backoff %lu\n", con, con->delay); + mutex_unlock(&con->mutex); + return; + } else { + con->ops->put(con); + dout("con_work %p FAILED to back off %lu\n", con, + con->delay); + } + } if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); @@ -2017,11 +2030,24 @@ static void ceph_fault(struct ceph_connection *con) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; - dout("fault queueing %p delay %lu\n", con, con->delay); con->ops->get(con); if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay)) == 0) + round_jiffies_relative(con->delay))) { + dout("fault queued %p delay %lu\n", con, con->delay); + } else { con->ops->put(con); + dout("fault failed to queue %p delay %lu, backoff\n", + con, con->delay); + /* + * In many cases we see a socket state change + * while con_work is running and end up + * queuing (non-delayed) work, such that we + * can't backoff with a delay. Set a flag so + * that when con_work restarts we schedule the + * delay then. + */ + set_bit(BACKOFF, &con->state); + } } out_unlock: -- cgit v1.2.2 From e76661d0a59e53e5cc4dccbe4b755d1dc8a968ec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Mar 2011 10:10:15 -0800 Subject: libceph: fix msgr keepalive flag There was some broken keepalive code using a dead variable. Shift to using the proper bit flag. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 46fbc422ba74..3252ea974e8f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -336,7 +336,6 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } - con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -2019,10 +2018,10 @@ static void ceph_fault(struct ceph_connection *con) /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); - /* If there are no messages in the queue, place the connection - * in a STANDBY state (i.e., don't try to reconnect just yet). */ - if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { - dout("fault setting STANDBY\n"); + /* If there are no messages queued or keepalive pending, place + * the connection in a STANDBY state */ + if (list_empty(&con->out_queue) && + !test_bit(KEEPALIVE_PENDING, &con->state)) { set_bit(STANDBY, &con->state); } else { /* retry after a delay. */ -- cgit v1.2.2 From e00de341fdb76c955703b4438100f9933c452b7f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Mar 2011 12:25:05 -0800 Subject: libceph: fix msgr standby handling The standby logic used to be pretty dependent on the work requeueing behavior that changed when we switched to WQ_NON_REENTRANT. It was also very fragile. Restructure things so that: - We clear WRITE_PENDING when we set STANDBY. This ensures we will requeue work when we wake up later. - con_work backs off if STANDBY is set. There is nothing to do if we are in standby. - clear_standby() helper is called by both con_send() and con_keepalive(), the two actions that can wake us up again. Move the connect_seq++ logic here. Signed-off-by: Sage Weil --- net/ceph/messenger.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3252ea974e8f..05f357828a2f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1712,14 +1712,6 @@ more: /* open the socket first? */ if (con->sock == NULL) { - /* - * if we were STANDBY and are reconnecting _this_ - * connection, bump connect_seq now. Always bump - * global_seq. - */ - if (test_and_clear_bit(STANDBY, &con->state)) - con->connect_seq++; - prepare_write_banner(msgr, con); prepare_write_connect(msgr, con, 1); prepare_read_banner(con); @@ -1962,6 +1954,10 @@ static void con_work(struct work_struct *work) } } + if (test_bit(STANDBY, &con->state)) { + dout("con_work %p STANDBY\n", con); + goto done; + } if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -2022,6 +2018,8 @@ static void ceph_fault(struct ceph_connection *con) * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && !test_bit(KEEPALIVE_PENDING, &con->state)) { + dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); + clear_bit(WRITE_PENDING, &con->state); set_bit(STANDBY, &con->state); } else { /* retry after a delay. */ @@ -2117,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr) } EXPORT_SYMBOL(ceph_messenger_destroy); +static void clear_standby(struct ceph_connection *con) +{ + /* come back from STANDBY? */ + if (test_and_clear_bit(STANDBY, &con->state)) { + mutex_lock(&con->mutex); + dout("clear_standby %p and ++connect_seq\n", con); + con->connect_seq++; + WARN_ON(test_bit(WRITE_PENDING, &con->state)); + WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); + mutex_unlock(&con->mutex); + } +} + /* * Queue up an outgoing message on the given connection. */ @@ -2149,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ + clear_standby(con); if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } @@ -2214,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) */ void ceph_con_keepalive(struct ceph_connection *con) { + dout("con_keepalive %p\n", con); + clear_standby(con); if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); -- cgit v1.2.2