aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeil Horman <nhorman@tuxdriver.com>2011-05-25 04:13:01 -0400
committerDavid S. Miller <davem@davemloft.net>2011-05-25 17:55:33 -0400
commit9fe0617d9b6d21f700ee9e658e1c9fe3be2fb402 (patch)
treed1e3c425be5697479514792e8183fc1c9c6a32c4
parent2907c35ff64708065e5a7fd54e8ded8263eb3074 (diff)
bonding: prevent deadlock on slave store with alb mode (v3)
This soft lockup was recently reported: [root@dell-per715-01 ~]# echo +bond5 > /sys/class/net/bonding_masters [root@dell-per715-01 ~]# echo +eth1 > /sys/class/net/bond5/bonding/slaves bonding: bond5: doing slave updates when interface is down. bonding bond5: master_dev is not up in bond_enslave [root@dell-per715-01 ~]# echo -eth1 > /sys/class/net/bond5/bonding/slaves bonding: bond5: doing slave updates when interface is down. BUG: soft lockup - CPU#12 stuck for 60s! [bash:6444] CPU 12: Modules linked in: bonding autofs4 hidp rfcomm l2cap bluetooth lockd sunrpc be2d Pid: 6444, comm: bash Not tainted 2.6.18-262.el5 #1 RIP: 0010:[<ffffffff80064bf0>] [<ffffffff80064bf0>] .text.lock.spinlock+0x26/00 RSP: 0018:ffff810113167da8 EFLAGS: 00000286 RAX: ffff810113167fd8 RBX: ffff810123a47800 RCX: 0000000000ff1025 RDX: 0000000000000000 RSI: ffff810123a47800 RDI: ffff81021b57f6f8 RBP: ffff81021b57f500 R08: 0000000000000000 R09: 000000000000000c R10: 00000000ffffffff R11: ffff81011d41c000 R12: ffff81021b57f000 R13: 0000000000000000 R14: 0000000000000282 R15: 0000000000000282 FS: 00002b3b41ef3f50(0000) GS:ffff810123b27940(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00002b3b456dd000 CR3: 000000031fc60000 CR4: 00000000000006e0 Call Trace: [<ffffffff80064af9>] _spin_lock_bh+0x9/0x14 [<ffffffff886937d7>] :bonding:tlb_clear_slave+0x22/0xa1 [<ffffffff8869423c>] :bonding:bond_alb_deinit_slave+0xba/0xf0 [<ffffffff8868dda6>] :bonding:bond_release+0x1b4/0x450 [<ffffffff8006457b>] __down_write_nested+0x12/0x92 [<ffffffff88696ae4>] :bonding:bonding_store_slaves+0x25c/0x2f7 [<ffffffff801106f7>] sysfs_write_file+0xb9/0xe8 [<ffffffff80016b87>] vfs_write+0xce/0x174 [<ffffffff80017450>] sys_write+0x45/0x6e [<ffffffff8005d28d>] tracesys+0xd5/0xe0 It occurs because we are able to change the slave configuarion of a bond while the bond interface is down. The bonding driver initializes some data structures only after its ndo_open routine is called. Among them is the initalization of the alb tx and rx hash locks. So if we add or remove a slave without first opening the bond master device, we run the risk of trying to lock/unlock a spinlock that has garbage for data in it, which results in our above softlock. Note that sometimes this works, because in many cases an unlocked spinlock has the raw_lock parameter initialized to zero (meaning that the kzalloc of the net_device private data is equivalent to calling spin_lock_init), but thats not true in all cases, and we aren't guaranteed that condition, so we need to pass the relevant spinlocks through the spin_lock_init function. Fix it by moving the spin_lock_init calls for the tx and rx hashtable locks to the ndo_init path, so they are ready for use by the bond_store_slaves path. Change notes: v2) Based on conversation with Jay and Nicolas it seems that the ability to enslave devices while the bond master is down should be safe to do. As such this is an outlier bug, and so instead we'll just initalize the errant spinlocks in the init path rather than the open path, solving the problem. We'll also remove the warnings about the bond being down during enslave operations, since it should be safe v3) Fix spelling error Signed-off-by: Neil Horman <nhorman@tuxdriver.com> Reported-by: jtluka@redhat.com CC: Jay Vosburgh <fubar@us.ibm.com> CC: Andy Gospodarek <andy@greyhouse.net> CC: nicolas.2p.debian@gmail.com CC: "David S. Miller" <davem@davemloft.net> Signed-off-by: Jay Vosburgh <fubar@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/bonding/bond_alb.c4
-rw-r--r--drivers/net/bonding/bond_main.c16
-rw-r--r--drivers/net/bonding/bond_sysfs.c6
3 files changed, 10 insertions, 16 deletions
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 8f2d2e7c70e5..2df9276720a0 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -163,8 +163,6 @@ static int tlb_initialize(struct bonding *bond)
163 struct tlb_client_info *new_hashtbl; 163 struct tlb_client_info *new_hashtbl;
164 int i; 164 int i;
165 165
166 spin_lock_init(&(bond_info->tx_hashtbl_lock));
167
168 new_hashtbl = kzalloc(size, GFP_KERNEL); 166 new_hashtbl = kzalloc(size, GFP_KERNEL);
169 if (!new_hashtbl) { 167 if (!new_hashtbl) {
170 pr_err("%s: Error: Failed to allocate TLB hash table\n", 168 pr_err("%s: Error: Failed to allocate TLB hash table\n",
@@ -747,8 +745,6 @@ static int rlb_initialize(struct bonding *bond)
747 int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info); 745 int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);
748 int i; 746 int i;
749 747
750 spin_lock_init(&(bond_info->rx_hashtbl_lock));
751
752 new_hashtbl = kmalloc(size, GFP_KERNEL); 748 new_hashtbl = kmalloc(size, GFP_KERNEL);
753 if (!new_hashtbl) { 749 if (!new_hashtbl) {
754 pr_err("%s: Error: Failed to allocate RLB hash table\n", 750 pr_err("%s: Error: Failed to allocate RLB hash table\n",
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 6dc428461541..9ec4a505a79f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1542,12 +1542,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
1542 bond_dev->name, slave_dev->name); 1542 bond_dev->name, slave_dev->name);
1543 } 1543 }
1544 1544
1545 /* bond must be initialized by bond_open() before enslaving */
1546 if (!(bond_dev->flags & IFF_UP)) {
1547 pr_warning("%s: master_dev is not up in bond_enslave\n",
1548 bond_dev->name);
1549 }
1550
1551 /* already enslaved */ 1545 /* already enslaved */
1552 if (slave_dev->flags & IFF_SLAVE) { 1546 if (slave_dev->flags & IFF_SLAVE) {
1553 pr_debug("Error, Device was already enslaved\n"); 1547 pr_debug("Error, Device was already enslaved\n");
@@ -4834,9 +4828,19 @@ static int bond_init(struct net_device *bond_dev)
4834{ 4828{
4835 struct bonding *bond = netdev_priv(bond_dev); 4829 struct bonding *bond = netdev_priv(bond_dev);
4836 struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); 4830 struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);
4831 struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
4837 4832
4838 pr_debug("Begin bond_init for %s\n", bond_dev->name); 4833 pr_debug("Begin bond_init for %s\n", bond_dev->name);
4839 4834
4835 /*
4836 * Initialize locks that may be required during
4837 * en/deslave operations. All of the bond_open work
4838 * (of which this is part) should really be moved to
4839 * a phase prior to dev_open
4840 */
4841 spin_lock_init(&(bond_info->tx_hashtbl_lock));
4842 spin_lock_init(&(bond_info->rx_hashtbl_lock));
4843
4840 bond->wq = create_singlethread_workqueue(bond_dev->name); 4844 bond->wq = create_singlethread_workqueue(bond_dev->name);
4841 if (!bond->wq) 4845 if (!bond->wq)
4842 return -ENOMEM; 4846 return -ENOMEM;
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 4059bfc73dbf..bb1319f9f173 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -227,12 +227,6 @@ static ssize_t bonding_store_slaves(struct device *d,
227 struct net_device *dev; 227 struct net_device *dev;
228 struct bonding *bond = to_bond(d); 228 struct bonding *bond = to_bond(d);
229 229
230 /* Quick sanity check -- is the bond interface up? */
231 if (!(bond->dev->flags & IFF_UP)) {
232 pr_warning("%s: doing slave updates when interface is down.\n",
233 bond->dev->name);
234 }
235
236 if (!rtnl_trylock()) 230 if (!rtnl_trylock())
237 return restart_syscall(); 231 return restart_syscall();
238 232