diff options
author | dingtianhong <dingtianhong@huawei.com> | 2014-02-25 22:05:23 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-02-26 16:02:56 -0500 |
commit | b0929915e0356acedf59504521c097ecada88b19 (patch) | |
tree | dc6e143976647fb829bd75bdaeeffb9d8b201014 /drivers/net/bonding | |
parent | 5e5b066535f0ee58e5de3a2db5fb56fa3cd7e3b1 (diff) |
bonding: Fix RTNL: assertion failed at net/core/rtnetlink.c for ab arp monitor
Veaceslav has reported and fix this problem by commit f2ebd477f141bc0
(bonding: restructure locking of bond_ab_arp_probe()). According Jay's
opinion, the current solution is not very well, because the notification
is to indicate that the interface has actually changed state in a meaningful
way, but these calls in the ab ARP monitor are internal settings of the flags
to allow the ARP monitor to search for a slave to become active when there are
no active slaves. The flag setting to active or backup is to permit the ARP
monitor's response logic to do the right thing when deciding if the test
slave (current_arp_slave) is up or not.
So the best way to fix the problem is that we should not send a notification
when the slave is in testing state, and check the state at the end of the
monitor, if the slave's state recover, avoid to send pointless notification
twice. And RTNL is really a big lock, hold it regardless the slave's state
changed or not when the current_active_slave is null will loss performance
(every 100ms), so we should hold it only when the slave's state changed and
need to notify.
I revert the old commit and add new modifications.
Cc: Jay Vosburgh <fubar@us.ibm.com>
Cc: Veaceslav Falico <vfalico@redhat.com>
Cc: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/bonding')
-rw-r--r-- | drivers/net/bonding/bond_3ad.c | 8 | ||||
-rw-r--r-- | drivers/net/bonding/bond_main.c | 80 | ||||
-rw-r--r-- | drivers/net/bonding/bonding.h | 13 |
3 files changed, 55 insertions, 46 deletions
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 6826e4f61060..dcde56057fe1 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c | |||
@@ -2130,13 +2130,7 @@ re_arm: | |||
2130 | read_unlock(&bond->lock); | 2130 | read_unlock(&bond->lock); |
2131 | 2131 | ||
2132 | if (should_notify_rtnl && rtnl_trylock()) { | 2132 | if (should_notify_rtnl && rtnl_trylock()) { |
2133 | bond_for_each_slave(bond, slave, iter) { | 2133 | bond_slave_state_notify(bond); |
2134 | if (slave->should_notify) { | ||
2135 | rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, | ||
2136 | GFP_KERNEL); | ||
2137 | slave->should_notify = 0; | ||
2138 | } | ||
2139 | } | ||
2140 | rtnl_unlock(); | 2134 | rtnl_unlock(); |
2141 | } | 2135 | } |
2142 | queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks); | 2136 | queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks); |
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e02029bbf5cc..82b70ff1fd28 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c | |||
@@ -2623,17 +2623,17 @@ do_failover: | |||
2623 | 2623 | ||
2624 | /* | 2624 | /* |
2625 | * Send ARP probes for active-backup mode ARP monitor. | 2625 | * Send ARP probes for active-backup mode ARP monitor. |
2626 | * | ||
2627 | * Called with rcu_read_lock hold. | ||
2626 | */ | 2628 | */ |
2627 | static bool bond_ab_arp_probe(struct bonding *bond) | 2629 | static bool bond_ab_arp_probe(struct bonding *bond) |
2628 | { | 2630 | { |
2629 | struct slave *slave, *before = NULL, *new_slave = NULL, | 2631 | struct slave *slave, *before = NULL, *new_slave = NULL, |
2630 | *curr_arp_slave, *curr_active_slave; | 2632 | *curr_arp_slave = rcu_dereference(bond->current_arp_slave), |
2633 | *curr_active_slave = rcu_dereference(bond->curr_active_slave); | ||
2631 | struct list_head *iter; | 2634 | struct list_head *iter; |
2632 | bool found = false; | 2635 | bool found = false; |
2633 | 2636 | bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; | |
2634 | rcu_read_lock(); | ||
2635 | curr_arp_slave = rcu_dereference(bond->current_arp_slave); | ||
2636 | curr_active_slave = rcu_dereference(bond->curr_active_slave); | ||
2637 | 2637 | ||
2638 | if (curr_arp_slave && curr_active_slave) | 2638 | if (curr_arp_slave && curr_active_slave) |
2639 | pr_info("PROBE: c_arp %s && cas %s BAD\n", | 2639 | pr_info("PROBE: c_arp %s && cas %s BAD\n", |
@@ -2642,32 +2642,23 @@ static bool bond_ab_arp_probe(struct bonding *bond) | |||
2642 | 2642 | ||
2643 | if (curr_active_slave) { | 2643 | if (curr_active_slave) { |
2644 | bond_arp_send_all(bond, curr_active_slave); | 2644 | bond_arp_send_all(bond, curr_active_slave); |
2645 | rcu_read_unlock(); | 2645 | return should_notify_rtnl; |
2646 | return true; | ||
2647 | } | 2646 | } |
2648 | rcu_read_unlock(); | ||
2649 | 2647 | ||
2650 | /* if we don't have a curr_active_slave, search for the next available | 2648 | /* if we don't have a curr_active_slave, search for the next available |
2651 | * backup slave from the current_arp_slave and make it the candidate | 2649 | * backup slave from the current_arp_slave and make it the candidate |
2652 | * for becoming the curr_active_slave | 2650 | * for becoming the curr_active_slave |
2653 | */ | 2651 | */ |
2654 | 2652 | ||
2655 | if (!rtnl_trylock()) | ||
2656 | return false; | ||
2657 | /* curr_arp_slave might have gone away */ | ||
2658 | curr_arp_slave = ACCESS_ONCE(bond->current_arp_slave); | ||
2659 | |||
2660 | if (!curr_arp_slave) { | 2653 | if (!curr_arp_slave) { |
2661 | curr_arp_slave = bond_first_slave(bond); | 2654 | curr_arp_slave = bond_first_slave_rcu(bond); |
2662 | if (!curr_arp_slave) { | 2655 | if (!curr_arp_slave) |
2663 | rtnl_unlock(); | 2656 | return should_notify_rtnl; |
2664 | return true; | ||
2665 | } | ||
2666 | } | 2657 | } |
2667 | 2658 | ||
2668 | bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_NOW); | 2659 | bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_LATER); |
2669 | 2660 | ||
2670 | bond_for_each_slave(bond, slave, iter) { | 2661 | bond_for_each_slave_rcu(bond, slave, iter) { |
2671 | if (!found && !before && IS_UP(slave->dev)) | 2662 | if (!found && !before && IS_UP(slave->dev)) |
2672 | before = slave; | 2663 | before = slave; |
2673 | 2664 | ||
@@ -2686,7 +2677,7 @@ static bool bond_ab_arp_probe(struct bonding *bond) | |||
2686 | slave->link_failure_count++; | 2677 | slave->link_failure_count++; |
2687 | 2678 | ||
2688 | bond_set_slave_inactive_flags(slave, | 2679 | bond_set_slave_inactive_flags(slave, |
2689 | BOND_SLAVE_NOTIFY_NOW); | 2680 | BOND_SLAVE_NOTIFY_LATER); |
2690 | 2681 | ||
2691 | pr_info("%s: backup interface %s is now down.\n", | 2682 | pr_info("%s: backup interface %s is now down.\n", |
2692 | bond->dev->name, slave->dev->name); | 2683 | bond->dev->name, slave->dev->name); |
@@ -2698,26 +2689,31 @@ static bool bond_ab_arp_probe(struct bonding *bond) | |||
2698 | if (!new_slave && before) | 2689 | if (!new_slave && before) |
2699 | new_slave = before; | 2690 | new_slave = before; |
2700 | 2691 | ||
2701 | if (!new_slave) { | 2692 | if (!new_slave) |
2702 | rtnl_unlock(); | 2693 | goto check_state; |
2703 | return true; | ||
2704 | } | ||
2705 | 2694 | ||
2706 | new_slave->link = BOND_LINK_BACK; | 2695 | new_slave->link = BOND_LINK_BACK; |
2707 | bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); | 2696 | bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER); |
2708 | bond_arp_send_all(bond, new_slave); | 2697 | bond_arp_send_all(bond, new_slave); |
2709 | new_slave->jiffies = jiffies; | 2698 | new_slave->jiffies = jiffies; |
2710 | rcu_assign_pointer(bond->current_arp_slave, new_slave); | 2699 | rcu_assign_pointer(bond->current_arp_slave, new_slave); |
2711 | rtnl_unlock(); | ||
2712 | 2700 | ||
2713 | return true; | 2701 | check_state: |
2702 | bond_for_each_slave_rcu(bond, slave, iter) { | ||
2703 | if (slave->should_notify) { | ||
2704 | should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; | ||
2705 | break; | ||
2706 | } | ||
2707 | } | ||
2708 | return should_notify_rtnl; | ||
2714 | } | 2709 | } |
2715 | 2710 | ||
2716 | static void bond_activebackup_arp_mon(struct work_struct *work) | 2711 | static void bond_activebackup_arp_mon(struct work_struct *work) |
2717 | { | 2712 | { |
2718 | struct bonding *bond = container_of(work, struct bonding, | 2713 | struct bonding *bond = container_of(work, struct bonding, |
2719 | arp_work.work); | 2714 | arp_work.work); |
2720 | bool should_notify_peers = false, should_commit = false; | 2715 | bool should_notify_peers = false; |
2716 | bool should_notify_rtnl = false; | ||
2721 | int delta_in_ticks; | 2717 | int delta_in_ticks; |
2722 | 2718 | ||
2723 | delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); | 2719 | delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); |
@@ -2726,11 +2722,12 @@ static void bond_activebackup_arp_mon(struct work_struct *work) | |||
2726 | goto re_arm; | 2722 | goto re_arm; |
2727 | 2723 | ||
2728 | rcu_read_lock(); | 2724 | rcu_read_lock(); |
2725 | |||
2729 | should_notify_peers = bond_should_notify_peers(bond); | 2726 | should_notify_peers = bond_should_notify_peers(bond); |
2730 | should_commit = bond_ab_arp_inspect(bond); | ||
2731 | rcu_read_unlock(); | ||
2732 | 2727 | ||
2733 | if (should_commit) { | 2728 | if (bond_ab_arp_inspect(bond)) { |
2729 | rcu_read_unlock(); | ||
2730 | |||
2734 | /* Race avoidance with bond_close flush of workqueue */ | 2731 | /* Race avoidance with bond_close flush of workqueue */ |
2735 | if (!rtnl_trylock()) { | 2732 | if (!rtnl_trylock()) { |
2736 | delta_in_ticks = 1; | 2733 | delta_in_ticks = 1; |
@@ -2739,23 +2736,28 @@ static void bond_activebackup_arp_mon(struct work_struct *work) | |||
2739 | } | 2736 | } |
2740 | 2737 | ||
2741 | bond_ab_arp_commit(bond); | 2738 | bond_ab_arp_commit(bond); |
2739 | |||
2742 | rtnl_unlock(); | 2740 | rtnl_unlock(); |
2741 | rcu_read_lock(); | ||
2743 | } | 2742 | } |
2744 | 2743 | ||
2745 | if (!bond_ab_arp_probe(bond)) { | 2744 | should_notify_rtnl = bond_ab_arp_probe(bond); |
2746 | /* rtnl locking failed, re-arm */ | 2745 | rcu_read_unlock(); |
2747 | delta_in_ticks = 1; | ||
2748 | should_notify_peers = false; | ||
2749 | } | ||
2750 | 2746 | ||
2751 | re_arm: | 2747 | re_arm: |
2752 | if (bond->params.arp_interval) | 2748 | if (bond->params.arp_interval) |
2753 | queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks); | 2749 | queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks); |
2754 | 2750 | ||
2755 | if (should_notify_peers) { | 2751 | if (should_notify_peers || should_notify_rtnl) { |
2756 | if (!rtnl_trylock()) | 2752 | if (!rtnl_trylock()) |
2757 | return; | 2753 | return; |
2758 | call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); | 2754 | |
2755 | if (should_notify_peers) | ||
2756 | call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, | ||
2757 | bond->dev); | ||
2758 | if (should_notify_rtnl) | ||
2759 | bond_slave_state_notify(bond); | ||
2760 | |||
2759 | rtnl_unlock(); | 2761 | rtnl_unlock(); |
2760 | } | 2762 | } |
2761 | } | 2763 | } |
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 9b280ac8c454..2b0fdec695f7 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h | |||
@@ -335,6 +335,19 @@ static inline void bond_slave_state_change(struct bonding *bond) | |||
335 | } | 335 | } |
336 | } | 336 | } |
337 | 337 | ||
338 | static inline void bond_slave_state_notify(struct bonding *bond) | ||
339 | { | ||
340 | struct list_head *iter; | ||
341 | struct slave *tmp; | ||
342 | |||
343 | bond_for_each_slave(bond, tmp, iter) { | ||
344 | if (tmp->should_notify) { | ||
345 | rtmsg_ifinfo(RTM_NEWLINK, tmp->dev, 0, GFP_KERNEL); | ||
346 | tmp->should_notify = 0; | ||
347 | } | ||
348 | } | ||
349 | } | ||
350 | |||
338 | static inline int bond_slave_state(struct slave *slave) | 351 | static inline int bond_slave_state(struct slave *slave) |
339 | { | 352 | { |
340 | return slave->backup; | 353 | return slave->backup; |