/*
* originally based on the dummy device.
*
* Copyright 1999, Thomas Davis, tadavis@lbl.gov.
* Licensed under the GPL. Based on dummy.c, and eql.c devices.
*
* bonding.c: an Ethernet Bonding driver
*
* This is useful to talk to a Cisco EtherChannel compatible equipment:
* Cisco 5500
* Sun Trunking (Solaris)
* Alteon AceDirector Trunks
* Linux Bonding
* and probably many L2 switches ...
*
* How it works:
* ifconfig bond0 ipaddress netmask up
* will setup a network device, with an ip address. No mac address
* will be assigned at this time. The hw mac address will come from
* the first slave bonded to the channel. All slaves will then use
* this hw mac address.
*
* ifconfig bond0 down
* will release all slaves, marking them as down.
*
* ifenslave bond0 eth0
* will attach eth0 to bond0 as a slave. eth0 hw mac address will either
* a: be used as initial mac address
* b: if a hw mac address already is there, eth0's hw mac address
* will then be set from bond0.
*
*/
//#define BONDING_DEBUG 1
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/interrupt.h>
#include <linux/ptrace.h>
#include <linux/ioport.h>
#include <linux/in.h>
#include <net/ip.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/socket.h>
#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/bitops.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/dma.h>
#include <asm/uaccess.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/if_ether.h>
#include <net/arp.h>
#include <linux/mii.h>
#include <linux/ethtool.h>
#include <linux/if_vlan.h>
#include <linux/if_bonding.h>
#include <net/route.h>
#include "bonding.h"
#include "bond_3ad.h"
#include "bond_alb.h"
/*---------------------------- Module parameters ----------------------------*/
/* monitor all links that often (in milliseconds). <=0 disables monitoring */
#define BOND_LINK_MON_INTERV 0
#define BOND_LINK_ARP_INTERV 0
static int max_bonds = BOND_DEFAULT_MAX_BONDS;
static int miimon = BOND_LINK_MON_INTERV;
static int updelay = 0;
static int downdelay = 0;
static int use_carrier = 1;
static char *mode = NULL;
static char *primary = NULL;
static char *lacp_rate = NULL;
static char *xmit_hash_policy = NULL;
static int arp_interval = BOND_LINK_ARP_INTERV;
static char *arp_ip_target[BOND_MAX_ARP_TARGETS] = { NULL, };
static char *arp_validate = NULL;
struct bond_params bonding_defaults;
module_param(max_bonds, int, 0);
MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");
module_param(miimon, int, 0);
MODULE_PARM_DESC(miimon, "Link check interval in milliseconds");
module_param(updelay, int, 0);
MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds");
module_param(downdelay, int, 0);
MODULE_PARM_DESC(downdelay, "Delay before considering link down, "
"in milliseconds");
module_param(use_carrier, int, 0);
MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; "
"0 for off, 1 for on (default)");
module_param(mode, charp, 0);
MODULE_PARM_DESC(mode, "Mode of operation : 0 for balance-rr, "
"1 for active-backup, 2 for balance-xor, "
"3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, "
"6 for balance-alb");
module_param(primary, charp, 0);
MODULE_PARM_DESC(primary, "Primary network device to use");
module_param(lacp_rate, charp, 0);
MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner "
"(slow/fast)");
module_param(xmit_hash_policy, charp, 0);
MODULE_PARM_DESC(xmit_hash_policy, "XOR hashing method: 0 for layer 2 (default)"
", 1 for layer 3+4");
module_param(arp_interval, int, 0);
MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
module_param_array(arp_ip_target, charp, NULL, 0);
MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form");
module_param(arp_validate, charp, 0);
MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes: none (default), active, backup or all");
/*----------------------------- Global variables ----------------------------*/
static const char * const version =
DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n";
LIST_HEAD(bond_dev_list);
#ifdef CONFIG_PROC_FS
static struct proc_dir_entry *bond_proc_dir = NULL;
#endif
extern struct rw_semaphore bonding_rwsem;
static u32 arp_target[BOND_MAX_ARP_TARGETS] = { 0, } ;
static int arp_ip_count = 0;
static int bond_mode = BOND_MODE_ROUNDROBIN;
static int xmit_hashtype= BOND_XMIT_POLICY_LAYER2;
static int lacp_fast = 0;
struct bond_parm_tbl bond_lacp_tbl[] = {
{ "slow", AD_LACP_SLOW},
{ "fast", AD_LACP_FAST},
{ NULL, -1},
};
struct bond_parm_tbl bond_mode_tbl[] = {
{ "balance-rr", BOND_MODE_ROUNDROBIN},
{ "active-backup", BOND_MODE_ACTIVEBACKUP},
{ "balance-xor", BOND_MODE_XOR},
{ "broadcast", BOND_MODE_BROADCAST},
{ "802.3ad", BOND_MODE_8023AD},
{ "balance-tlb", BOND_MODE_TLB},
{ "balance-alb", BOND_MODE_ALB},
{ NULL, -1},
};
struct bond_parm_tbl xmit_hashtype_tbl[] = {
{ "layer2", BOND_XMIT_POLICY_LAYER2},
{ "layer3+4", BOND_XMIT_POLICY_LAYER34},
{ NULL, -1},
};
struct bond_parm_tbl arp_validate_tbl[] = {
{ "none", BOND_ARP_VALIDATE_NONE},
{ "active", BOND_ARP_VALIDATE_ACTIVE},
{ "backup", BOND_ARP_VALIDATE_BACKUP},
{ "all", BOND_ARP_VALIDATE_ALL},
{ NULL, -1},
};
/*-------------------------- Forward declarations ---------------------------*/
static void bond_send_gratuitous_arp(struct bonding *bond);
/*---------------------------- General routines -----------------------------*/
const char *bond_mode_name(int mode)
{
switch (mode) {
case BOND_MODE_ROUNDROBIN :
return "load balancing (round-robin)";
case BOND_MODE_ACTIVEBACKUP :
return "fault-tolerance (active-backup)";
case BOND_MODE_XOR :
return "load balancing (xor)";
case BOND_MODE_BROADCAST :
return "fault-tolerance (broadcast)";
case BOND_MODE_8023AD:
return "IEEE 802.3ad Dynamic link aggregation";
case BOND_MODE_TLB:
return "transmit load balancing";
case BOND_MODE_ALB:
return "adaptive load balancing";
default:
return "unknown";
}
}
/*---------------------------------- VLAN -----------------------------------*/
/**
* bond_add_vlan - add a new vlan id on bond
* @bond: bond that got the notification
* @vlan_id: the vlan id to add
*
* Returns -ENOMEM if allocation failed.
*/
static int bond_add_vlan(struct bonding *bond, unsigned short vlan_id)
{
struct vlan_entry *vlan;
dprintk("bond: %s, vlan id %d\n",
(bond ? bond->dev->name: "None"), vlan_id);
vlan = kmalloc(sizeof(struct vlan_entry), GFP_KERNEL);
if (!vlan) {
return -ENOMEM;
}
INIT_LIST_HEAD(&vlan->vlan_list);
vlan->vlan_id = vlan_id;
vlan->vlan_ip = 0;
write_lock_bh(&bond->lock);
list_add_tail(&vlan->vlan_list, &bond->vlan_list);
write_unlock_bh(&bond->lock);
dprintk("added VLAN ID %d on bond %s\n", vlan_id, bond->dev->name);
return 0;
}
/**
* bond_del_vlan - delete a vlan id from bond
* @bond: bond that got the notification
* @vlan_id: the vlan id to delete
*
* returns -ENODEV if @vlan_id was not found in @bond.
*/
static int bond_del_vlan(struct bonding *bond, unsigned short vlan_id)
{
struct vlan_entry *vlan, *next;
int res = -ENODEV;
dprintk("bond: %s, vlan id %d\n", bond->dev->name, vlan_id);
write_lock_bh(&bond->lock);
list_for_each_entry_safe(vlan, next, &bond->vlan_list, vlan_list) {
if (vlan->vlan_id == vlan_id) {
list_del(&vlan->vlan_list);
if ((bond->params.mode == BOND_MODE_TLB) ||
(bond->params.mode == BOND_MODE_ALB)) {
bond_alb_clear_vlan(bond, vlan_id);
}
dprintk("removed VLAN ID %d from bond %s\n", vlan_id,
bond->dev->name);
kfree(vlan);
if (list_empty(&bond->vlan_list) &&
(bond->slave_cnt == 0)) {
/* Last VLAN removed and no slaves, so
* restore block on adding VLANs. This will
* be removed once new slaves that are not
* VLAN challenged will be added.
*/
bond->dev->features |= NETIF_F_VLAN_CHALLENGED;
}
res = 0;
goto out;
}
}
dprintk("couldn't find VLAN ID %d in bond %s\n", vlan_id,
bond->dev->name);
out:
write_unlock_bh(&bond->lock);
return res;
}
/**
* bond_has_challenged_slaves
* @bond: the bond we're working on
*
* Searches the slave list. Returns 1 if a vlan challenged slave
* was found, 0 otherwise.
*
* Assumes bond->lock is held.
*/
static int bond_has_challenged_slaves(struct bonding *bond)
{
struct slave *slave;
int i;
bond_for_each_slave(bond, slave, i) {
if (slave->dev->features & NETIF_F_VLAN_CHALLENGED) {
dprintk("found VLAN challenged slave - %s\n",
slave->dev->name);
return 1;
}
}
dprintk("no VLAN challenged slaves found\n");
return 0;
}
/**
* bond_next_vlan - safely skip to the next item in the vlans list.
* @bond: the bond we're working on
* @curr: item we're advancing from
*
* Returns %NULL if list is empty, bond->next_vlan if @curr is %NULL,
* or @curr->next otherwise (even if it is @curr itself again).
*
* Caller must hold bond->lock
*/
struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr)
{
struct vlan_entry *next, *last;
if (list_empty(&bond->vlan_list)) {
return NULL;
}
if (!curr) {
next = list_entry(bond->vlan_list.next,
struct vlan_entry, vlan_list);
} else {
last = list_entry(bond->vlan_list.prev,
struct vlan_entry, vlan_list);
if (last == curr) {
next = list_entry(bond->vlan_list.next,
struct vlan_entry, vlan_list);
} else {
next = list_entry(curr->vlan_list.next,
struct vlan_entry, vlan_list);
}
}
return next;
}
/**
* bond_dev_queue_xmit - Prepare skb for xmit.
*
* @bond: bond device that got this skb for tx.
* @skb: hw accel VLAN tagged skb to transmit
* @slave_dev: slave that is supposed to xmit this skbuff
*
* When the bond gets an skb to transmit that is
* already hardware accelerated VLAN tagged, and it
* needs to relay this skb to a slave that is not
* hw accel capable, the skb needs to be "unaccelerated",
* i.e. strip the hwaccel tag and re-insert it as part
* of the payload.
*/
int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev)
{
unsigned short vlan_id;
if (!list_empty(&bond->vlan_list) &&
!(slave_dev->features & NETIF_F_HW_VLAN_TX) &&
vlan_get_tag(skb, &vlan_id) == 0) {
skb->dev = slave_dev;
skb = vlan_put_tag(skb, vlan_id);
if (!skb) {
/* vlan_put_tag() frees the skb in case of error,
* so return success here so the calling functions
* won't attempt to free is again.
*/
return 0;
}
} else {
skb->dev = slave_dev;
}
skb->priority = 1;
dev_queue_xmit(skb);
return 0;
}
/*
* In the following 3 functions, bond_vlan_rx_register(), bond_vlan_rx_add_vid
* and bond_vlan_rx_kill_vid, We don't protect the slave list iteration with a
* lock because:
* a. This operation is performed in IOCTL context,
* b. The operation is protected by the RTNL semaphore in the 8021q code,
* c. Holding a lock with BH disabled while directly calling a base driver
* entry point is generally a BAD idea.
*
* The design of synchronization/protection for this operation in the 8021q
* module is good for one or more VLAN devices over a single physical device
* and cannot be extended for a teaming solution like bonding, so there is a
* potential race condition here where a net device from the vlan group might
* be referenced (either by a base driver or the 8021q code) while it is being
* removed from the system. However, it turns out we're not making matters
* worse, and if it works for regular VLAN usage it will work here too.
*/
/**
* bond_vlan_rx_register - Propagates registration to slaves
* @bond_dev: bonding net device that got called
* @grp: vlan group being registered
*/
static void bond_vlan_rx_register(struct net_device *bond_dev, struct vlan_group *grp)
{
struct bonding *bond = bond_dev->priv;
struct slave *slave;
int i;
bond->vlgrp = grp;
bond_for_each_slave(bond, slave, i) {
struct net_device *slave_dev = slave->dev;
if ((slave_dev->features & NETIF_F_HW_VLAN_RX) &&
slave_dev->vlan_rx_register) {
slave_dev->vlan_rx_register(slave_dev, grp);
}
}
}
/**
* bond_vlan_rx_add_vid - Propagates adding an id to slaves
* @bond_dev: bonding net device that got called
* @vid: vlan id being added
*/
static void bond_vlan_rx_add_vid(struct net_device *bond_dev, uint16_t vid)
{
struct bonding *bond = bond_dev->priv;
struct slave *slave;
int i, res;
bond_for_each_slave(bond, slave, i) {
struct net_device *slave_dev = slave->dev;
if ((slave_dev->features & NETIF_F_HW_VLAN_FILTER) &&
slave_dev->vlan_rx_add_vid) {
slave_dev->vlan_rx_add_vid(slave_dev, vid);
}
}
res = bond_add_vlan(bond, vid);
if (res) {
printk(KERN_ERR DRV_NAME
": %s: Error: Failed to add vlan id %d\n",
bond_dev->name, vid);
}
}
/**
* bond_vlan_rx_kill_vid - Propagates deleting an id to slaves
* @bond_dev: bonding net device that got called
* @vid: vlan id being removed
*/
static void bond_vlan_rx_kill_vid(struct net_device *bond_dev, uint16_t vid)
{
struct bonding *bond = bond_dev->priv;
struct slave *slave;
struct net_device *vlan_dev;
int i, res;
bond_for_each_slave(bond, slave, i) {
struct net_device *slave_dev = slave->dev;
if ((slave_dev->features & NETIF_F_HW_VLAN_FILTER) &&
slave_dev->vlan_rx_kill_vid) {
/* Save and then restore vlan_dev in the grp array,
* since the slave's driver might clear it.
*/
vlan_dev = vlan_group_get_device(bond->vlgrp, vid);
slave_dev->vlan_rx_kill_vid(slave_dev, vid);
vlan_group_set_device(bond->vlgrp, vid, vlan_dev);
}
}
res = bond_del_vlan(bond, vid);
if (res) {
printk(KERN_ERR DRV_NAME
": %s: Error: Failed to remove vlan id %d\n",
bond_dev->name, vid);
}
}
static void bond_add_vlans_on_slave(struct bonding *bond, struct net_device *slave_dev)
{
struct vlan_entry *vlan;
write_lock_bh(&bond->lock);
if (list_empty(&bond->vlan_list)) {
goto out;
}
if ((slave_dev->features & NETIF_F_HW_VLAN_RX) &&
slave_dev->vlan_rx_register) {
slave_dev->vlan_rx_register(slave_dev, bond->vlgrp);
}
if (!(slave_dev->features & NETIF_F_HW_VLAN_FILTER) ||
!(slave_dev->vlan_rx_add_vid)) {
goto out;
}
list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
slave_dev->vlan_rx_add_vid(slave_dev, vlan->vlan_id);
}
out:
write_unlock_bh(&bond->lock);
}
static void bond_del_vlans_from_slave(struct bonding *bond, struct net_device *slave_dev)
{
struct vlan_entry *vlan;
struct net_device *vlan_dev;
write_lock_bh(&bond->lock);
if (list_empty(&bond->vlan_list)) {
goto out;
}
if (!(slave_dev->features & NETIF_F_HW_VLAN_FILTER) ||
!(slave_dev->vlan_rx_kill_vid)) {
goto unreg;
}
list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
/* Save and then restore vlan_dev in the grp array,
* since the slave's driver might clear it.
*/
vlan_dev = vlan_group_get_device(bond->vlgrp, vlan->vlan_id);
slave_dev->vlan_rx_kill_vid(slave_dev, vlan->vlan_id);
vlan_group_set_device(bond->vlgrp, vlan->vlan_id, vlan_dev);
}
unreg:
if ((slave_dev->features & NETIF_F_HW_VLAN_RX) &&
slave_dev->vlan_rx_register) {
slave_dev->vlan_rx_register(slave_dev, NULL);
}
out:
write_unlock_bh(&bond->lock);
}
/*------------------------------- Link status -------------------------------*/
/*
* Set the carrier state for the master according to the state of its
* slaves. If any slaves are up, the master is up. In 802.3ad mode,
* do special 802.3ad magic.
*
* Returns zero if carrier state does not change, nonzero if it does.
*/
static int bond_set_carrier(struct bonding *bond)
{
struct slave *slave;
int i;
if (bond->slave_cnt == 0)
goto down;
if (bond->params.mode == BOND_MODE_8023AD)
return bond_3ad_set_carrier(bond);
bond_for_each_slave(bond, slave, i) {
if (slave->link == BOND_LINK_UP) {
if (!netif_carrier_ok(bond->dev)) {
netif_carrier_on(bond->dev);
return 1;
}
return 0;
}
}
down:
if (netif_carrier_ok(bond->dev)) {
netif_carrier_off(bond->dev);
return 1;
}
return 0;
}
/*
* Get link speed and duplex from the slave's base driver
* using ethtool. If for some reason the call fails or the
* values are invalid, fake speed and duplex to 100/Full
* and return error.
*/
static int bond_update_speed_duplex(struct slave *slave)
{
struct net_device *slave_dev = slave->dev;
static int (* ioctl)(struct net_device *, struct ifreq *, int);
struct ifreq ifr;
struct ethtool_cmd etool;
/* Fake speed and duplex */
slave->speed = SPEED_100;
slave->duplex = DUPLEX_FULL;
if (slave_dev->ethtool_ops) {
int res;
if (!slave_dev->ethtool_ops->get_settings) {
return -1;
}
res = slave_dev->ethtool_ops->get_settings(slave_dev, &etool);
if (res < 0) {
return -1;
}
goto verify;
}
ioctl = slave_dev->do_ioctl;
strncpy(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
etool.cmd = ETHTOOL_GSET;
ifr.ifr_data = (char*)&etool;
if (!ioctl || (IOCTL(slave_dev, &ifr, SIOCETHTOOL) < 0)) {
return -1;
}
verify:
switch (etool.speed) {
case SPEED_10:
case SPEED_100:
case SPEED_1000:
case SPEED_10000:
break;
default:
return -1;
}
switch (etool.duplex) {
case DUPLEX_FULL:
case DUPLEX_HALF:
break;
default:
return -1;
}
slave->speed = etool.speed;
slave->duplex = etool.duplex;
return 0;
}
/*
* if <dev> supports MII link status reporting, check its link status.
*
* We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(),
* depening upon the setting of the use_carrier parameter.
*
* Return either BMSR_LSTATUS, meaning that the link is up (or we
* can't tell and just pretend it is), or 0, meaning that the link is
* down.
*
* If reporting is non-zero, instead of faking link up, return -1 if
* both ETHTOOL and MII ioctls fail (meaning the device does not
* support them). If use_carrier is set, return whatever it says.
* It'd be nice if there was a good way to tell if a driver supports
* netif_carrier, but there really isn't.
*/
static int bond_check_dev_link(struct bonding *bond, struct net_device *slave_dev, int reporting)
{
static int (* ioctl)(struct net_device *, struct ifreq *, int);
struct ifreq ifr;
struct mii_ioctl_data *mii;
struct ethtool_value etool;
if (bond->params.use_carrier) {
return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
}
ioctl = slave_dev->do_ioctl;
if (ioctl) {
/* TODO: set pointer to correct ioctl on a per team member */
/* bases to make this more efficient. that is, once */
/* we determine the correct ioctl, we will always */
/* call it and not the others for that team */
/* member. */
/*
* We cannot assume that SIOCGMIIPHY will also read a
* register; not all network drivers (e.g., e100)
* support that.
*/
/* Yes, the mii is overlaid on the ifreq.ifr_ifru */
strncpy(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
mii = if_mii(&ifr);
if (IOCTL(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
mii->reg_num = MII_BMSR;
if (IOCTL(slave_dev, &ifr, SIOCGMIIREG) == 0) {
return (mii->val_out & BMSR_LSTATUS);
}
}
}
/* try SIOCETHTOOL ioctl, some drivers cache ETHTOOL_GLINK */
/* for a period of time so we attempt to get link status */
/* from it last if the above MII ioctls fail... */
if (slave_dev->ethtool_ops) {
if (slave_dev->ethtool_ops->get_link) {
u32 link;
link = slave_dev->ethtool_ops->get_link(slave_dev);
return link ? BMSR_LSTATUS : 0;
}
}
if (ioctl) {