drivers/cpufreq/speedstep-ich.c

/*
 * (C) 2001  Dave Jones, Arjan van de ven.
 * (C) 2002 - 2003  Dominik Brodowski <linux@brodo.de>
 *
 *  Licensed under the terms of the GNU GPL License version 2.
 *  Based upon reverse engineered information, and on Intel documentation
 *  for chipsets ICH2-M and ICH3-M.
 *
 *  Many thanks to Ducrot Bruno for finding and fixing the last
 *  "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
 *  for extensive testing.
 *
 *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
 */


/*********************************************************************
 *                        SPEEDSTEP - DEFINITIONS                    *
 *********************************************************************/

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/pci.h>
#include <linux/sched.h>

#include <asm/cpu_device_id.h>

#include "speedstep-lib.h"


/* speedstep_chipset:
 *   It is necessary to know which chipset is used. As accesses to
 * this device occur at various places in this module, we need a
 * static struct pci_dev * pointing to that device.
 */
static struct pci_dev *speedstep_chipset_dev;


/* speedstep_processor
 *   The SpeedStep-capable processor type, as detected at module init.
 */
static enum speedstep_processor speedstep_processor;

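/* I/O base of the chipset's ACPI PM block (PMBASE, bit 0 masked off). */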
static u32 pmbase;

/*
 *   There are only two frequency states for each processor. Values
 * are in kHz for the time being.
 */
static struct cpufreq_frequency_table speedstep_freqs[] = {
	{SPEEDSTEP_HIGH,	0},
	{SPEEDSTEP_LOW,		0},
	{0,			CPUFREQ_TABLE_END},
};
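
/*
 * SPEEDSTEP_HIGH and SPEEDSTEP_LOW (0 and 1 in speedstep-lib.h) double
 * as the table index and as the state bit written to the chipset; the
 * .frequency fields are filled in at init time by speedstep_get_freqs().
 */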


/**
 * speedstep_find_register - read the PMBASE address
 *
 * Returns: -ENODEV if no register could be found
 */
static int speedstep_find_register(void)
{
	if (!speedstep_chipset_dev)
		return -ENODEV;

	/*
	 * Get PMBASE: config dword 0x40 of the LPC bridge holds the base
	 * of the ACPI power-management I/O block.  Bit 0 is the I/O-space
	 * indicator and must read as 1; it is masked off below to obtain
	 * the raw port base.
	 */
	pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
	if (!(pmbase & 0x01)) {
		printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
		return -ENODEV;
	}

	pmbase &= 0xFFFFFFFE;
	if (!pmbase) {
		printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
		return -ENODEV;
	}

	pr_debug("pmbase is 0x%x\n", pmbase);
	return 0;
}

/**
 * speedstep_set_state - set the SpeedStep state
 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
 *
 *   Tries to change the SpeedStep state.  Can be called from
 *   smp_call_function_single.
 */
static void speedstep_set_state(unsigned int state)
{
	u8 pm2_blk;
	u8 value;
	unsigned long flags;
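
	/*
	 * Transition protocol (reverse engineered; see the header above):
	 * bit 0 of the byte at PMBASE + 0x50 selects the SpeedStep state.
	 * While it is flipped, bus-master arbitration is disabled through
	 * what matches the ACPI PM2_CNT register (PMBASE + 0x20, bit 0 =
	 * ARB_DIS), so no DMA is in flight during the transition.
	 */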

	if (state > 0x1)
		return;

	/* Disable IRQs */
	local_irq_save(flags);

	/* read state */
	value = inb(pmbase + 0x50);

	pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);

	/* write new state */
	value &= 0xFE;
	value |= state;

	pr_debug("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);

	/* Disable bus master arbitration */
	pm2_blk = inb(pmbase + 0x20);
	pm2_blk |= 0x01;
	outb(pm2_blk, (pmbase + 0x20));

	/* Actual transition */
	outb(value, (pmbase + 0x50));

	/* Restore bus master arbitration */
	pm2_blk &= 0xfe;
	outb(pm2_blk, (pmbase + 0x20));

	/* check if transition was successful */
	value = inb(pmbase + 0x50);

	/* Enable IRQs */
	local_irq_restore(flags);

	pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);

	if (state == (value & 0x1))
		pr_debug("change to %u MHz succeeded\n",
			speedstep_get_frequency(speedstep_processor) / 1000);
	else
		printk(KERN_ERR "cpufreq: change failed - I/O error\n");

	return;
}

/* Wrapper for smp_call_function_single. */
static void _speedstep_set_state(void *_state)
{
	speedstep_set_state(*(unsigned int *)_state);
}

/**
 * speedstep_activate - activate SpeedStep control in the chipset
 *
 *   Tries to activate the SpeedStep status and control registers.
 * Returns -EINVAL on an unsupported chipset, and zero on success.
 */
static int speedstep_activate(void)
{
	u16 value = 0;

	if (!speedstep_chipset_dev)
		return -EINVAL;

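	/*
	 * Config word 0x00A0 of the LPC bridge appears to correspond to
	 * GEN_PMCON_1 in the ICH datasheets; bit 3 gates the SpeedStep
	 * status/control logic, so set it if it is clear.
	 */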
	pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
	if (!(value & 0x08)) {
		value |= 0x08;
		pr_debug("activating SpeedStep (TM) registers\n");
		pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
	}

	return 0;
}


/**
 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
 *
 *   Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
 * the LPC bridge / PM module which contains all power-management
 * functions. Returns the ICH version (2, 3 or 4) for the detected
 * chipset, or zero on failure.
 */
static unsigned int speedstep_detect_chipset(void)
{
	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
			      PCI_DEVICE_ID_INTEL_82801DB_12,
			      PCI_ANY_ID, PCI_ANY_ID,
			      NULL);
	if (speedstep_chipset_dev)
		return 4; /* 4-M */

	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
			      PCI_DEVICE_ID_INTEL_82801CA_12,
			      PCI_ANY_ID, PCI_ANY_ID,
			      NULL);
	if (speedstep_chipset_dev)
		return 3; /* 3-M */


	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
			      PCI_DEVICE_ID_INTEL_82801BA_10,
			      PCI_ANY_ID, PCI_ANY_ID,
			      NULL);
	if (speedstep_chipset_dev) {
		/* speedstep.c causes lockups on Dell Inspirons 8000 and
		 * 8100 which use a pretty old revision of the 82815
		 * host bridge. Abort on these systems.
		 */
		static struct pci_dev *hostbridge;

		hostbridge  = pci_get_subsys(PCI_VENDOR_ID_INTEL,
			      PCI_DEVICE_ID_INTEL_82815_MC,
			      PCI_ANY_ID, PCI_ANY_ID,
			      NULL);

		if (!hostbridge)
			return 2; /* 2-M */

		if (hostbridge->revision < 5) {
			pr_debug("hostbridge does not support speedstep\n");
			speedstep_chipset_dev = NULL;
			pci_dev_put(hostbridge);
			return 0;
		}

		pci_dev_put(hostbridge);
		return 2; /* 2-M */
	}

	return 0;
}

static void get_freq_data(void *_speed)
{
	unsigned int *speed = _speed;

	*speed = speedstep_get_frequency(speedstep_processor);
}

static unsigned int speedstep_get(unsigned int cpu)
{
	unsigned int speed;

	/* You're supposed to ensure CPU is online. */
	if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
		BUG();

	pr_debug("detected %u kHz as current frequency\n", speed);
	return speed;
}

/**
 * speedstep_target - set a new CPUFreq policy
 * @policy: new policy
 * @target_freq: the target frequency
 * @relation: how that frequency relates to achieved frequency
 *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
 *
 * Sets a new CPUFreq policy.
 */
static int speedstep_target(struct cpufreq_policy *policy,
			     unsigned int target_freq,
			     unsigned int relation)
{
	unsigned int newstate = 0, policy_cpu;
	struct cpufreq_freqs freqs;
	int i;

	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
				target_freq, relation, &newstate))
		return -EINVAL;

	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
	freqs.old = speedstep_get(policy_cpu);
	freqs.new = speedstep_freqs[newstate].frequency;
	freqs.cpu = policy->cpu;

	pr_debug("transiting from %u to %u kHz\n", freqs.old, freqs.new);

	/* no transition necessary */
	if (freqs.old == freqs.new)
		return 0;

	for_each_cpu(i, policy->cpus) {
		freqs.cpu = i;
		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
	}

	smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
				 true);

	for_each_cpu(i, policy->cpus) {
		freqs.cpu = i;
		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
	}

	return 0;
}


/**
 * speedstep_verify - verifies a new CPUFreq policy
 * @policy: new policy
 *
 * Limits must be within the detected low and high frequencies, with
 * at least one border included.
 */
static int speedstep_verify(struct cpufreq_policy *policy)
{
	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
}

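/*
 * speedstep_get_freqs() must run on a CPU covered by the policy; this
 * struct carries its result back out of the smp_call_function_single()
 * callback.
 */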
struct get_freqs {
	struct cpufreq_policy *policy;
	int ret;
};

static void get_freqs_on_cpu(void *_get_freqs)
{
	struct get_freqs *get_freqs = _get_freqs;

	get_freqs->ret =
		speedstep_get_freqs(speedstep_processor,
			    &speedstep_freqs[SPEEDSTEP_LOW].frequency,
			    &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
			    &get_freqs->policy->cpuinfo.transition_latency,
			    &speedstep_set_state);
}

static int speedstep_cpu_init(struct cpufreq_policy *policy)
{
	int result;
	unsigned int policy_cpu, speed;
	struct get_freqs gf;

	/* only run on CPU to be set, or on its sibling */
#ifdef CONFIG_SMP
	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
#endif
	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);

	/* detect low and high frequency and transition latency */
	gf.policy = policy;
	smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
	if (gf.ret)
		return gf.ret;

	/* get current speed setting */
	speed = speedstep_get(policy_cpu);
	if (!speed)
		return -EIO;

	pr_debug("currently at %s speed setting - %i MHz\n",
		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
		? "low" : "high",
		(speed / 1000));

	/* cpuinfo and default policy values */
	policy->cur = speed;

	result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
	if (result)
		return result;

	cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);

	return 0;
}


static int speedstep_cpu_exit(struct cpufreq_policy *policy)
{
	cpufreq_frequency_table_put_attr(policy->cpu);
	return 0;
}

static struct freq_attr *speedstep_attr[] = {
	&cpufreq_freq_attr_scaling_available_freqs,
	NULL,
};


static struct cpufreq_driver speedstep_driver = {
	.name	= "speedstep-ich",
	.verify	= speedstep_verify,
	.target	= speedstep_target,
	.init	= speedstep_cpu_init,
	.exit	= speedstep_cpu_exit,
	.get	= speedstep_get,
	.owner	= THIS_MODULE,
	.attr	= speedstep_attr,
};

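/*
 * Processors handled by this driver: family 6 model 8 (Coppermine
 * Pentium III), family 6 model 0xb (Tualatin Pentium III) and
 * family 15 model 2 (Northwood Pentium 4/4-M).
 */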
static const struct x86_cpu_id ss_smi_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0xb, },
	{ X86_VENDOR_INTEL, 6, 0x8, },
	{ X86_VENDOR_INTEL, 15, 2 },
	{}
};
#if 0
/* Autoload or not? Not for now. */
MODULE_DEVICE_TABLE(x86cpu, ss_smi_ids);
#endif

/**
 * speedstep_init - initializes the SpeedStep CPUFreq driver
 *
 *   Initializes the SpeedStep support. Returns -ENODEV on unsupported
 * devices, -EINVAL on problems during initialization, and zero on
 * success.
 */
static int __init speedstep_init(void)
{
	if (!x86_match_cpu(ss_smi_ids))
		return -ENODEV;

	/* detect processor */
	speedstep_processor = speedstep_detect_processor();
	if (!speedstep_processor) {
		pr_debug("Intel(R) SpeedStep(TM) capable processor "
				"not found\n");
		return -ENODEV;
	}

	/* detect chipset */
	if (!speedstep_detect_chipset()) {
		pr_debug("Intel(R) SpeedStep(TM) for this chipset not "
				"(yet) available.\n");
		return -ENODEV;
	}

	/* activate speedstep support */
	if (speedstep_activate()) {
		pci_dev_put(speedstep_chipset_dev);
		return -EINVAL;
	}

	if (speedstep_find_register()) {
		pci_dev_put(speedstep_chipset_dev);
		return -ENODEV;
	}

	return cpufreq_register_driver(&speedstep_driver);
}


/**
 * speedstep_exit - unregisters SpeedStep support
 *
 *   Unregisters SpeedStep support.
 */
static void __exit speedstep_exit(void)
{
	pci_dev_put(speedstep_chipset_dev);
	cpufreq_unregister_driver(&speedstep_driver);
}


MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
		"Dominik Brodowski <linux@brodo.de>");
MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
		"with ICH-M southbridges.");
MODULE_LICENSE("GPL");

module_init(speedstep_init);
module_exit(speedstep_exit);