aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
author: Eric Dumazet <eric.dumazet@gmail.com> 2010-10-11 06:22:12 -0400
committer: David S. Miller <davem@davemloft.net> 2010-10-12 15:35:25 -0400
commit: 29b4433d991c88d86ca48a4c1cc33c671475be4b (patch)
tree: 2ad21b86aab8193c4533820c40cd31af97a7377f /net/core
parent: f0b9f4725180ea58c8da78b3de0b4e0ad180fc2c (diff)
net: percpu net_device refcount
We tried very hard to remove all possible dev_hold()/dev_put() pairs in the network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers, mmap af_packet).

We can switch to a percpu refcount implementation, now that the dynamic per_cpu infrastructure is mature. On a 64-CPU machine, this consumes 256 bytes per device.

On x86, dev_hold(dev) code:

before:
    lock incl 0x280(%ebx)
after:
    movl 0x260(%ebx),%eax
    incl fs:(%eax)

Stress bench: (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE)

Before:
    real 1m1.662s
    user 0m14.373s
    sys 12m55.960s

After:
    real 0m51.179s
    user 0m15.329s
    sys 10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/dev.c 40
1 files changed, 33 insertions, 7 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafaabd88..04972a4783e2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
5192 */ 5192 */
5193 dev->reg_state = NETREG_DUMMY; 5193 dev->reg_state = NETREG_DUMMY;
5194 5194
5195 /* initialize the ref count */
5196 atomic_set(&dev->refcnt, 1);
5197
5198 /* NAPI wants this */ 5195 /* NAPI wants this */
5199 INIT_LIST_HEAD(&dev->napi_list); 5196 INIT_LIST_HEAD(&dev->napi_list);
5200 5197
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
5202 set_bit(__LINK_STATE_PRESENT, &dev->state); 5199 set_bit(__LINK_STATE_PRESENT, &dev->state);
5203 set_bit(__LINK_STATE_START, &dev->state); 5200 set_bit(__LINK_STATE_START, &dev->state);
5204 5201
5202 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5203 * because users of this 'device' dont need to change
5204 * its refcount.
5205 */
5206
5205 return 0; 5207 return 0;
5206} 5208}
5207EXPORT_SYMBOL_GPL(init_dummy_netdev); 5209EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,16 @@ out:
5243} 5245}
5244EXPORT_SYMBOL(register_netdev); 5246EXPORT_SYMBOL(register_netdev);
5245 5247
5248int netdev_refcnt_read(const struct net_device *dev)
5249{
5250 int i, refcnt = 0;
5251
5252 for_each_possible_cpu(i)
5253 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5254 return refcnt;
5255}
5256EXPORT_SYMBOL(netdev_refcnt_read);
5257
5246/* 5258/*
5247 * netdev_wait_allrefs - wait until all references are gone. 5259 * netdev_wait_allrefs - wait until all references are gone.
5248 * 5260 *
@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev);
5257static void netdev_wait_allrefs(struct net_device *dev) 5269static void netdev_wait_allrefs(struct net_device *dev)
5258{ 5270{
5259 unsigned long rebroadcast_time, warning_time; 5271 unsigned long rebroadcast_time, warning_time;
5272 int refcnt;
5260 5273
5261 linkwatch_forget_dev(dev); 5274 linkwatch_forget_dev(dev);
5262 5275
5263 rebroadcast_time = warning_time = jiffies; 5276 rebroadcast_time = warning_time = jiffies;
5264 while (atomic_read(&dev->refcnt) != 0) { 5277 refcnt = netdev_refcnt_read(dev);
5278
5279 while (refcnt != 0) {
5265 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5280 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5266 rtnl_lock(); 5281 rtnl_lock();
5267 5282
@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
5288 5303
5289 msleep(250); 5304 msleep(250);
5290 5305
5306 refcnt = netdev_refcnt_read(dev);
5307
5291 if (time_after(jiffies, warning_time + 10 * HZ)) { 5308 if (time_after(jiffies, warning_time + 10 * HZ)) {
5292 printk(KERN_EMERG "unregister_netdevice: " 5309 printk(KERN_EMERG "unregister_netdevice: "
5293 "waiting for %s to become free. Usage " 5310 "waiting for %s to become free. Usage "
5294 "count = %d\n", 5311 "count = %d\n",
5295 dev->name, atomic_read(&dev->refcnt)); 5312 dev->name, refcnt);
5296 warning_time = jiffies; 5313 warning_time = jiffies;
5297 } 5314 }
5298 } 5315 }
@@ -5350,7 +5367,7 @@ void netdev_run_todo(void)
5350 netdev_wait_allrefs(dev); 5367 netdev_wait_allrefs(dev);
5351 5368
5352 /* paranoia */ 5369 /* paranoia */
5353 BUG_ON(atomic_read(&dev->refcnt)); 5370 BUG_ON(netdev_refcnt_read(dev));
5354 WARN_ON(rcu_dereference_raw(dev->ip_ptr)); 5371 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5355 WARN_ON(dev->ip6_ptr); 5372 WARN_ON(dev->ip6_ptr);
5356 WARN_ON(dev->dn_ptr); 5373 WARN_ON(dev->dn_ptr);
@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5520 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5537 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5521 dev->padded = (char *)dev - (char *)p; 5538 dev->padded = (char *)dev - (char *)p;
5522 5539
5523 if (dev_addr_init(dev)) 5540 dev->pcpu_refcnt = alloc_percpu(int);
5541 if (!dev->pcpu_refcnt)
5524 goto free_tx; 5542 goto free_tx;
5525 5543
5544 if (dev_addr_init(dev))
5545 goto free_pcpu;
5546
5526 dev_mc_init(dev); 5547 dev_mc_init(dev);
5527 dev_uc_init(dev); 5548 dev_uc_init(dev);
5528 5549
@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5553 5574
5554free_tx: 5575free_tx:
5555 kfree(tx); 5576 kfree(tx);
5577free_pcpu:
5578 free_percpu(dev->pcpu_refcnt);
5556free_p: 5579free_p:
5557 kfree(p); 5580 kfree(p);
5558 return NULL; 5581 return NULL;
@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev)
5586 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5609 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5587 netif_napi_del(p); 5610 netif_napi_del(p);
5588 5611
5612 free_percpu(dev->pcpu_refcnt);
5613 dev->pcpu_refcnt = NULL;
5614
5589 /* Compatibility with error handling in drivers */ 5615 /* Compatibility with error handling in drivers */
5590 if (dev->reg_state == NETREG_UNINITIALIZED) { 5616 if (dev->reg_state == NETREG_UNINITIALIZED) {
5591 kfree((char *)dev - dev->padded); 5617 kfree((char *)dev - dev->padded);