diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-10-11 06:22:12 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-10-12 15:35:25 -0400 |
commit | 29b4433d991c88d86ca48a4c1cc33c671475be4b (patch) | |
tree | 2ad21b86aab8193c4533820c40cd31af97a7377f /net/core | |
parent | f0b9f4725180ea58c8da78b3de0b4e0ad180fc2c (diff) |
net: percpu net_device refcount
We tried very hard to remove all possible dev_hold()/dev_put() pairs in
the network stack, using RCU conversions.
There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet).
We can switch to a percpu refcount implementation, now that the dynamic
per_cpu infrastructure is mature. On a 64-CPU machine, this consumes
256 bytes per device (one 4-byte int per possible CPU).
On x86, dev_hold(dev) code:
before:
lock incl 0x280(%ebx)
after:
movl 0x260(%ebx),%eax
incl fs:(%eax)
Stress bench :
(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)
Before:
real 1m1.662s
user 0m14.373s
sys 12m55.960s
After:
real 0m51.179s
user 0m15.329s
sys 10m15.942s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/dev.c | 40 |
1 files changed, 33 insertions, 7 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 193eafaabd88..04972a4783e2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) | |||
5192 | */ | 5192 | */ |
5193 | dev->reg_state = NETREG_DUMMY; | 5193 | dev->reg_state = NETREG_DUMMY; |
5194 | 5194 | ||
5195 | /* initialize the ref count */ | ||
5196 | atomic_set(&dev->refcnt, 1); | ||
5197 | |||
5198 | /* NAPI wants this */ | 5195 | /* NAPI wants this */ |
5199 | INIT_LIST_HEAD(&dev->napi_list); | 5196 | INIT_LIST_HEAD(&dev->napi_list); |
5200 | 5197 | ||
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) | |||
5202 | set_bit(__LINK_STATE_PRESENT, &dev->state); | 5199 | set_bit(__LINK_STATE_PRESENT, &dev->state); |
5203 | set_bit(__LINK_STATE_START, &dev->state); | 5200 | set_bit(__LINK_STATE_START, &dev->state); |
5204 | 5201 | ||
5202 | /* Note : We dont allocate pcpu_refcnt for dummy devices, | ||
5203 | * because users of this 'device' dont need to change | ||
5204 | * its refcount. | ||
5205 | */ | ||
5206 | |||
5205 | return 0; | 5207 | return 0; |
5206 | } | 5208 | } |
5207 | EXPORT_SYMBOL_GPL(init_dummy_netdev); | 5209 | EXPORT_SYMBOL_GPL(init_dummy_netdev); |
@@ -5243,6 +5245,16 @@ out: | |||
5243 | } | 5245 | } |
5244 | EXPORT_SYMBOL(register_netdev); | 5246 | EXPORT_SYMBOL(register_netdev); |
5245 | 5247 | ||
5248 | int netdev_refcnt_read(const struct net_device *dev) | ||
5249 | { | ||
5250 | int i, refcnt = 0; | ||
5251 | |||
5252 | for_each_possible_cpu(i) | ||
5253 | refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); | ||
5254 | return refcnt; | ||
5255 | } | ||
5256 | EXPORT_SYMBOL(netdev_refcnt_read); | ||
5257 | |||
5246 | /* | 5258 | /* |
5247 | * netdev_wait_allrefs - wait until all references are gone. | 5259 | * netdev_wait_allrefs - wait until all references are gone. |
5248 | * | 5260 | * |
@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev); | |||
5257 | static void netdev_wait_allrefs(struct net_device *dev) | 5269 | static void netdev_wait_allrefs(struct net_device *dev) |
5258 | { | 5270 | { |
5259 | unsigned long rebroadcast_time, warning_time; | 5271 | unsigned long rebroadcast_time, warning_time; |
5272 | int refcnt; | ||
5260 | 5273 | ||
5261 | linkwatch_forget_dev(dev); | 5274 | linkwatch_forget_dev(dev); |
5262 | 5275 | ||
5263 | rebroadcast_time = warning_time = jiffies; | 5276 | rebroadcast_time = warning_time = jiffies; |
5264 | while (atomic_read(&dev->refcnt) != 0) { | 5277 | refcnt = netdev_refcnt_read(dev); |
5278 | |||
5279 | while (refcnt != 0) { | ||
5265 | if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { | 5280 | if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { |
5266 | rtnl_lock(); | 5281 | rtnl_lock(); |
5267 | 5282 | ||
@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev) | |||
5288 | 5303 | ||
5289 | msleep(250); | 5304 | msleep(250); |
5290 | 5305 | ||
5306 | refcnt = netdev_refcnt_read(dev); | ||
5307 | |||
5291 | if (time_after(jiffies, warning_time + 10 * HZ)) { | 5308 | if (time_after(jiffies, warning_time + 10 * HZ)) { |
5292 | printk(KERN_EMERG "unregister_netdevice: " | 5309 | printk(KERN_EMERG "unregister_netdevice: " |
5293 | "waiting for %s to become free. Usage " | 5310 | "waiting for %s to become free. Usage " |
5294 | "count = %d\n", | 5311 | "count = %d\n", |
5295 | dev->name, atomic_read(&dev->refcnt)); | 5312 | dev->name, refcnt); |
5296 | warning_time = jiffies; | 5313 | warning_time = jiffies; |
5297 | } | 5314 | } |
5298 | } | 5315 | } |
@@ -5350,7 +5367,7 @@ void netdev_run_todo(void) | |||
5350 | netdev_wait_allrefs(dev); | 5367 | netdev_wait_allrefs(dev); |
5351 | 5368 | ||
5352 | /* paranoia */ | 5369 | /* paranoia */ |
5353 | BUG_ON(atomic_read(&dev->refcnt)); | 5370 | BUG_ON(netdev_refcnt_read(dev)); |
5354 | WARN_ON(rcu_dereference_raw(dev->ip_ptr)); | 5371 | WARN_ON(rcu_dereference_raw(dev->ip_ptr)); |
5355 | WARN_ON(dev->ip6_ptr); | 5372 | WARN_ON(dev->ip6_ptr); |
5356 | WARN_ON(dev->dn_ptr); | 5373 | WARN_ON(dev->dn_ptr); |
@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, | |||
5520 | dev = PTR_ALIGN(p, NETDEV_ALIGN); | 5537 | dev = PTR_ALIGN(p, NETDEV_ALIGN); |
5521 | dev->padded = (char *)dev - (char *)p; | 5538 | dev->padded = (char *)dev - (char *)p; |
5522 | 5539 | ||
5523 | if (dev_addr_init(dev)) | 5540 | dev->pcpu_refcnt = alloc_percpu(int); |
5541 | if (!dev->pcpu_refcnt) | ||
5524 | goto free_tx; | 5542 | goto free_tx; |
5525 | 5543 | ||
5544 | if (dev_addr_init(dev)) | ||
5545 | goto free_pcpu; | ||
5546 | |||
5526 | dev_mc_init(dev); | 5547 | dev_mc_init(dev); |
5527 | dev_uc_init(dev); | 5548 | dev_uc_init(dev); |
5528 | 5549 | ||
@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, | |||
5553 | 5574 | ||
5554 | free_tx: | 5575 | free_tx: |
5555 | kfree(tx); | 5576 | kfree(tx); |
5577 | free_pcpu: | ||
5578 | free_percpu(dev->pcpu_refcnt); | ||
5556 | free_p: | 5579 | free_p: |
5557 | kfree(p); | 5580 | kfree(p); |
5558 | return NULL; | 5581 | return NULL; |
@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev) | |||
5586 | list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) | 5609 | list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) |
5587 | netif_napi_del(p); | 5610 | netif_napi_del(p); |
5588 | 5611 | ||
5612 | free_percpu(dev->pcpu_refcnt); | ||
5613 | dev->pcpu_refcnt = NULL; | ||
5614 | |||
5589 | /* Compatibility with error handling in drivers */ | 5615 | /* Compatibility with error handling in drivers */ |
5590 | if (dev->reg_state == NETREG_UNINITIALIZED) { | 5616 | if (dev->reg_state == NETREG_UNINITIALIZED) { |
5591 | kfree((char *)dev - dev->padded); | 5617 | kfree((char *)dev - dev->padded); |