diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-10-11 06:22:12 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-10-12 15:35:25 -0400 |
commit | 29b4433d991c88d86ca48a4c1cc33c671475be4b (patch) | |
tree | 2ad21b86aab8193c4533820c40cd31af97a7377f | |
parent | f0b9f4725180ea58c8da78b3de0b4e0ad180fc2c (diff) |
net: percpu net_device refcount
We tried very hard to remove all possible dev_hold()/dev_put() pairs in the
network stack, using RCU conversions.
There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet).
We can switch to a percpu refcount implementation, now that the dynamic per_cpu
infrastructure is mature. On a 64-CPU machine, this consumes 256 bytes
per device (one 4-byte int per possible CPU).
On x86, dev_hold(dev) code:
before:
lock incl 0x280(%ebx)
after:
movl 0x260(%ebx),%eax
incl %fs:(%eax)
Stress benchmark:
(Sending 160,000,000 UDP frames,
IP route cache disabled, dual E5540 @ 2.53GHz,
32-bit kernel, FIB_TRIE)
Before:
real 1m1.662s
user 0m14.373s
sys 12m55.960s
After:
real 0m51.179s
user 0m15.329s
sys 10m15.942s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/infiniband/hw/nes/nes_cm.c | 4 | ||||
-rw-r--r-- | drivers/infiniband/hw/nes/nes_verbs.c | 4 | ||||
-rw-r--r-- | include/linux/netdevice.h | 7 | ||||
-rw-r--r-- | net/core/dev.c | 40 |
4 files changed, 41 insertions, 14 deletions
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 61e0efd4ccfb..6220d9d75b58 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c | |||
@@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt) | |||
2701 | nesibdev = nesvnic->nesibdev; | 2701 | nesibdev = nesvnic->nesibdev; |
2702 | 2702 | ||
2703 | nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", | 2703 | nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", |
2704 | atomic_read(&nesvnic->netdev->refcnt)); | 2704 | netdev_refcnt_read(nesvnic->netdev)); |
2705 | 2705 | ||
2706 | if (nesqp->active_conn) { | 2706 | if (nesqp->active_conn) { |
2707 | 2707 | ||
@@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) | |||
2791 | atomic_inc(&cm_accepts); | 2791 | atomic_inc(&cm_accepts); |
2792 | 2792 | ||
2793 | nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", | 2793 | nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", |
2794 | atomic_read(&nesvnic->netdev->refcnt)); | 2794 | netdev_refcnt_read(nesvnic->netdev)); |
2795 | 2795 | ||
2796 | /* allocate the ietf frame and space for private data */ | 2796 | /* allocate the ietf frame and space for private data */ |
2797 | nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, | 2797 | nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, |
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 9046e6675686..546fc22405fe 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c | |||
@@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, | |||
785 | 785 | ||
786 | nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", | 786 | nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", |
787 | nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, | 787 | nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, |
788 | atomic_read(&nesvnic->netdev->refcnt)); | 788 | netdev_refcnt_read(nesvnic->netdev)); |
789 | 789 | ||
790 | err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, | 790 | err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, |
791 | nesadapter->max_pd, &pd_num, &nesadapter->next_pd); | 791 | nesadapter->max_pd, &pd_num, &nesadapter->next_pd); |
@@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, | |||
1416 | /* update the QP table */ | 1416 | /* update the QP table */ |
1417 | nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; | 1417 | nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; |
1418 | nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", | 1418 | nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", |
1419 | atomic_read(&nesvnic->netdev->refcnt)); | 1419 | netdev_refcnt_read(nesvnic->netdev)); |
1420 | 1420 | ||
1421 | return &nesqp->ibqp; | 1421 | return &nesqp->ibqp; |
1422 | } | 1422 | } |
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4160db3721ba..14fbb04c459d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h | |||
@@ -1026,7 +1026,7 @@ struct net_device { | |||
1026 | struct timer_list watchdog_timer; | 1026 | struct timer_list watchdog_timer; |
1027 | 1027 | ||
1028 | /* Number of references to this device */ | 1028 | /* Number of references to this device */ |
1029 | atomic_t refcnt ____cacheline_aligned_in_smp; | 1029 | int __percpu *pcpu_refcnt; |
1030 | 1030 | ||
1031 | /* delayed register/unregister */ | 1031 | /* delayed register/unregister */ |
1032 | struct list_head todo_list; | 1032 | struct list_head todo_list; |
@@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev) | |||
1330 | unregister_netdevice_queue(dev, NULL); | 1330 | unregister_netdevice_queue(dev, NULL); |
1331 | } | 1331 | } |
1332 | 1332 | ||
1333 | extern int netdev_refcnt_read(const struct net_device *dev); | ||
1333 | extern void free_netdev(struct net_device *dev); | 1334 | extern void free_netdev(struct net_device *dev); |
1334 | extern void synchronize_net(void); | 1335 | extern void synchronize_net(void); |
1335 | extern int register_netdevice_notifier(struct notifier_block *nb); | 1336 | extern int register_netdevice_notifier(struct notifier_block *nb); |
@@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void); | |||
1798 | */ | 1799 | */ |
1799 | static inline void dev_put(struct net_device *dev) | 1800 | static inline void dev_put(struct net_device *dev) |
1800 | { | 1801 | { |
1801 | atomic_dec(&dev->refcnt); | 1802 | irqsafe_cpu_dec(*dev->pcpu_refcnt); |
1802 | } | 1803 | } |
1803 | 1804 | ||
1804 | /** | 1805 | /** |
@@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev) | |||
1809 | */ | 1810 | */ |
1810 | static inline void dev_hold(struct net_device *dev) | 1811 | static inline void dev_hold(struct net_device *dev) |
1811 | { | 1812 | { |
1812 | atomic_inc(&dev->refcnt); | 1813 | irqsafe_cpu_inc(*dev->pcpu_refcnt); |
1813 | } | 1814 | } |
1814 | 1815 | ||
1815 | /* Carrier loss detection, dial on demand. The functions netif_carrier_on | 1816 | /* Carrier loss detection, dial on demand. The functions netif_carrier_on |
diff --git a/net/core/dev.c b/net/core/dev.c index 193eafaabd88..04972a4783e2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) | |||
5192 | */ | 5192 | */ |
5193 | dev->reg_state = NETREG_DUMMY; | 5193 | dev->reg_state = NETREG_DUMMY; |
5194 | 5194 | ||
5195 | /* initialize the ref count */ | ||
5196 | atomic_set(&dev->refcnt, 1); | ||
5197 | |||
5198 | /* NAPI wants this */ | 5195 | /* NAPI wants this */ |
5199 | INIT_LIST_HEAD(&dev->napi_list); | 5196 | INIT_LIST_HEAD(&dev->napi_list); |
5200 | 5197 | ||
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) | |||
5202 | set_bit(__LINK_STATE_PRESENT, &dev->state); | 5199 | set_bit(__LINK_STATE_PRESENT, &dev->state); |
5203 | set_bit(__LINK_STATE_START, &dev->state); | 5200 | set_bit(__LINK_STATE_START, &dev->state); |
5204 | 5201 | ||
5202 | /* Note : We dont allocate pcpu_refcnt for dummy devices, | ||
5203 | * because users of this 'device' dont need to change | ||
5204 | * its refcount. | ||
5205 | */ | ||
5206 | |||
5205 | return 0; | 5207 | return 0; |
5206 | } | 5208 | } |
5207 | EXPORT_SYMBOL_GPL(init_dummy_netdev); | 5209 | EXPORT_SYMBOL_GPL(init_dummy_netdev); |
@@ -5243,6 +5245,16 @@ out: | |||
5243 | } | 5245 | } |
5244 | EXPORT_SYMBOL(register_netdev); | 5246 | EXPORT_SYMBOL(register_netdev); |
5245 | 5247 | ||
5248 | int netdev_refcnt_read(const struct net_device *dev) | ||
5249 | { | ||
5250 | int i, refcnt = 0; | ||
5251 | |||
5252 | for_each_possible_cpu(i) | ||
5253 | refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); | ||
5254 | return refcnt; | ||
5255 | } | ||
5256 | EXPORT_SYMBOL(netdev_refcnt_read); | ||
5257 | |||
5246 | /* | 5258 | /* |
5247 | * netdev_wait_allrefs - wait until all references are gone. | 5259 | * netdev_wait_allrefs - wait until all references are gone. |
5248 | * | 5260 | * |
@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev); | |||
5257 | static void netdev_wait_allrefs(struct net_device *dev) | 5269 | static void netdev_wait_allrefs(struct net_device *dev) |
5258 | { | 5270 | { |
5259 | unsigned long rebroadcast_time, warning_time; | 5271 | unsigned long rebroadcast_time, warning_time; |
5272 | int refcnt; | ||
5260 | 5273 | ||
5261 | linkwatch_forget_dev(dev); | 5274 | linkwatch_forget_dev(dev); |
5262 | 5275 | ||
5263 | rebroadcast_time = warning_time = jiffies; | 5276 | rebroadcast_time = warning_time = jiffies; |
5264 | while (atomic_read(&dev->refcnt) != 0) { | 5277 | refcnt = netdev_refcnt_read(dev); |
5278 | |||
5279 | while (refcnt != 0) { | ||
5265 | if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { | 5280 | if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { |
5266 | rtnl_lock(); | 5281 | rtnl_lock(); |
5267 | 5282 | ||
@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev) | |||
5288 | 5303 | ||
5289 | msleep(250); | 5304 | msleep(250); |
5290 | 5305 | ||
5306 | refcnt = netdev_refcnt_read(dev); | ||
5307 | |||
5291 | if (time_after(jiffies, warning_time + 10 * HZ)) { | 5308 | if (time_after(jiffies, warning_time + 10 * HZ)) { |
5292 | printk(KERN_EMERG "unregister_netdevice: " | 5309 | printk(KERN_EMERG "unregister_netdevice: " |
5293 | "waiting for %s to become free. Usage " | 5310 | "waiting for %s to become free. Usage " |
5294 | "count = %d\n", | 5311 | "count = %d\n", |
5295 | dev->name, atomic_read(&dev->refcnt)); | 5312 | dev->name, refcnt); |
5296 | warning_time = jiffies; | 5313 | warning_time = jiffies; |
5297 | } | 5314 | } |
5298 | } | 5315 | } |
@@ -5350,7 +5367,7 @@ void netdev_run_todo(void) | |||
5350 | netdev_wait_allrefs(dev); | 5367 | netdev_wait_allrefs(dev); |
5351 | 5368 | ||
5352 | /* paranoia */ | 5369 | /* paranoia */ |
5353 | BUG_ON(atomic_read(&dev->refcnt)); | 5370 | BUG_ON(netdev_refcnt_read(dev)); |
5354 | WARN_ON(rcu_dereference_raw(dev->ip_ptr)); | 5371 | WARN_ON(rcu_dereference_raw(dev->ip_ptr)); |
5355 | WARN_ON(dev->ip6_ptr); | 5372 | WARN_ON(dev->ip6_ptr); |
5356 | WARN_ON(dev->dn_ptr); | 5373 | WARN_ON(dev->dn_ptr); |
@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, | |||
5520 | dev = PTR_ALIGN(p, NETDEV_ALIGN); | 5537 | dev = PTR_ALIGN(p, NETDEV_ALIGN); |
5521 | dev->padded = (char *)dev - (char *)p; | 5538 | dev->padded = (char *)dev - (char *)p; |
5522 | 5539 | ||
5523 | if (dev_addr_init(dev)) | 5540 | dev->pcpu_refcnt = alloc_percpu(int); |
5541 | if (!dev->pcpu_refcnt) | ||
5524 | goto free_tx; | 5542 | goto free_tx; |
5525 | 5543 | ||
5544 | if (dev_addr_init(dev)) | ||
5545 | goto free_pcpu; | ||
5546 | |||
5526 | dev_mc_init(dev); | 5547 | dev_mc_init(dev); |
5527 | dev_uc_init(dev); | 5548 | dev_uc_init(dev); |
5528 | 5549 | ||
@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, | |||
5553 | 5574 | ||
5554 | free_tx: | 5575 | free_tx: |
5555 | kfree(tx); | 5576 | kfree(tx); |
5577 | free_pcpu: | ||
5578 | free_percpu(dev->pcpu_refcnt); | ||
5556 | free_p: | 5579 | free_p: |
5557 | kfree(p); | 5580 | kfree(p); |
5558 | return NULL; | 5581 | return NULL; |
@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev) | |||
5586 | list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) | 5609 | list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) |
5587 | netif_napi_del(p); | 5610 | netif_napi_del(p); |
5588 | 5611 | ||
5612 | free_percpu(dev->pcpu_refcnt); | ||
5613 | dev->pcpu_refcnt = NULL; | ||
5614 | |||
5589 | /* Compatibility with error handling in drivers */ | 5615 | /* Compatibility with error handling in drivers */ |
5590 | if (dev->reg_state == NETREG_UNINITIALIZED) { | 5616 | if (dev->reg_state == NETREG_UNINITIALIZED) { |
5591 | kfree((char *)dev - dev->padded); | 5617 | kfree((char *)dev - dev->padded); |