aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eric Dumazet <eric.dumazet@gmail.com> 2010-10-11 06:22:12 -0400
committer: David S. Miller <davem@davemloft.net> 2010-10-12 15:35:25 -0400
commit: 29b4433d991c88d86ca48a4c1cc33c671475be4b (patch)
tree: 2ad21b86aab8193c4533820c40cd31af97a7377f
parent: f0b9f4725180ea58c8da78b3de0b4e0ad180fc2c (diff)
net: percpu net_device refcount
We tried very hard to remove all possible dev_hold()/dev_put() pairs in
network stack, using RCU conversions.

There is still an unavoidable device refcount change for every dst we
create/destroy, and this can slow down some workloads (routers or some
app servers, mmap af_packet).

We can switch to a percpu refcount implementation, now dynamic per_cpu
infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes
per device.

On x86, dev_hold(dev) code :

before:
        lock    incl 0x280(%ebx)
after:
        movl    0x260(%ebx),%eax
        incl    fs:(%eax)

Stress bench :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE)

Before:

real    1m1.662s
user    0m14.373s
sys     12m55.960s

After:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/infiniband/hw/nes/nes_cm.c 4
-rw-r--r-- drivers/infiniband/hw/nes/nes_verbs.c 4
-rw-r--r-- include/linux/netdevice.h 7
-rw-r--r-- net/core/dev.c 40
4 files changed, 41 insertions, 14 deletions
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 61e0efd4ccfb..6220d9d75b58 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -2701,7 +2701,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
2701 nesibdev = nesvnic->nesibdev; 2701 nesibdev = nesvnic->nesibdev;
2702 2702
2703 nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", 2703 nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
2704 atomic_read(&nesvnic->netdev->refcnt)); 2704 netdev_refcnt_read(nesvnic->netdev));
2705 2705
2706 if (nesqp->active_conn) { 2706 if (nesqp->active_conn) {
2707 2707
@@ -2791,7 +2791,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
2791 atomic_inc(&cm_accepts); 2791 atomic_inc(&cm_accepts);
2792 2792
2793 nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n", 2793 nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
2794 atomic_read(&nesvnic->netdev->refcnt)); 2794 netdev_refcnt_read(nesvnic->netdev));
2795 2795
2796 /* allocate the ietf frame and space for private data */ 2796 /* allocate the ietf frame and space for private data */
2797 nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev, 2797 nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 9046e6675686..546fc22405fe 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -785,7 +785,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
785 785
786 nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n", 786 nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
787 nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context, 787 nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
788 atomic_read(&nesvnic->netdev->refcnt)); 788 netdev_refcnt_read(nesvnic->netdev));
789 789
790 err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, 790 err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
791 nesadapter->max_pd, &pd_num, &nesadapter->next_pd); 791 nesadapter->max_pd, &pd_num, &nesadapter->next_pd);
@@ -1416,7 +1416,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
1416 /* update the QP table */ 1416 /* update the QP table */
1417 nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp; 1417 nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
1418 nes_debug(NES_DBG_QP, "netdev refcnt=%u\n", 1418 nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
1419 atomic_read(&nesvnic->netdev->refcnt)); 1419 netdev_refcnt_read(nesvnic->netdev));
1420 1420
1421 return &nesqp->ibqp; 1421 return &nesqp->ibqp;
1422} 1422}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4160db3721ba..14fbb04c459d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1026,7 +1026,7 @@ struct net_device {
1026 struct timer_list watchdog_timer; 1026 struct timer_list watchdog_timer;
1027 1027
1028 /* Number of references to this device */ 1028 /* Number of references to this device */
1029 atomic_t refcnt ____cacheline_aligned_in_smp; 1029 int __percpu *pcpu_refcnt;
1030 1030
1031 /* delayed register/unregister */ 1031 /* delayed register/unregister */
1032 struct list_head todo_list; 1032 struct list_head todo_list;
@@ -1330,6 +1330,7 @@ static inline void unregister_netdevice(struct net_device *dev)
1330 unregister_netdevice_queue(dev, NULL); 1330 unregister_netdevice_queue(dev, NULL);
1331} 1331}
1332 1332
1333extern int netdev_refcnt_read(const struct net_device *dev);
1333extern void free_netdev(struct net_device *dev); 1334extern void free_netdev(struct net_device *dev);
1334extern void synchronize_net(void); 1335extern void synchronize_net(void);
1335extern int register_netdevice_notifier(struct notifier_block *nb); 1336extern int register_netdevice_notifier(struct notifier_block *nb);
@@ -1798,7 +1799,7 @@ extern void netdev_run_todo(void);
1798 */ 1799 */
1799static inline void dev_put(struct net_device *dev) 1800static inline void dev_put(struct net_device *dev)
1800{ 1801{
1801 atomic_dec(&dev->refcnt); 1802 irqsafe_cpu_dec(*dev->pcpu_refcnt);
1802} 1803}
1803 1804
1804/** 1805/**
@@ -1809,7 +1810,7 @@ static inline void dev_put(struct net_device *dev)
1809 */ 1810 */
1810static inline void dev_hold(struct net_device *dev) 1811static inline void dev_hold(struct net_device *dev)
1811{ 1812{
1812 atomic_inc(&dev->refcnt); 1813 irqsafe_cpu_inc(*dev->pcpu_refcnt);
1813} 1814}
1814 1815
1815/* Carrier loss detection, dial on demand. The functions netif_carrier_on 1816/* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/core/dev.c b/net/core/dev.c
index 193eafaabd88..04972a4783e2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev)
5192 */ 5192 */
5193 dev->reg_state = NETREG_DUMMY; 5193 dev->reg_state = NETREG_DUMMY;
5194 5194
5195 /* initialize the ref count */
5196 atomic_set(&dev->refcnt, 1);
5197
5198 /* NAPI wants this */ 5195 /* NAPI wants this */
5199 INIT_LIST_HEAD(&dev->napi_list); 5196 INIT_LIST_HEAD(&dev->napi_list);
5200 5197
@@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev)
5202 set_bit(__LINK_STATE_PRESENT, &dev->state); 5199 set_bit(__LINK_STATE_PRESENT, &dev->state);
5203 set_bit(__LINK_STATE_START, &dev->state); 5200 set_bit(__LINK_STATE_START, &dev->state);
5204 5201
5202 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5203 * because users of this 'device' dont need to change
5204 * its refcount.
5205 */
5206
5205 return 0; 5207 return 0;
5206} 5208}
5207EXPORT_SYMBOL_GPL(init_dummy_netdev); 5209EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5243,6 +5245,16 @@ out:
5243} 5245}
5244EXPORT_SYMBOL(register_netdev); 5246EXPORT_SYMBOL(register_netdev);
5245 5247
5248int netdev_refcnt_read(const struct net_device *dev)
5249{
5250 int i, refcnt = 0;
5251
5252 for_each_possible_cpu(i)
5253 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5254 return refcnt;
5255}
5256EXPORT_SYMBOL(netdev_refcnt_read);
5257
5246/* 5258/*
5247 * netdev_wait_allrefs - wait until all references are gone. 5259 * netdev_wait_allrefs - wait until all references are gone.
5248 * 5260 *
@@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev);
5257static void netdev_wait_allrefs(struct net_device *dev) 5269static void netdev_wait_allrefs(struct net_device *dev)
5258{ 5270{
5259 unsigned long rebroadcast_time, warning_time; 5271 unsigned long rebroadcast_time, warning_time;
5272 int refcnt;
5260 5273
5261 linkwatch_forget_dev(dev); 5274 linkwatch_forget_dev(dev);
5262 5275
5263 rebroadcast_time = warning_time = jiffies; 5276 rebroadcast_time = warning_time = jiffies;
5264 while (atomic_read(&dev->refcnt) != 0) { 5277 refcnt = netdev_refcnt_read(dev);
5278
5279 while (refcnt != 0) {
5265 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 5280 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5266 rtnl_lock(); 5281 rtnl_lock();
5267 5282
@@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
5288 5303
5289 msleep(250); 5304 msleep(250);
5290 5305
5306 refcnt = netdev_refcnt_read(dev);
5307
5291 if (time_after(jiffies, warning_time + 10 * HZ)) { 5308 if (time_after(jiffies, warning_time + 10 * HZ)) {
5292 printk(KERN_EMERG "unregister_netdevice: " 5309 printk(KERN_EMERG "unregister_netdevice: "
5293 "waiting for %s to become free. Usage " 5310 "waiting for %s to become free. Usage "
5294 "count = %d\n", 5311 "count = %d\n",
5295 dev->name, atomic_read(&dev->refcnt)); 5312 dev->name, refcnt);
5296 warning_time = jiffies; 5313 warning_time = jiffies;
5297 } 5314 }
5298 } 5315 }
@@ -5350,7 +5367,7 @@ void netdev_run_todo(void)
5350 netdev_wait_allrefs(dev); 5367 netdev_wait_allrefs(dev);
5351 5368
5352 /* paranoia */ 5369 /* paranoia */
5353 BUG_ON(atomic_read(&dev->refcnt)); 5370 BUG_ON(netdev_refcnt_read(dev));
5354 WARN_ON(rcu_dereference_raw(dev->ip_ptr)); 5371 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5355 WARN_ON(dev->ip6_ptr); 5372 WARN_ON(dev->ip6_ptr);
5356 WARN_ON(dev->dn_ptr); 5373 WARN_ON(dev->dn_ptr);
@@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5520 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5537 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5521 dev->padded = (char *)dev - (char *)p; 5538 dev->padded = (char *)dev - (char *)p;
5522 5539
5523 if (dev_addr_init(dev)) 5540 dev->pcpu_refcnt = alloc_percpu(int);
5541 if (!dev->pcpu_refcnt)
5524 goto free_tx; 5542 goto free_tx;
5525 5543
5544 if (dev_addr_init(dev))
5545 goto free_pcpu;
5546
5526 dev_mc_init(dev); 5547 dev_mc_init(dev);
5527 dev_uc_init(dev); 5548 dev_uc_init(dev);
5528 5549
@@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5553 5574
5554free_tx: 5575free_tx:
5555 kfree(tx); 5576 kfree(tx);
5577free_pcpu:
5578 free_percpu(dev->pcpu_refcnt);
5556free_p: 5579free_p:
5557 kfree(p); 5580 kfree(p);
5558 return NULL; 5581 return NULL;
@@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev)
5586 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5609 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5587 netif_napi_del(p); 5610 netif_napi_del(p);
5588 5611
5612 free_percpu(dev->pcpu_refcnt);
5613 dev->pcpu_refcnt = NULL;
5614
5589 /* Compatibility with error handling in drivers */ 5615 /* Compatibility with error handling in drivers */
5590 if (dev->reg_state == NETREG_UNINITIALIZED) { 5616 if (dev->reg_state == NETREG_UNINITIALIZED) {
5591 kfree((char *)dev - dev->padded); 5617 kfree((char *)dev - dev->padded);