path: root/net/core/net-sysfs.c
author     Tom Herbert <therbert@google.com>    2010-03-16 04:03:29 -0400
committer  David S. Miller <davem@davemloft.net>    2010-03-17 00:23:18 -0400
commit     0a9627f2649a02bea165cfd529d7bcb625c2fcad (patch)
tree       e5d4424b99208c78e2b2fe6ff5a158fc21bdf782 /net/core/net-sysfs.c
parent     768bbedf9ca4cc4784eae2003f37abe0818fe0b0 (diff)
rps: Receive Packet Steering

This patch implements software receive side packet steering (RPS). RPS
distributes the load of received packet processing across multiple CPUs.

Problem statement: Protocol processing done in the NAPI context for
received packets is serialized per device queue and becomes a bottleneck
under high packet load. This substantially limits the pps that can be
achieved on a single queue NIC and provides no scaling with multiple
cores.

This solution queues packets early on in the receive path on the backlog
queues of other CPUs. This allows protocol processing (e.g. IP and TCP) to
be performed on packets in parallel. For each device (or each receive
queue in a multi-queue device) a mask of CPUs is set to indicate the CPUs
that can process packets. A CPU is selected on a per packet basis by
hashing the contents of the packet header (e.g. the TCP or UDP 4-tuple)
and using the result to index into the CPU mask. The IPI mechanism is used
to raise networking receive softirqs between CPUs. This effectively
emulates in software what a multi-queue NIC can provide, but is generic,
requiring no device support.

Many devices now provide a hash over the 4-tuple on a per packet basis
(e.g. the Toeplitz hash). This patch allows drivers to set the HW reported
hash in an skb field, and that value in turn is used to index into the RPS
maps. Using the HW generated hash can avoid cache misses on the packet
when steering it to a remote CPU.

The CPU mask is set on a per device and per queue basis in the sysfs
variable /sys/class/net/<device>/queues/rx-<n>/rps_cpus. This is a set of
canonical bit maps for receive queues in the device (numbered by <n>). If
a device does not support multi-queue, a single variable is used for the
device (rx-0).

Generally, we have found this technique increases the pps capabilities of
a single queue device with good CPU utilization. Optimal settings for the
CPU mask seem to depend on architectures and cache hierarchy. Below are
some results running 500 instances of netperf TCP_RR test with 1 byte req.
and resp. Results show cumulative transaction rate and system CPU
utilization.

e1000e on 8 core Intel
   Without RPS: 108K tps at 33% CPU
   With RPS:    311K tps at 64% CPU

forcedeth on 16 core AMD
   Without RPS: 156K tps at 15% CPU
   With RPS:    404K tps at 49% CPU

bnx2x on 16 core AMD
   Without RPS: 567K tps at 61% CPU (4 HW RX queues)
   Without RPS: 738K tps at 96% CPU (8 HW RX queues)
   With RPS:    854K tps at 76% CPU (4 HW RX queues)

Caveats:
- The benefits of this patch are dependent on architecture and cache
  hierarchy. Tuning the masks to get the best performance is probably
  necessary.
- This patch adds overhead in the path for processing a single packet. In
  a lightly loaded server this overhead may eliminate the advantages of
  increased parallelism, and possibly cause some relative performance
  degradation. We have found that masks that are cache aware (share the
  same caches with the interrupting CPU) mitigate much of this.
- The RPS masks can be changed dynamically; however, whenever the mask is
  changed this introduces the possibility of generating out of order
  packets. It's probably best not to change the masks too frequently.

Signed-off-by: Tom Herbert <therbert@google.com>

 include/linux/netdevice.h |   32 ++++-
 include/linux/skbuff.h    |    3 +
 net/core/dev.c            |  335 +++++++++++++++++++++++++++++++++++++--------
 net/core/net-sysfs.c      |  225 ++++++++++++++++++++++++++++++-
 net/core/skbuff.c         |    2 +
 5 files changed, 538 insertions(+), 59 deletions(-)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
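For illustration, the user-space sketch below shows the kind of lookup the
commit message describes: a flow hash computed over the 4-tuple (or
reported by the NIC) indexes into the list of CPUs configured for a queue,
so all packets of one flow land on the same CPU. This is not code from the
patch; the type and function names (example_rps_map, example_select_cpu)
and the modulo fold are illustrative assumptions that only mirror the
per-queue CPU list built by store_rps_map() in the diff below.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for a per-queue RPS map: the CPU ids parsed from
 * the rps_cpus bitmap, in order. Fixed size only for this example. */
struct example_rps_map {
        unsigned int len;       /* number of CPUs enabled for this queue */
        uint16_t cpus[16];      /* CPU ids allowed to process packets */
};

/* Pick a target CPU for a packet from its flow hash (e.g. a Toeplitz hash
 * over the TCP/UDP 4-tuple). Folding the hash into an index keeps every
 * packet of a given flow on one CPU, preserving per-flow ordering. */
static int example_select_cpu(const struct example_rps_map *map, uint32_t hash)
{
        if (map->len == 0)
                return -1;      /* no RPS configured; process locally */
        return map->cpus[hash % map->len];
}

int main(void)
{
        /* Queue allowed to steer to CPUs 0, 2, 4 and 6. */
        struct example_rps_map map = { .len = 4, .cpus = { 0, 2, 4, 6 } };

        printf("hash 0x12345678 -> CPU %d\n",
               example_select_cpu(&map, 0x12345678u));
        printf("hash 0x9abcdef0 -> CPU %d\n",
               example_select_cpu(&map, 0x9abcdef0u));
        return 0;
}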
Diffstat (limited to 'net/core/net-sysfs.c')
-rw-r--r--  net/core/net-sysfs.c  225
1 file changed, 224 insertions(+), 1 deletion(-)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 099c753c4213..7a46343d5ae3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -466,6 +466,216 @@ static struct attribute_group wireless_group = {
 };
 #endif
 
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {
+        struct attribute attr;
+        ssize_t (*show)(struct netdev_rx_queue *queue,
+            struct rx_queue_attribute *attr, char *buf);
+        ssize_t (*store)(struct netdev_rx_queue *queue,
+            struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_rx_queue_attr(_attr) container_of(_attr,             \
+    struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+                                  char *buf)
+{
+        struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+        struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+        if (!attribute->show)
+                return -EIO;
+
+        return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+                                   const char *buf, size_t count)
+{
+        struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+        struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+        if (!attribute->store)
+                return -EIO;
+
+        return attribute->store(queue, attribute, buf, count);
+}
+
+static struct sysfs_ops rx_queue_sysfs_ops = {
+        .show = rx_queue_attr_show,
+        .store = rx_queue_attr_store,
+};
+
+static ssize_t show_rps_map(struct netdev_rx_queue *queue,
+                            struct rx_queue_attribute *attribute, char *buf)
+{
+        struct rps_map *map;
+        cpumask_var_t mask;
+        size_t len = 0;
+        int i;
+
+        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+                return -ENOMEM;
+
+        rcu_read_lock();
+        map = rcu_dereference(queue->rps_map);
+        if (map)
+                for (i = 0; i < map->len; i++)
+                        cpumask_set_cpu(map->cpus[i], mask);
+
+        len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+        if (PAGE_SIZE - len < 3) {
+                rcu_read_unlock();
+                free_cpumask_var(mask);
+                return -EINVAL;
+        }
+        rcu_read_unlock();
+
+        free_cpumask_var(mask);
+        len += sprintf(buf + len, "\n");
+        return len;
+}
+
+static void rps_map_release(struct rcu_head *rcu)
+{
+        struct rps_map *map = container_of(rcu, struct rps_map, rcu);
+
+        kfree(map);
+}
+
+ssize_t store_rps_map(struct netdev_rx_queue *queue,
+                      struct rx_queue_attribute *attribute,
+                      const char *buf, size_t len)
+{
+        struct rps_map *old_map, *map;
+        cpumask_var_t mask;
+        int err, cpu, i;
+        static DEFINE_SPINLOCK(rps_map_lock);
+
+        if (!capable(CAP_NET_ADMIN))
+                return -EPERM;
+
+        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+                return -ENOMEM;
+
+        err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+        if (err) {
+                free_cpumask_var(mask);
+                return err;
+        }
+
+        map = kzalloc(max_t(unsigned,
+            RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+            GFP_KERNEL);
+        if (!map) {
+                free_cpumask_var(mask);
+                return -ENOMEM;
+        }
+
+        i = 0;
+        for_each_cpu_and(cpu, mask, cpu_online_mask)
+                map->cpus[i++] = cpu;
+
+        if (i)
+                map->len = i;
+        else {
+                kfree(map);
+                map = NULL;
+        }
+
+        spin_lock(&rps_map_lock);
+        old_map = queue->rps_map;
+        rcu_assign_pointer(queue->rps_map, map);
+        spin_unlock(&rps_map_lock);
+
+        if (old_map)
+                call_rcu(&old_map->rcu, rps_map_release);
+
+        free_cpumask_var(mask);
+        return len;
+}
+
+static struct rx_queue_attribute rps_cpus_attribute =
+        __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+static struct attribute *rx_queue_default_attrs[] = {
+        &rps_cpus_attribute.attr,
+        NULL
+};
+
+static void rx_queue_release(struct kobject *kobj)
+{
+        struct netdev_rx_queue *queue = to_rx_queue(kobj);
+        struct rps_map *map = queue->rps_map;
+        struct netdev_rx_queue *first = queue->first;
+
+        if (map)
+                call_rcu(&map->rcu, rps_map_release);
+
+        if (atomic_dec_and_test(&first->count))
+                kfree(first);
+}
+
+static struct kobj_type rx_queue_ktype = {
+        .sysfs_ops = &rx_queue_sysfs_ops,
+        .release = rx_queue_release,
+        .default_attrs = rx_queue_default_attrs,
+};
+
+static int rx_queue_add_kobject(struct net_device *net, int index)
+{
+        struct netdev_rx_queue *queue = net->_rx + index;
+        struct kobject *kobj = &queue->kobj;
+        int error = 0;
+
+        kobj->kset = net->queues_kset;
+        error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+            "rx-%u", index);
+        if (error) {
+                kobject_put(kobj);
+                return error;
+        }
+
+        kobject_uevent(kobj, KOBJ_ADD);
+
+        return error;
+}
+
+static int rx_queue_register_kobjects(struct net_device *net)
+{
+        int i;
+        int error = 0;
+
+        net->queues_kset = kset_create_and_add("queues",
+            NULL, &net->dev.kobj);
+        if (!net->queues_kset)
+                return -ENOMEM;
+        for (i = 0; i < net->num_rx_queues; i++) {
+                error = rx_queue_add_kobject(net, i);
+                if (error)
+                        break;
+        }
+
+        if (error)
+                while (--i >= 0)
+                        kobject_put(&net->_rx[i].kobj);
+
+        return error;
+}
+
+static void rx_queue_remove_kobjects(struct net_device *net)
+{
+        int i;
+
+        for (i = 0; i < net->num_rx_queues; i++)
+                kobject_put(&net->_rx[i].kobj);
+        kset_unregister(net->queues_kset);
+}
+
 #endif /* CONFIG_SYSFS */
 
 #ifdef CONFIG_HOTPLUG
@@ -529,6 +739,8 @@ void netdev_unregister_kobject(struct net_device * net)
         if (!net_eq(dev_net(net), &init_net))
                 return;
 
+        rx_queue_remove_kobjects(net);
+
         device_del(dev);
 }
 
@@ -537,6 +749,7 @@ int netdev_register_kobject(struct net_device *net)
 {
         struct device *dev = &(net->dev);
         const struct attribute_group **groups = net->sysfs_groups;
+        int error = 0;
 
         dev->class = &net_class;
         dev->platform_data = net;
@@ -563,7 +776,17 @@ int netdev_register_kobject(struct net_device *net)
         if (!net_eq(dev_net(net), &init_net))
                 return 0;
 
-        return device_add(dev);
+        error = device_add(dev);
+        if (error)
+                return error;
+
+        error = rx_queue_register_kobjects(net);
+        if (error) {
+                device_del(dev);
+                return error;
+        }
+
+        return error;
 }
 
 int netdev_class_create_file(struct class_attribute *class_attr)
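
As a usage sketch (not part of the patch), the snippet below writes a CPU
bitmask to the rps_cpus attribute that store_rps_map() parses with
bitmap_parse(), and reads it back the way show_rps_map() formats it. The
device name eth0 is an assumption for the example; the hex value "f"
enables CPUs 0-3, and the write requires CAP_NET_ADMIN, which
store_rps_map() checks.

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
        char line[64];
        FILE *fp;

        /* Enable RPS steering to CPUs 0-3 for this queue (hex mask "f").
         * Needs CAP_NET_ADMIN, so typically run as root. */
        fp = fopen(path, "w");
        if (!fp) {
                perror("open rps_cpus for write");
                return 1;
        }
        fprintf(fp, "f\n");
        fclose(fp);

        /* Read the mask back; show_rps_map() reports it as a canonical
         * hex cpumask followed by a newline. */
        fp = fopen(path, "r");
        if (!fp) {
                perror("open rps_cpus for read");
                return 1;
        }
        if (fgets(line, sizeof(line), fp))
                printf("rps_cpus = %s", line);
        fclose(fp);
        return 0;
}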