author     Tom Herbert <therbert@google.com>        2010-11-21 08:17:27 -0500
committer  David S. Miller <davem@davemloft.net>    2010-11-24 14:44:20 -0500
commit     1d24eb4815d1e0e8b451ecc546645f8ef1176d4f (patch)
tree       0172e72b9452dc46c4e1043817005979cec022a7 /net/core
parent     3853b5841c01a3f492fe137afaad9c209e5162c6 (diff)
xps: Transmit Packet Steering
This patch implements transmit packet steering (XPS) for multiqueue devices.
XPS selects a transmit queue during packet transmission based on configuration,
by mapping the CPU transmitting the packet to a queue. This is the transmit-side
analogue to RPS: where RPS selects a CPU based on the receive queue, XPS selects
a queue based on the CPU. (There was previously an XPS patch from Eric Dumazet,
but that might more appropriately be called transmit completion steering.)

Each transmit queue can be associated with a number of CPUs which will use the
queue to send packets. This is configured as a CPU mask on a per-queue basis in:

/sys/class/net/eth<n>/queues/tx-<n>/xps_cpus

The mappings are stored per device in an inverted data structure that maps CPUs
to queues. In the netdevice structure this is an array of num_possible_cpus()
structures, where each structure holds an array of queue indexes for the queues
which that CPU can use.

The benefits of XPS are improved locality in the per-queue data structures.
Transmit completions are also more likely to be handled near the sending thread,
which should promote locality back to the socket on free (e.g. UDP). The benefits
of XPS depend on cache hierarchy, application load, and other factors. XPS would
nominally be configured so that a queue is only shared by CPUs which share a
cache; the degenerate configuration would be one queue per CPU.

Below are some benchmark results which show the potential benefit of this patch.
The netperf test runs 500 instances of the netperf TCP_RR test with 1-byte
requests and responses.

bnx2x on 16-core AMD
   XPS (16 queues, 1 TX queue per CPU)   1234K at 100% CPU
   No XPS (16 queues)                     996K at 100% CPU

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core')
-rw-r--r--  net/core/dev.c        |  53
-rw-r--r--  net/core/net-sysfs.c  | 369
-rw-r--r--  net/core/net-sysfs.h  |   3
3 files changed, 417 insertions(+), 8 deletions(-)
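The inverted CPU-to-queue mapping described in the commit message is defined in
include/linux/netdevice.h, which is outside the net/core diffstat shown here. As
a rough sketch of its shape, inferred only from how the fields are used by the
code below (the exact layout, types, and the XPS_* size macros are assumptions,
not part of this diff):

/* Sketch of the XPS mapping structures, inferred from their users in
 * net/core below; the authoritative definitions added to
 * include/linux/netdevice.h by this patch may differ in detail.
 */
struct xps_map {
        unsigned int len;        /* number of valid entries in queues[] */
        unsigned int alloc_len;  /* allocated capacity of queues[] */
        struct rcu_head rcu;     /* deferred free via call_rcu() */
        u16 queues[0];           /* TX queue indexes this CPU may use */
};

struct xps_dev_maps {
        struct rcu_head rcu;
        struct xps_map *cpu_map[0];  /* one (possibly NULL) map per possible CPU */
};

With this layout a transmit-path lookup is a single dependent load:
cpu_map[raw_smp_processor_id()] followed by an index into queues[], as done in
get_xps_queue() below.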
diff --git a/net/core/dev.c b/net/core/dev.c
index 7b17674a29ec..c852f0038a08 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1557,12 +1557,16 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
  */
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 {
+        int rc;
+
         if (txq < 1 || txq > dev->num_tx_queues)
                 return -EINVAL;
 
         if (dev->reg_state == NETREG_REGISTERED) {
                 ASSERT_RTNL();
 
+                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+                                                  txq);
                 if (txq < dev->real_num_tx_queues)
                         qdisc_reset_all_tx_gt(dev, txq);
         }
@@ -2142,6 +2146,44 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
         return queue_index;
 }
 
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_RPS
+        struct xps_dev_maps *dev_maps;
+        struct xps_map *map;
+        int queue_index = -1;
+
+        rcu_read_lock();
+        dev_maps = rcu_dereference(dev->xps_maps);
+        if (dev_maps) {
+                map = rcu_dereference(
+                    dev_maps->cpu_map[raw_smp_processor_id()]);
+                if (map) {
+                        if (map->len == 1)
+                                queue_index = map->queues[0];
+                        else {
+                                u32 hash;
+                                if (skb->sk && skb->sk->sk_hash)
+                                        hash = skb->sk->sk_hash;
+                                else
+                                        hash = (__force u16) skb->protocol ^
+                                            skb->rxhash;
+                                hash = jhash_1word(hash, hashrnd);
+                                queue_index = map->queues[
+                                    ((u64)hash * map->len) >> 32];
+                        }
+                        if (unlikely(queue_index >= dev->real_num_tx_queues))
+                                queue_index = -1;
+                }
+        }
+        rcu_read_unlock();
+
+        return queue_index;
+#else
+        return -1;
+#endif
+}
+
 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb)
 {
@@ -2161,7 +2203,9 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
             queue_index >= dev->real_num_tx_queues) {
                 int old_index = queue_index;
 
-                queue_index = skb_tx_hash(dev, skb);
+                queue_index = get_xps_queue(dev, skb);
+                if (queue_index < 0)
+                        queue_index = skb_tx_hash(dev, skb);
 
                 if (queue_index != old_index && sk) {
                         struct dst_entry *dst =
@@ -5066,6 +5110,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 {
         unsigned int count = dev->num_tx_queues;
         struct netdev_queue *tx;
+        int i;
 
         BUG_ON(count < 1);
 
@@ -5076,6 +5121,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
                 return -ENOMEM;
         }
         dev->_tx = tx;
+
+        for (i = 0; i < count; i++)
+                tx[i].dev = dev;
+
         return 0;
 }
 
@@ -5083,8 +5132,6 @@ static void netdev_init_one_queue(struct net_device *dev,
                                   struct netdev_queue *queue,
                                   void *_unused)
 {
-        queue->dev = dev;
-
         /* Initialize queue lock */
         spin_lock_init(&queue->_xmit_lock);
         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
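The queue index computed in get_xps_queue() above avoids a modulo: the 32-bit
hash is scaled into [0, map->len) with a 64-bit multiply and shift. A minimal
standalone userspace sketch of that arithmetic follows; the hash stream and
constants here are arbitrary stand-ins, not the kernel's jhash or hashrnd.

/* Illustration of ((u64)hash * len) >> 32: maps a 32-bit hash
 * uniformly onto [0, len) without a division.  Not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int pick_queue(uint32_t hash, unsigned int len)
{
        /* (hash / 2^32) * len, truncated: result is always < len */
        return (unsigned int)(((uint64_t)hash * len) >> 32);
}

int main(void)
{
        unsigned int counts[4] = { 0 };
        uint32_t h = 0x9e3779b9;        /* arbitrary seed */
        int i;

        for (i = 0; i < 1 << 20; i++) {
                h = h * 2654435761u + 12345;    /* toy hash stream */
                counts[pick_queue(h, 4)]++;
        }
        for (i = 0; i < 4; i++)
                printf("queue %d: %u\n", i, counts[i]);
        return 0;
}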
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7abeb7ceaa4c..68dbbfdee274 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -772,18 +772,377 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
         return error;
 }
 
-static int rx_queue_register_kobjects(struct net_device *net)
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+        struct attribute attr;
+        ssize_t (*show)(struct netdev_queue *queue,
+            struct netdev_queue_attribute *attr, char *buf);
+        ssize_t (*store)(struct netdev_queue *queue,
+            struct netdev_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_netdev_queue_attr(_attr) container_of(_attr,                \
+    struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+                                      struct attribute *attr, char *buf)
+{
+        struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+        struct netdev_queue *queue = to_netdev_queue(kobj);
+
+        if (!attribute->show)
+                return -EIO;
+
+        return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+                                       struct attribute *attr,
+                                       const char *buf, size_t count)
+{
+        struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+        struct netdev_queue *queue = to_netdev_queue(kobj);
+
+        if (!attribute->store)
+                return -EIO;
+
+        return attribute->store(queue, attribute, buf, count);
+}
+
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+        .show = netdev_queue_attr_show,
+        .store = netdev_queue_attr_store,
+};
+
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
 {
+        struct net_device *dev = queue->dev;
+        int i;
+
+        for (i = 0; i < dev->num_tx_queues; i++)
+                if (queue == &dev->_tx[i])
+                        break;
+
+        BUG_ON(i >= dev->num_tx_queues);
+
+        return i;
+}
+
+
+static ssize_t show_xps_map(struct netdev_queue *queue,
+                            struct netdev_queue_attribute *attribute, char *buf)
+{
+        struct net_device *dev = queue->dev;
+        struct xps_dev_maps *dev_maps;
+        cpumask_var_t mask;
+        unsigned long index;
+        size_t len = 0;
+        int i;
+
+        if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+                return -ENOMEM;
+
+        index = get_netdev_queue_index(queue);
+
+        rcu_read_lock();
+        dev_maps = rcu_dereference(dev->xps_maps);
+        if (dev_maps) {
+                for_each_possible_cpu(i) {
+                        struct xps_map *map =
+                            rcu_dereference(dev_maps->cpu_map[i]);
+                        if (map) {
+                                int j;
+                                for (j = 0; j < map->len; j++) {
+                                        if (map->queues[j] == index) {
+                                                cpumask_set_cpu(i, mask);
+                                                break;
+                                        }
+                                }
+                        }
+                }
+        }
+        rcu_read_unlock();
+
+        len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+        if (PAGE_SIZE - len < 3) {
+                free_cpumask_var(mask);
+                return -EINVAL;
+        }
+
+        free_cpumask_var(mask);
+        len += sprintf(buf + len, "\n");
+        return len;
+}
+
+static void xps_map_release(struct rcu_head *rcu)
+{
+        struct xps_map *map = container_of(rcu, struct xps_map, rcu);
+
+        kfree(map);
+}
+
+static void xps_dev_maps_release(struct rcu_head *rcu)
+{
+        struct xps_dev_maps *dev_maps =
+            container_of(rcu, struct xps_dev_maps, rcu);
+
+        kfree(dev_maps);
+}
+
+static DEFINE_MUTEX(xps_map_mutex);
+
+static ssize_t store_xps_map(struct netdev_queue *queue,
+                             struct netdev_queue_attribute *attribute,
+                             const char *buf, size_t len)
+{
+        struct net_device *dev = queue->dev;
+        cpumask_var_t mask;
+        int err, i, cpu, pos, map_len, alloc_len, need_set;
+        unsigned long index;
+        struct xps_map *map, *new_map;
+        struct xps_dev_maps *dev_maps, *new_dev_maps;
+        int nonempty = 0;
+
+        if (!capable(CAP_NET_ADMIN))
+                return -EPERM;
+
+        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+                return -ENOMEM;
+
+        index = get_netdev_queue_index(queue);
+
+        err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+        if (err) {
+                free_cpumask_var(mask);
+                return err;
+        }
+
+        new_dev_maps = kzalloc(max_t(unsigned,
+            XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL);
+        if (!new_dev_maps) {
+                free_cpumask_var(mask);
+                return -ENOMEM;
+        }
+
+        mutex_lock(&xps_map_mutex);
+
+        dev_maps = dev->xps_maps;
+
+        for_each_possible_cpu(cpu) {
+                new_map = map = dev_maps ? dev_maps->cpu_map[cpu] : NULL;
+
+                if (map) {
+                        for (pos = 0; pos < map->len; pos++)
+                                if (map->queues[pos] == index)
+                                        break;
+                        map_len = map->len;
+                        alloc_len = map->alloc_len;
+                } else
+                        pos = map_len = alloc_len = 0;
+
+                need_set = cpu_isset(cpu, *mask) && cpu_online(cpu);
+
+                if (need_set && pos >= map_len) {
+                        /* Need to add queue to this CPU's map */
+                        if (map_len >= alloc_len) {
+                                alloc_len = alloc_len ?
+                                    2 * alloc_len : XPS_MIN_MAP_ALLOC;
+                                new_map = kzalloc(XPS_MAP_SIZE(alloc_len),
+                                    GFP_KERNEL);
+                                if (!new_map)
+                                        goto error;
+                                new_map->alloc_len = alloc_len;
+                                for (i = 0; i < map_len; i++)
+                                        new_map->queues[i] = map->queues[i];
+                                new_map->len = map_len;
+                        }
+                        new_map->queues[new_map->len++] = index;
+                } else if (!need_set && pos < map_len) {
+                        /* Need to remove queue from this CPU's map */
+                        if (map_len > 1)
+                                new_map->queues[pos] =
+                                    new_map->queues[--new_map->len];
+                        else
+                                new_map = NULL;
+                }
+                new_dev_maps->cpu_map[cpu] = new_map;
+        }
+
+        /* Cleanup old maps */
+        for_each_possible_cpu(cpu) {
+                map = dev_maps ? dev_maps->cpu_map[cpu] : NULL;
+                if (map && new_dev_maps->cpu_map[cpu] != map)
+                        call_rcu(&map->rcu, xps_map_release);
+                if (new_dev_maps->cpu_map[cpu])
+                        nonempty = 1;
+        }
+
+        if (nonempty)
+                rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+        else {
+                kfree(new_dev_maps);
+                rcu_assign_pointer(dev->xps_maps, NULL);
+        }
+
+        if (dev_maps)
+                call_rcu(&dev_maps->rcu, xps_dev_maps_release);
+
+        mutex_unlock(&xps_map_mutex);
+
+        free_cpumask_var(mask);
+        return len;
+
+error:
+        mutex_unlock(&xps_map_mutex);
+
+        if (new_dev_maps)
+                for_each_possible_cpu(i)
+                        kfree(new_dev_maps->cpu_map[i]);
+        kfree(new_dev_maps);
+        free_cpumask_var(mask);
+        return -ENOMEM;
+}
+
+static struct netdev_queue_attribute xps_cpus_attribute =
+    __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+
+static struct attribute *netdev_queue_default_attrs[] = {
+        &xps_cpus_attribute.attr,
+        NULL
+};
+
+static void netdev_queue_release(struct kobject *kobj)
+{
+        struct netdev_queue *queue = to_netdev_queue(kobj);
+        struct net_device *dev = queue->dev;
+        struct xps_dev_maps *dev_maps;
+        struct xps_map *map;
+        unsigned long index;
+        int i, pos, nonempty = 0;
+
+        index = get_netdev_queue_index(queue);
+
+        mutex_lock(&xps_map_mutex);
+        dev_maps = dev->xps_maps;
+
+        if (dev_maps) {
+                for_each_possible_cpu(i) {
+                        map = dev_maps->cpu_map[i];
+                        if (!map)
+                                continue;
+
+                        for (pos = 0; pos < map->len; pos++)
+                                if (map->queues[pos] == index)
+                                        break;
+
+                        if (pos < map->len) {
+                                if (map->len > 1)
+                                        map->queues[pos] =
+                                            map->queues[--map->len];
+                                else {
+                                        RCU_INIT_POINTER(dev_maps->cpu_map[i],
+                                            NULL);
+                                        call_rcu(&map->rcu, xps_map_release);
+                                        map = NULL;
+                                }
+                        }
+                        if (map)
+                                nonempty = 1;
+                }
+
+                if (!nonempty) {
+                        RCU_INIT_POINTER(dev->xps_maps, NULL);
+                        call_rcu(&dev_maps->rcu, xps_dev_maps_release);
+                }
+        }
+
+        mutex_unlock(&xps_map_mutex);
+
+        memset(kobj, 0, sizeof(*kobj));
+        dev_put(queue->dev);
+}
+
+static struct kobj_type netdev_queue_ktype = {
+        .sysfs_ops = &netdev_queue_sysfs_ops,
+        .release = netdev_queue_release,
+        .default_attrs = netdev_queue_default_attrs,
+};
+
+static int netdev_queue_add_kobject(struct net_device *net, int index)
+{
+        struct netdev_queue *queue = net->_tx + index;
+        struct kobject *kobj = &queue->kobj;
+        int error = 0;
+
+        kobj->kset = net->queues_kset;
+        error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+            "tx-%u", index);
+        if (error) {
+                kobject_put(kobj);
+                return error;
+        }
+
+        kobject_uevent(kobj, KOBJ_ADD);
+        dev_hold(queue->dev);
+
+        return error;
+}
+
+int
+netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+{
+        int i;
+        int error = 0;
+
+        for (i = old_num; i < new_num; i++) {
+                error = netdev_queue_add_kobject(net, i);
+                if (error) {
+                        new_num = old_num;
+                        break;
+                }
+        }
+
+        while (--i >= new_num)
+                kobject_put(&net->_tx[i].kobj);
+
+        return error;
+}
+
+static int register_queue_kobjects(struct net_device *net)
+{
+        int error = 0, txq = 0, rxq = 0;
+
         net->queues_kset = kset_create_and_add("queues",
             NULL, &net->dev.kobj);
         if (!net->queues_kset)
                 return -ENOMEM;
-        return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+
+        error = net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+        if (error)
+                goto error;
+        rxq = net->real_num_rx_queues;
+
+        error = netdev_queue_update_kobjects(net, 0,
+                                             net->real_num_tx_queues);
+        if (error)
+                goto error;
+        txq = net->real_num_tx_queues;
+
+        return 0;
+
+error:
+        netdev_queue_update_kobjects(net, txq, 0);
+        net_rx_queue_update_kobjects(net, rxq, 0);
+        return error;
 }
 
-static void rx_queue_remove_kobjects(struct net_device *net)
+static void remove_queue_kobjects(struct net_device *net)
 {
         net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0);
+        netdev_queue_update_kobjects(net, net->real_num_tx_queues, 0);
         kset_unregister(net->queues_kset);
 }
 #endif /* CONFIG_RPS */
@@ -886,7 +1245,7 @@ void netdev_unregister_kobject(struct net_device * net)
         kobject_get(&dev->kobj);
 
 #ifdef CONFIG_RPS
-        rx_queue_remove_kobjects(net);
+        remove_queue_kobjects(net);
 #endif
 
         device_del(dev);
@@ -927,7 +1286,7 @@ int netdev_register_kobject(struct net_device *net)
                 return error;
 
 #ifdef CONFIG_RPS
-        error = rx_queue_register_kobjects(net);
+        error = register_queue_kobjects(net);
         if (error) {
                 device_del(dev);
                 return error;
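store_xps_map() above parses the same hex bitmap format that bitmap_parse()
understands, and show_xps_map() prints the mask with cpumask_scnprintf(), so the
xps_cpus attribute can be driven from userspace with plain file I/O. A hedged
sketch follows; the interface name eth0, queue tx-0, and mask 0xf are
illustrative only, and writing requires CAP_NET_ADMIN as enforced above.

/* Userspace sketch: pin TX queue 0 of a hypothetical "eth0" to CPUs
 * 0-3 by writing a hex CPU mask to xps_cpus, then read it back.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/class/net/eth0/queues/tx-0/xps_cpus";
        char buf[64];
        ssize_t n;
        int fd;

        fd = open(path, O_WRONLY);
        if (fd < 0 || write(fd, "f\n", 2) < 0)  /* CPUs 0-3 -> mask 0xf */
                perror(path);
        if (fd >= 0)
                close(fd);

        fd = open(path, O_RDONLY);
        if (fd >= 0 && (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
                buf[n] = '\0';
                printf("xps_cpus: %s", buf);    /* formatted by show_xps_map() */
        }
        if (fd >= 0)
                close(fd);
        return 0;
}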
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 778e1571548d..25ec2ee57df7 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -6,6 +6,9 @@ int netdev_register_kobject(struct net_device *);
 void netdev_unregister_kobject(struct net_device *);
 #ifdef CONFIG_RPS
 int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+                                 int old_num, int new_num);
+
 #endif
 
 #endif