aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorTom Herbert <therbert@google.com>2010-03-16 04:03:29 -0400
committerDavid S. Miller <davem@davemloft.net>2010-03-17 00:23:18 -0400
commit0a9627f2649a02bea165cfd529d7bcb625c2fcad (patch)
treee5d4424b99208c78e2b2fe6ff5a158fc21bdf782 /include/linux
parent768bbedf9ca4cc4784eae2003f37abe0818fe0b0 (diff)
rps: Receive Packet Steering
This patch implements software receive side packet steering (RPS). RPS distributes the load of received packet processing across multiple CPUs. Problem statement: Protocol processing done in the NAPI context for received packets is serialized per device queue and becomes a bottleneck under high packet load. This substantially limits pps that can be achieved on a single queue NIC and provides no scaling with multiple cores. This solution queues packets early on in the receive path on the backlog queues of other CPUs. This allows protocol processing (e.g. IP and TCP) to be performed on packets in parallel. For each device (or each receive queue in a multi-queue device) a mask of CPUs is set to indicate the CPUs that can process packets. A CPU is selected on a per packet basis by hashing contents of the packet header (e.g. the TCP or UDP 4-tuple) and using the result to index into the CPU mask. The IPI mechanism is used to raise networking receive softirqs between CPUs. This effectively emulates in software what a multi-queue NIC can provide, but is generic requiring no device support. Many devices now provide a hash over the 4-tuple on a per packet basis (e.g. the Toeplitz hash). This patch allow drivers to set the HW reported hash in an skb field, and that value in turn is used to index into the RPS maps. Using the HW generated hash can avoid cache misses on the packet when steering it to a remote CPU. The CPU mask is set on a per device and per queue basis in the sysfs variable /sys/class/net/<device>/queues/rx-<n>/rps_cpus. This is a set of canonical bit maps for receive queues in the device (numbered by <n>). If a device does not support multi-queue, a single variable is used for the device (rx-0). Generally, we have found this technique increases pps capabilities of a single queue device with good CPU utilization. Optimal settings for the CPU mask seem to depend on architectures and cache hierarcy. Below are some results running 500 instances of netperf TCP_RR test with 1 byte req. and resp. Results show cumulative transaction rate and system CPU utilization. e1000e on 8 core Intel Without RPS: 108K tps at 33% CPU With RPS: 311K tps at 64% CPU forcedeth on 16 core AMD Without RPS: 156K tps at 15% CPU With RPS: 404K tps at 49% CPU bnx2x on 16 core AMD Without RPS 567K tps at 61% CPU (4 HW RX queues) Without RPS 738K tps at 96% CPU (8 HW RX queues) With RPS: 854K tps at 76% CPU (4 HW RX queues) Caveats: - The benefits of this patch are dependent on architecture and cache hierarchy. Tuning the masks to get best performance is probably necessary. - This patch adds overhead in the path for processing a single packet. In a lightly loaded server this overhead may eliminate the advantages of increased parallelism, and possibly cause some relative performance degradation. We have found that masks that are cache aware (share same caches with the interrupting CPU) mitigate much of this. - The RPS masks can be changed dynamically, however whenever the mask is changed this introduces the possibility of generating out of order packets. It's probably best not change the masks too frequently. Signed-off-by: Tom Herbert <therbert@google.com> include/linux/netdevice.h | 32 ++++- include/linux/skbuff.h | 3 + net/core/dev.c | 335 +++++++++++++++++++++++++++++++++++++-------- net/core/net-sysfs.c | 225 ++++++++++++++++++++++++++++++- net/core/skbuff.c | 2 + 5 files changed, 538 insertions(+), 59 deletions(-) Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/netdevice.h32
-rw-r--r--include/linux/skbuff.h3
2 files changed, 33 insertions, 2 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c79a88be7c33..de1a52bcb9e0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -223,6 +223,7 @@ struct netif_rx_stats {
223 unsigned dropped; 223 unsigned dropped;
224 unsigned time_squeeze; 224 unsigned time_squeeze;
225 unsigned cpu_collision; 225 unsigned cpu_collision;
226 unsigned received_rps;
226}; 227};
227 228
228DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat); 229DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
@@ -530,6 +531,24 @@ struct netdev_queue {
530 unsigned long tx_dropped; 531 unsigned long tx_dropped;
531} ____cacheline_aligned_in_smp; 532} ____cacheline_aligned_in_smp;
532 533
534/*
535 * This structure holds an RPS map which can be of variable length. The
536 * map is an array of CPUs.
537 */
538struct rps_map {
539 unsigned int len;
540 struct rcu_head rcu;
541 u16 cpus[0];
542};
543#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
544
545/* This structure contains an instance of an RX queue. */
546struct netdev_rx_queue {
547 struct rps_map *rps_map;
548 struct kobject kobj;
549 struct netdev_rx_queue *first;
550 atomic_t count;
551} ____cacheline_aligned_in_smp;
533 552
534/* 553/*
535 * This structure defines the management hooks for network devices. 554 * This structure defines the management hooks for network devices.
@@ -878,6 +897,13 @@ struct net_device {
878 897
879 unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ 898 unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
880 899
900 struct kset *queues_kset;
901
902 struct netdev_rx_queue *_rx;
903
904 /* Number of RX queues allocated at alloc_netdev_mq() time */
905 unsigned int num_rx_queues;
906
881 struct netdev_queue rx_queue; 907 struct netdev_queue rx_queue;
882 908
883 struct netdev_queue *_tx ____cacheline_aligned_in_smp; 909 struct netdev_queue *_tx ____cacheline_aligned_in_smp;
@@ -1311,14 +1337,16 @@ static inline int unregister_gifconf(unsigned int family)
1311 */ 1337 */
1312struct softnet_data { 1338struct softnet_data {
1313 struct Qdisc *output_queue; 1339 struct Qdisc *output_queue;
1314 struct sk_buff_head input_pkt_queue;
1315 struct list_head poll_list; 1340 struct list_head poll_list;
1316 struct sk_buff *completion_queue; 1341 struct sk_buff *completion_queue;
1317 1342
1343 /* Elements below can be accessed between CPUs for RPS */
1344 struct call_single_data csd ____cacheline_aligned_in_smp;
1345 struct sk_buff_head input_pkt_queue;
1318 struct napi_struct backlog; 1346 struct napi_struct backlog;
1319}; 1347};
1320 1348
1321DECLARE_PER_CPU(struct softnet_data,softnet_data); 1349DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
1322 1350
1323#define HAVE_NETIF_QUEUE 1351#define HAVE_NETIF_QUEUE
1324 1352
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 03f816a9b659..def10b064f29 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -300,6 +300,7 @@ typedef unsigned char *sk_buff_data_t;
300 * @nfct_reasm: netfilter conntrack re-assembly pointer 300 * @nfct_reasm: netfilter conntrack re-assembly pointer
301 * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c 301 * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
302 * @skb_iif: ifindex of device we arrived on 302 * @skb_iif: ifindex of device we arrived on
303 * @rxhash: the packet hash computed on receive
303 * @queue_mapping: Queue mapping for multiqueue devices 304 * @queue_mapping: Queue mapping for multiqueue devices
304 * @tc_index: Traffic control index 305 * @tc_index: Traffic control index
305 * @tc_verd: traffic control verdict 306 * @tc_verd: traffic control verdict
@@ -375,6 +376,8 @@ struct sk_buff {
375#endif 376#endif
376#endif 377#endif
377 378
379 __u32 rxhash;
380
378 kmemcheck_bitfield_begin(flags2); 381 kmemcheck_bitfield_begin(flags2);
379 __u16 queue_mapping:16; 382 __u16 queue_mapping:16;
380#ifdef CONFIG_IPV6_NDISC_NODETYPE 383#ifdef CONFIG_IPV6_NDISC_NODETYPE