rfs: Receive Flow Steering

This patch implements receive flow steering (RFS). RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running. RFS is an extension of Receive Packet Steering (RPS). The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash, which is stored in the socket structure. The rxhash is passed in skbs received on the connection from netif_receive_skb. For each received packet, the associated rxhash is used to look up the CPU in the hash table; if a valid CPU is set then the packet is steered to that CPU using the RPS mechanisms. The complication with this simple approach is that it would potentially allow out-of-order (OOO) packets. If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets -- we consider this a non-starter. To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table. rps_sock_flow_table is a global hash table. Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above. This table contains the "desired" CPUs for flows. rps_dev_flow_table is specific to each device queue. Each entry contains a CPU and a tail queue counter. The CPU is the "current" CPU for a matching flow. The tail queue counter holds the value of a tail queue counter for the associated CPU's backlog queue at the time of last enqueue for a flow matching the entry. Each backlog queue has a queue head counter which is incremented on dequeue, and so a queue tail counter is computed as queue head count + queue length. When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu) the rps_sock_flow_table and the rps_dev_flow_table for the RX queue are consulted. When the desired CPU for the flow (found in the rps_sock_flow_table) does not match the current CPU (found in the rps_dev_flow_table), the current CPU is changed to the desired CPU if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- The current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the rps_dev_flow_table. This checks if the queue head has advanced beyond the last packet that was enqueued using this table entry. This guarantees that all packets queued using this entry have been dequeued, thus preserving in-order delivery.
Making each queue have its own rps_dev_flow_table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to the interrupting CPU is good for locality. 2) this allows lockless access to the table -- the CPU number and queue tail counter need to be accessed together under mutual exclusion from netif_receive_skb; we assume that this is only called from device napi_poll, which is non-reentrant. This patch implements RFS for TCP and connected UDP sockets. It should be usable for other flow-oriented protocols. There are two configuration parameters for RFS. The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table; the per-rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow_table for the rxqueue. Both are rounded up to a power of two. The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the application's processing; this can result in increased performance (higher pps, lower latency). The benefits of RFS are dependent on cache hierarchy, application load, and other factors. On simple benchmarks, we don't necessarily see improvement and sometimes see degradation.
However, for more complex benchmarks and for applications where cache pressure is much higher this technique seems to perform very well. Below are some benchmark results which show the potential benefit of this patch. The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp. The RPC test is a request/response test similar in structure to the netperf RR test with 100 threads on each host, but does more work in userspace than netperf.

e1000e on 8 core Intel
   No RFS or RPS              104K tps at 30% CPU
   No RFS (best RPS config):  290K tps at 63% CPU
   RFS                        303K tps at 61% CPU

RPC test        tps     CPU%    50/90/99% usec latency  Latency StdDev
   No RFS/RPS   103K    48%     757/900/3185            4472.35
   RPS only:    174K    73%     415/993/2468            491.66
   RFS          223K    73%     379/651/1382            315.61

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
author: Tom Herbert <therbert@google.com> 2010-04-16 19:01:27 -0400
committer: David S. Miller <davem@davemloft.net> 2010-04-16 19:01:27 -0400
commit: fec5e652e58fa6017b2c9e06466cb2a6538de5b4 (patch)
tree: e034f2a1e7930a0a225bd30896f834ec5e09c084 /net/core/net-sysfs.c
parent: b5d43998234331b9c01bd2165fdbb25115f4387f (diff)
1 files changed, 91 insertions, 3 deletions
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 96ed6905b823..143052a22b9b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -17,6 +17,7 @@
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/wireless.h>
+#include <linux/vmalloc.h>
 #include <net/wext.h>
 #include "net-sysfs.h"
@@ -601,22 +602,109 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
        return len;
 }
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+                                           struct rx_queue_attribute *attr,
+                                           char *buf)
+{
+        struct rps_dev_flow_table *flow_table;
+        unsigned int val = 0;
+        rcu_read_lock();
+        flow_table = rcu_dereference(queue->rps_flow_table);
+        if (flow_table)
+                val = flow_table->mask + 1;
+        rcu_read_unlock();
+        return sprintf(buf, "%u\n", val);
+}
+static void rps_dev_flow_table_release_work(struct work_struct *work)
+{
+        struct rps_dev_flow_table *table = container_of(work,
+            struct rps_dev_flow_table, free_work);
+        vfree(table);
+}
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+        struct rps_dev_flow_table *table = container_of(rcu,
+            struct rps_dev_flow_table, rcu);
+        INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
+        schedule_work(&table->free_work);
+}
+ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+                                     struct rx_queue_attribute *attr,
+                                     const char *buf, size_t len)
+{
+        unsigned int count;
+        char *endp;
+        struct rps_dev_flow_table *table, *old_table;
+        static DEFINE_SPINLOCK(rps_dev_flow_lock);
+        if (!capable(CAP_NET_ADMIN))
+                return -EPERM;
+        count = simple_strtoul(buf, &endp, 0);
+        if (endp == buf)
+                return -EINVAL;
+        if (count) {
+                int i;
+                if (count > 1<<30) {
+                        /* Enforce a limit to prevent overflow */
+                        return -EINVAL;
+                }
+                count = roundup_pow_of_two(count);
+                table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+                if (!table)
+                        return -ENOMEM;
+                table->mask = count - 1;
+                for (i = 0; i < count; i++)
+                        table->flows[i].cpu = RPS_NO_CPU;
+        } else
+                table = NULL;
+        spin_lock(&rps_dev_flow_lock);
+        old_table = queue->rps_flow_table;
+        rcu_assign_pointer(queue->rps_flow_table, table);
+        spin_unlock(&rps_dev_flow_lock);
+        if (old_table)
+                call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+        return len;
+}
 static struct rx_queue_attribute rps_cpus_attribute =
        __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+        __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+            show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
 static struct attribute *rx_queue_default_attrs[] = {
        &rps_cpus_attribute.attr,
+        &rps_dev_flow_table_cnt_attribute.attr,
        NULL
 };
 static void rx_queue_release(struct kobject *kobj)
 {
        struct netdev_rx_queue *queue = to_rx_queue(kobj);
-        struct rps_map *map = queue->rps_map;
        struct netdev_rx_queue *first = queue->first;
-        if (map)
+        if (queue->rps_map)
-                call_rcu(&map->rcu, rps_map_release);
+                call_rcu(&queue->rps_map->rcu, rps_map_release);
+        if (queue->rps_flow_table)
+                call_rcu(&queue->rps_flow_table->rcu,
+                    rps_dev_flow_table_release);
        if (atomic_dec_and_test(&first->count))
                kfree(first);
author	Tom Herbert <therbert@google.com>	2010-04-16 19:01:27 -0400
committer	David S. Miller <davem@davemloft.net>	2010-04-16 19:01:27 -0400
commit	fec5e652e58fa6017b2c9e06466cb2a6538de5b4 (patch)
tree	e034f2a1e7930a0a225bd30896f834ec5e09c084 /net/core/net-sysfs.c
parent	b5d43998234331b9c01bd2165fdbb25115f4387f (diff)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 96ed6905b823..143052a22b9b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c
@@ -17,6 +17,7 @@
17	#include <net/sock.h>	17	#include <net/sock.h>
18	#include <linux/rtnetlink.h>	18	#include <linux/rtnetlink.h>
19	#include <linux/wireless.h>	19	#include <linux/wireless.h>
		20	#include <linux/vmalloc.h>
20	#include <net/wext.h>	21	#include <net/wext.h>
21		22
22	#include "net-sysfs.h"	23	#include "net-sysfs.h"
@@ -601,22 +602,109 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
601	return len;	602	return len;
602	}	603	}
603		604
		605	static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
		606	struct rx_queue_attribute *attr,
		607	char *buf)
		608	{
		609	struct rps_dev_flow_table *flow_table;
		610	unsigned int val = 0;
		611
		612	rcu_read_lock();
		613	flow_table = rcu_dereference(queue->rps_flow_table);
		614	if (flow_table)
		615	val = flow_table->mask + 1;
		616	rcu_read_unlock();
		617
		618	return sprintf(buf, "%u\n", val);
		619	}
		620
		621	static void rps_dev_flow_table_release_work(struct work_struct *work)
		622	{
		623	struct rps_dev_flow_table *table = container_of(work,
		624	struct rps_dev_flow_table, free_work);
		625
		626	vfree(table);
		627	}
		628
		629	static void rps_dev_flow_table_release(struct rcu_head *rcu)
		630	{
		631	struct rps_dev_flow_table *table = container_of(rcu,
		632	struct rps_dev_flow_table, rcu);
		633
		634	INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
		635	schedule_work(&table->free_work);
		636	}
		637
		638	ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
		639	struct rx_queue_attribute *attr,
		640	const char *buf, size_t len)
		641	{
		642	unsigned int count;
		643	char *endp;
		644	struct rps_dev_flow_table *table, *old_table;
		645	static DEFINE_SPINLOCK(rps_dev_flow_lock);
		646
		647	if (!capable(CAP_NET_ADMIN))
		648	return -EPERM;
		649
		650	count = simple_strtoul(buf, &endp, 0);
		651	if (endp == buf)
		652	return -EINVAL;
		653
		654	if (count) {
		655	int i;
		656
		657	if (count > 1<<30) {
		658	/* Enforce a limit to prevent overflow */
		659	return -EINVAL;
		660	}
		661	count = roundup_pow_of_two(count);
		662	table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
		663	if (!table)
		664	return -ENOMEM;
		665
		666	table->mask = count - 1;
		667	for (i = 0; i < count; i++)
		668	table->flows[i].cpu = RPS_NO_CPU;
		669	} else
		670	table = NULL;
		671
		672	spin_lock(&rps_dev_flow_lock);
		673	old_table = queue->rps_flow_table;
		674	rcu_assign_pointer(queue->rps_flow_table, table);
		675	spin_unlock(&rps_dev_flow_lock);
		676
		677	if (old_table)
		678	call_rcu(&old_table->rcu, rps_dev_flow_table_release);
		679
		680	return len;
		681	}
		682
604	static struct rx_queue_attribute rps_cpus_attribute =	683	static struct rx_queue_attribute rps_cpus_attribute =
605	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);	684	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
606		685
		686
		687	static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
		688	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
		689	show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
		690
607	static struct attribute *rx_queue_default_attrs[] = {	691	static struct attribute *rx_queue_default_attrs[] = {
608	&rps_cpus_attribute.attr,	692	&rps_cpus_attribute.attr,
		693	&rps_dev_flow_table_cnt_attribute.attr,
609	NULL	694	NULL
610	};	695	};
611		696
612	static void rx_queue_release(struct kobject *kobj)	697	static void rx_queue_release(struct kobject *kobj)
613	{	698	{
614	struct netdev_rx_queue *queue = to_rx_queue(kobj);	699	struct netdev_rx_queue *queue = to_rx_queue(kobj);
615	struct rps_map *map = queue->rps_map;
616	struct netdev_rx_queue *first = queue->first;	700	struct netdev_rx_queue *first = queue->first;
617		701
618	if (map)	702	if (queue->rps_map)
619	call_rcu(&map->rcu, rps_map_release);	703	call_rcu(&queue->rps_map->rcu, rps_map_release);
		704
		705	if (queue->rps_flow_table)
		706	call_rcu(&queue->rps_flow_table->rcu,
		707	rps_dev_flow_table_release);
620		708
621	if (atomic_dec_and_test(&first->count))	709	if (atomic_dec_and_test(&first->count))
622	kfree(first);	710	kfree(first);