openvswitch: Per NUMA node flow stats.

Keep kernel flow stats for each NUMA node rather than each (logical) CPU. This avoids using the per-CPU allocator, removes most of the kernel-side OVS locking overhead that otherwise appears at the top of perf reports, and allows OVS to scale better with a higher number of threads.

With 9 handlers and 4 revalidators, the netperf TCP_CRR test flow setup rate doubles on a server with two hyper-threaded physical CPUs (16 logical cores each) compared to the current OVS master. Tested with a non-trivial flow table with a TCP port match rule forcing all new connections with unique port numbers to OVS userspace. The IP addresses are still wildcarded, so the kernel flows are not considered as exact match 5-tuple flows. Flows of this type can be expected to appear in large numbers as the result of more effective wildcarding made possible by improvements in the OVS userspace flow classifier.

Perf results for this test (master):

Events: 305K cycles
+ 8.43% ovs-vswitchd [kernel.kallsyms] [k] mutex_spin_on_owner
+ 5.64% ovs-vswitchd [kernel.kallsyms] [k] __ticket_spin_lock
+ 4.75% ovs-vswitchd ovs-vswitchd [.] find_match_wc
+ 3.32% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_lock
+ 2.61% ovs-vswitchd [kernel.kallsyms] [k] pcpu_alloc_area
+ 2.19% ovs-vswitchd ovs-vswitchd [.] flow_hash_in_minimask_range
+ 2.03% swapper [kernel.kallsyms] [k] intel_idle
+ 1.84% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_unlock
+ 1.64% ovs-vswitchd ovs-vswitchd [.] classifier_lookup
+ 1.58% ovs-vswitchd libc-2.15.so [.] 0x7f4e6
+ 1.07% ovs-vswitchd [kernel.kallsyms] [k] memset
+ 1.03% netperf [kernel.kallsyms] [k] __ticket_spin_lock
+ 0.92% swapper [kernel.kallsyms] [k] __ticket_spin_lock
...

And after this patch:

Events: 356K cycles
+ 6.85% ovs-vswitchd ovs-vswitchd [.] find_match_wc
+ 4.63% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_lock
+ 3.06% ovs-vswitchd [kernel.kallsyms] [k] __ticket_spin_lock
+ 2.81% ovs-vswitchd ovs-vswitchd [.] flow_hash_in_minimask_range
+ 2.51% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_unlock
+ 2.27% ovs-vswitchd ovs-vswitchd [.] classifier_lookup
+ 1.84% ovs-vswitchd libc-2.15.so [.] 0x15d30f
+ 1.74% ovs-vswitchd [kernel.kallsyms] [k] mutex_spin_on_owner
+ 1.47% swapper [kernel.kallsyms] [k] intel_idle
+ 1.34% ovs-vswitchd ovs-vswitchd [.] flow_hash_in_minimask
+ 1.33% ovs-vswitchd ovs-vswitchd [.] rule_actions_unref
+ 1.16% ovs-vswitchd ovs-vswitchd [.] hindex_node_with_hash
+ 1.16% ovs-vswitchd ovs-vswitchd [.] do_xlate_actions
+ 1.09% ovs-vswitchd ovs-vswitchd [.] ofproto_rule_ref
+ 1.01% netperf [kernel.kallsyms] [k] __ticket_spin_lock
...

There is a small increase in kernel spinlock overhead due to the same spinlock being shared between multiple cores of the same physical CPU, but that is barely visible in the netperf TCP_CRR test performance (maybe ~1% performance drop, hard to tell exactly due to variance in the test results), when testing for kernel module throughput (with no userspace activity, handful of kernel flows).

On flow setup, a single stats instance is allocated (for the NUMA node 0). As CPUs from multiple NUMA nodes start updating stats, new NUMA-node specific stats instances are allocated. This allocation on the packet processing code path is made to never block or look for emergency memory pools, minimizing the allocation latency. If the allocation fails, the existing preallocated stats instance is used. Also, if only CPUs from one NUMA-node are updating the preallocated stats instance, no additional stats instances are allocated. This eliminates the need to pre-allocate stats instances that will not be used, also relieving the stats reader from the burden of reading stats that are never used.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
author: Jarno Rajahalme <jrajahalme@nicira.com> 2014-03-27 15:42:54 -0400
committer: Jesse Gross <jesse@nicira.com> 2014-05-16 16:40:29 -0400
commit: 63e7959c4b9bd6f791061c460a22d9ee32ae2240 (patch)
tree: 742342936b83ee5a6f8ee9d859588e3045a1b44c /net/openvswitch/flow_table.c
parent: 23dabf88abb48a866fdb19ee08ebcf1ddd9b1840 (diff)
1 files changed, 35 insertions, 11 deletions
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index aa92da23053d..d8ef37b937bd 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -48,6 +48,7 @@
 #define REHASH_INTERVAL         (10 * 60 * HZ)
 static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
 static u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
@@ -75,7 +76,8 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 struct sw_flow *ovs_flow_alloc(void)
 {
        struct sw_flow *flow;
-        int cpu;
+        struct flow_stats *stats;
+        int node;
        flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
        if (!flow)
@@ -83,17 +85,22 @@ struct sw_flow *ovs_flow_alloc(void)
        flow->sf_acts = NULL;
        flow->mask = NULL;
+        flow->stats_last_writer = NUMA_NO_NODE;
-        flow->stats = alloc_percpu(struct flow_stats);
+        /* Initialize the default stat node. */
-        if (!flow->stats)
+        stats = kmem_cache_alloc_node(flow_stats_cache,
+                                      GFP_KERNEL | __GFP_ZERO, 0);
+        if (!stats)
                goto err;
-        for_each_possible_cpu(cpu) {
+        spin_lock_init(&stats->lock);
-                struct flow_stats *cpu_stats;
+        RCU_INIT_POINTER(flow->stats[0], stats);
+        for_each_node(node)
+                if (node != 0)
+                        RCU_INIT_POINTER(flow->stats[node], NULL);
-                cpu_stats = per_cpu_ptr(flow->stats, cpu);
-                spin_lock_init(&cpu_stats->lock);
-        }
        return flow;
 err:
        kmem_cache_free(flow_cache, flow);
@@ -130,8 +137,13 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 static void flow_free(struct sw_flow *flow)
 {
+        int node;
        kfree((struct sf_flow_acts __force *)flow->sf_acts);
-        free_percpu(flow->stats);
+        for_each_node(node)
+                if (flow->stats[node])
+                        kmem_cache_free(flow_stats_cache,
+                                        (struct flow_stats __force *)flow->stats[node]);
        kmem_cache_free(flow_cache, flow);
 }
@@ -586,16 +598,28 @@ int ovs_flow_init(void)
        BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
        BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
-        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
+        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
-                                        0, NULL);
+                                       + (num_possible_nodes()
+                                          * sizeof(struct flow_stats *)),
+                                       0, 0, NULL);
        if (flow_cache == NULL)
                return -ENOMEM;
+        flow_stats_cache
+                = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+                                    0, SLAB_HWCACHE_ALIGN, NULL);
+        if (flow_stats_cache == NULL) {
+                kmem_cache_destroy(flow_cache);
+                flow_cache = NULL;
+                return -ENOMEM;
+        }
        return 0;
 }
 /* Uninitializes the flow module. */
 void ovs_flow_exit(void)
 {
+        kmem_cache_destroy(flow_stats_cache);
        kmem_cache_destroy(flow_cache);
 }
author	Jarno Rajahalme <jrajahalme@nicira.com>	2014-03-27 15:42:54 -0400
committer	Jesse Gross <jesse@nicira.com>	2014-05-16 16:40:29 -0400
commit	63e7959c4b9bd6f791061c460a22d9ee32ae2240 (patch)
tree	742342936b83ee5a6f8ee9d859588e3045a1b44c /net/openvswitch/flow_table.c
parent	23dabf88abb48a866fdb19ee08ebcf1ddd9b1840 (diff)

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index aa92da23053d..d8ef37b937bd 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -48,6 +48,7 @@
48	#define REHASH_INTERVAL (10 * 60 * HZ)	48	#define REHASH_INTERVAL (10 * 60 * HZ)
49		49
50	static struct kmem_cache *flow_cache;	50	static struct kmem_cache *flow_cache;
		51	struct kmem_cache *flow_stats_cache __read_mostly;
51		52
52	static u16 range_n_bytes(const struct sw_flow_key_range *range)	53	static u16 range_n_bytes(const struct sw_flow_key_range *range)
53	{	54	{
@@ -75,7 +76,8 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
75	struct sw_flow *ovs_flow_alloc(void)	76	struct sw_flow *ovs_flow_alloc(void)
76	{	77	{
77	struct sw_flow *flow;	78	struct sw_flow *flow;
78	int cpu;	79	struct flow_stats *stats;
		80	int node;
79		81
80	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);	82	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
81	if (!flow)	83	if (!flow)
@@ -83,17 +85,22 @@ struct sw_flow *ovs_flow_alloc(void)
83		85
84	flow->sf_acts = NULL;	86	flow->sf_acts = NULL;
85	flow->mask = NULL;	87	flow->mask = NULL;
		88	flow->stats_last_writer = NUMA_NO_NODE;
86		89
87	flow->stats = alloc_percpu(struct flow_stats);	90	/* Initialize the default stat node. */
88	if (!flow->stats)	91	stats = kmem_cache_alloc_node(flow_stats_cache,
		92	GFP_KERNEL | __GFP_ZERO, 0);
		93	if (!stats)
89	goto err;	94	goto err;
90		95
91	for_each_possible_cpu(cpu) {	96	spin_lock_init(&stats->lock);
92	struct flow_stats *cpu_stats;	97
		98	RCU_INIT_POINTER(flow->stats[0], stats);
		99
		100	for_each_node(node)
		101	if (node != 0)
		102	RCU_INIT_POINTER(flow->stats[node], NULL);
93		103
94	cpu_stats = per_cpu_ptr(flow->stats, cpu);
95	spin_lock_init(&cpu_stats->lock);
96	}
97	return flow;	104	return flow;
98	err:	105	err:
99	kmem_cache_free(flow_cache, flow);	106	kmem_cache_free(flow_cache, flow);
@@ -130,8 +137,13 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
130		137
131	static void flow_free(struct sw_flow *flow)	138	static void flow_free(struct sw_flow *flow)
132	{	139	{
		140	int node;
		141
133	kfree((struct sf_flow_acts __force *)flow->sf_acts);	142	kfree((struct sf_flow_acts __force *)flow->sf_acts);
134	free_percpu(flow->stats);	143	for_each_node(node)
		144	if (flow->stats[node])
		145	kmem_cache_free(flow_stats_cache,
		146	(struct flow_stats __force *)flow->stats[node]);
135	kmem_cache_free(flow_cache, flow);	147	kmem_cache_free(flow_cache, flow);
136	}	148	}
137		149
@@ -586,16 +598,28 @@ int ovs_flow_init(void)
586	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));	598	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
587	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));	599	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
588		600
589	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,	601	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
590	0, NULL);	602	+ (num_possible_nodes()
		603	* sizeof(struct flow_stats *)),
		604	0, 0, NULL);
591	if (flow_cache == NULL)	605	if (flow_cache == NULL)
592	return -ENOMEM;	606	return -ENOMEM;
593		607
		608	flow_stats_cache
		609	= kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
		610	0, SLAB_HWCACHE_ALIGN, NULL);
		611	if (flow_stats_cache == NULL) {
		612	kmem_cache_destroy(flow_cache);
		613	flow_cache = NULL;
		614	return -ENOMEM;
		615	}
		616
594	return 0;	617	return 0;
595	}	618	}
596		619
597	/* Uninitializes the flow module. */	620	/* Uninitializes the flow module. */
598	void ovs_flow_exit(void)	621	void ovs_flow_exit(void)
599	{	622	{
		623	kmem_cache_destroy(flow_stats_cache);
600	kmem_cache_destroy(flow_cache);	624	kmem_cache_destroy(flow_cache);
601	}	625	}