aboutsummaryrefslogtreecommitdiffstats
path: root/net/openvswitch/flow_table.c
diff options
context:
space:
mode:
author Jarno Rajahalme <jrajahalme@nicira.com> 2014-03-27 15:42:54 -0400
committer Jesse Gross <jesse@nicira.com> 2014-05-16 16:40:29 -0400
commit 63e7959c4b9bd6f791061c460a22d9ee32ae2240 (patch)
tree 742342936b83ee5a6f8ee9d859588e3045a1b44c /net/openvswitch/flow_table.c
parent 23dabf88abb48a866fdb19ee08ebcf1ddd9b1840 (diff)
openvswitch: Per NUMA node flow stats.
Keep kernel flow stats for each NUMA node rather than each (logical)
CPU. This avoids using the per-CPU allocator and removes most of the
kernel-side OVS locking overhead otherwise on the top of perf reports
and allows OVS to scale better with higher number of threads.

With 9 handlers and 4 revalidators netperf TCP_CRR test flow setup
rate doubles on a server with two hyper-threaded physical CPUs (16
logical cores each) compared to the current OVS master. Tested with
non-trivial flow table with a TCP port match rule forcing all new
connections with unique port numbers to OVS userspace. The IP
addresses are still wildcarded, so the kernel flows are not considered
as exact match 5-tuple flows. This type of flows can be expected to
appear in large numbers as the result of more effective wildcarding
made possible by improvements in OVS userspace flow classifier.

Perf results for this test (master):

Events: 305K cycles
+   8.43%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
+   5.64%  ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
+   4.75%  ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
+   3.32%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
+   2.61%  ovs-vswitchd  [kernel.kallsyms]   [k] pcpu_alloc_area
+   2.19%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
+   2.03%  swapper       [kernel.kallsyms]   [k] intel_idle
+   1.84%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
+   1.64%  ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
+   1.58%  ovs-vswitchd  libc-2.15.so        [.] 0x7f4e6
+   1.07%  ovs-vswitchd  [kernel.kallsyms]   [k] memset
+   1.03%  netperf       [kernel.kallsyms]   [k] __ticket_spin_lock
+   0.92%  swapper       [kernel.kallsyms]   [k] __ticket_spin_lock
...

And after this patch:

Events: 356K cycles
+   6.85%  ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
+   4.63%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
+   3.06%  ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
+   2.81%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
+   2.51%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
+   2.27%  ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
+   1.84%  ovs-vswitchd  libc-2.15.so        [.] 0x15d30f
+   1.74%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
+   1.47%  swapper       [kernel.kallsyms]   [k] intel_idle
+   1.34%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask
+   1.33%  ovs-vswitchd  ovs-vswitchd        [.] rule_actions_unref
+   1.16%  ovs-vswitchd  ovs-vswitchd        [.] hindex_node_with_hash
+   1.16%  ovs-vswitchd  ovs-vswitchd        [.] do_xlate_actions
+   1.09%  ovs-vswitchd  ovs-vswitchd        [.] ofproto_rule_ref
+   1.01%  netperf       [kernel.kallsyms]   [k] __ticket_spin_lock
...

There is a small increase in kernel spinlock overhead due to the same
spinlock being shared between multiple cores of the same physical CPU,
but that is barely visible in the netperf TCP_CRR test performance
(maybe ~1% performance drop, hard to tell exactly due to variance in
the test results), when testing for kernel module throughput (with no
userspace activity, handful of kernel flows).

On flow setup, a single stats instance is allocated (for the NUMA node
0). As CPUs from multiple NUMA nodes start updating stats, new
NUMA-node specific stats instances are allocated. This allocation on
the packet processing code path is made to never block or look for
emergency memory pools, minimizing the allocation latency. If the
allocation fails, the existing preallocated stats instance is used.
Also, if only CPUs from one NUMA-node are updating the preallocated
stats instance, no additional stats instances are allocated. This
eliminates the need to pre-allocate stats instances that will not be
used, also relieving the stats reader from the burden of reading stats
that are never used.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Diffstat (limited to 'net/openvswitch/flow_table.c')
-rw-r--r-- net/openvswitch/flow_table.c | 46
1 files changed, 35 insertions, 11 deletions
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index aa92da23053d..d8ef37b937bd 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -48,6 +48,7 @@
48#define REHASH_INTERVAL (10 * 60 * HZ) 48#define REHASH_INTERVAL (10 * 60 * HZ)
49 49
50static struct kmem_cache *flow_cache; 50static struct kmem_cache *flow_cache;
51struct kmem_cache *flow_stats_cache __read_mostly;
51 52
52static u16 range_n_bytes(const struct sw_flow_key_range *range) 53static u16 range_n_bytes(const struct sw_flow_key_range *range)
53{ 54{
@@ -75,7 +76,8 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
75struct sw_flow *ovs_flow_alloc(void) 76struct sw_flow *ovs_flow_alloc(void)
76{ 77{
77 struct sw_flow *flow; 78 struct sw_flow *flow;
78 int cpu; 79 struct flow_stats *stats;
80 int node;
79 81
80 flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); 82 flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
81 if (!flow) 83 if (!flow)
@@ -83,17 +85,22 @@ struct sw_flow *ovs_flow_alloc(void)
83 85
84 flow->sf_acts = NULL; 86 flow->sf_acts = NULL;
85 flow->mask = NULL; 87 flow->mask = NULL;
88 flow->stats_last_writer = NUMA_NO_NODE;
86 89
87 flow->stats = alloc_percpu(struct flow_stats); 90 /* Initialize the default stat node. */
88 if (!flow->stats) 91 stats = kmem_cache_alloc_node(flow_stats_cache,
92 GFP_KERNEL | __GFP_ZERO, 0);
93 if (!stats)
89 goto err; 94 goto err;
90 95
91 for_each_possible_cpu(cpu) { 96 spin_lock_init(&stats->lock);
92 struct flow_stats *cpu_stats; 97
98 RCU_INIT_POINTER(flow->stats[0], stats);
99
100 for_each_node(node)
101 if (node != 0)
102 RCU_INIT_POINTER(flow->stats[node], NULL);
93 103
94 cpu_stats = per_cpu_ptr(flow->stats, cpu);
95 spin_lock_init(&cpu_stats->lock);
96 }
97 return flow; 104 return flow;
98err: 105err:
99 kmem_cache_free(flow_cache, flow); 106 kmem_cache_free(flow_cache, flow);
@@ -130,8 +137,13 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
130 137
131static void flow_free(struct sw_flow *flow) 138static void flow_free(struct sw_flow *flow)
132{ 139{
140 int node;
141
133 kfree((struct sf_flow_acts __force *)flow->sf_acts); 142 kfree((struct sf_flow_acts __force *)flow->sf_acts);
134 free_percpu(flow->stats); 143 for_each_node(node)
144 if (flow->stats[node])
145 kmem_cache_free(flow_stats_cache,
146 (struct flow_stats __force *)flow->stats[node]);
135 kmem_cache_free(flow_cache, flow); 147 kmem_cache_free(flow_cache, flow);
136} 148}
137 149
@@ -586,16 +598,28 @@ int ovs_flow_init(void)
586 BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); 598 BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
587 BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); 599 BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
588 600
589 flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, 601 flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
590 0, NULL); 602 + (num_possible_nodes()
603 * sizeof(struct flow_stats *)),
604 0, 0, NULL);
591 if (flow_cache == NULL) 605 if (flow_cache == NULL)
592 return -ENOMEM; 606 return -ENOMEM;
593 607
608 flow_stats_cache
609 = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
610 0, SLAB_HWCACHE_ALIGN, NULL);
611 if (flow_stats_cache == NULL) {
612 kmem_cache_destroy(flow_cache);
613 flow_cache = NULL;
614 return -ENOMEM;
615 }
616
594 return 0; 617 return 0;
595} 618}
596 619
597/* Uninitializes the flow module. */ 620/* Uninitializes the flow module. */
598void ovs_flow_exit(void) 621void ovs_flow_exit(void)
599{ 622{
623 kmem_cache_destroy(flow_stats_cache);
600 kmem_cache_destroy(flow_cache); 624 kmem_cache_destroy(flow_cache);
601} 625}