author      Jarno Rajahalme <jrajahalme@nicira.com>    2014-03-27 15:42:54 -0400
committer   Jesse Gross <jesse@nicira.com>             2014-05-16 16:40:29 -0400
commit      63e7959c4b9bd6f791061c460a22d9ee32ae2240
tree        742342936b83ee5a6f8ee9d859588e3045a1b44c /net/openvswitch
parent      23dabf88abb48a866fdb19ee08ebcf1ddd9b1840
openvswitch: Per NUMA node flow stats.
Keep kernel flow stats for each NUMA node rather than each (logical)
CPU.  This avoids using the per-CPU allocator, removes most of the
kernel-side OVS locking overhead otherwise at the top of perf reports,
and allows OVS to scale better with a higher number of threads.

With 9 handlers and 4 revalidators, the netperf TCP_CRR flow setup rate
doubles on a server with two hyper-threaded physical CPUs (16 logical
cores each) compared to the current OVS master.  Tested with a
non-trivial flow table with a TCP port match rule forcing all new
connections with unique port numbers to OVS userspace.  The IP
addresses are still wildcarded, so the kernel flows are not considered
exact-match 5-tuple flows.  This type of flow can be expected to appear
in large numbers as a result of the more effective wildcarding made
possible by improvements in the OVS userspace flow classifier.

Perf results for this test (master):

Events: 305K cycles
+   8.43%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
+   5.64%  ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
+   4.75%  ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
+   3.32%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
+   2.61%  ovs-vswitchd  [kernel.kallsyms]   [k] pcpu_alloc_area
+   2.19%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
+   2.03%  swapper       [kernel.kallsyms]   [k] intel_idle
+   1.84%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
+   1.64%  ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
+   1.58%  ovs-vswitchd  libc-2.15.so        [.] 0x7f4e6
+   1.07%  ovs-vswitchd  [kernel.kallsyms]   [k] memset
+   1.03%  netperf       [kernel.kallsyms]   [k] __ticket_spin_lock
+   0.92%  swapper       [kernel.kallsyms]   [k] __ticket_spin_lock
...

And after this patch:

Events: 356K cycles
+   6.85%  ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
+   4.63%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
+   3.06%  ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
+   2.81%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
+   2.51%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
+   2.27%  ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
+   1.84%  ovs-vswitchd  libc-2.15.so        [.] 0x15d30f
+   1.74%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
+   1.47%  swapper       [kernel.kallsyms]   [k] intel_idle
+   1.34%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask
+   1.33%  ovs-vswitchd  ovs-vswitchd        [.] rule_actions_unref
+   1.16%  ovs-vswitchd  ovs-vswitchd        [.] hindex_node_with_hash
+   1.16%  ovs-vswitchd  ovs-vswitchd        [.] do_xlate_actions
+   1.09%  ovs-vswitchd  ovs-vswitchd        [.] ofproto_rule_ref
+   1.01%  netperf       [kernel.kallsyms]   [k] __ticket_spin_lock
...

There is a small increase in kernel spinlock overhead due to the same
spinlock being shared between multiple cores of the same physical CPU,
but that is barely visible in the netperf TCP_CRR test performance
(maybe a ~1% performance drop, hard to tell exactly due to variance in
the test results) when testing kernel module throughput (no userspace
activity, a handful of kernel flows).

On flow setup, a single stats instance is allocated (for NUMA node 0).
As CPUs from multiple NUMA nodes start updating stats, new
NUMA-node-specific stats instances are allocated.  This allocation on
the packet processing code path is made to never block or look for
emergency memory pools, minimizing the allocation latency.  If the
allocation fails, the existing preallocated stats instance is used.
Also, if only CPUs from one NUMA node are updating the preallocated
stats instance, no additional stats instances are allocated.  This
eliminates the need to pre-allocate stats instances that will not be
used, and relieves the stats reader from the burden of reading stats
that are never used.

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
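As a reading aid for the diff below, here is a minimal userspace C
sketch of the lazy per-node allocation scheme described above. It is
hypothetical illustration code, not part of the patch: RCU, locking,
and the stats_last_writer bookkeeping are elided, and MAX_NODES stands
in for num_possible_nodes().

#include <stdlib.h>

#define MAX_NODES 4             /* stand-in for num_possible_nodes() */

struct stats {
        unsigned long packets;
        unsigned long bytes;
};

struct flow {
        struct stats *stats[MAX_NODES]; /* slot 0 preallocated at flow creation */
};

/* Update the stats for 'node', allocating a node-local instance on
 * first use; if the allocation fails, fall back to the preallocated
 * slot 0 so the packet processing path never blocks.
 */
static void stats_update(struct flow *f, int node, unsigned int len)
{
        struct stats *s = f->stats[node];

        if (!s) {
                s = calloc(1, sizeof(*s)); /* kernel: kmem_cache_alloc_node() */
                if (s)
                        f->stats[node] = s;
                else
                        s = f->stats[0];   /* preallocated, never NULL */
        }
        s->packets++;
        s->bytes += len;
}

The key property is the fallback: slot 0 always exists, so the update
path can tolerate allocation failure without blocking.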
Diffstat (limited to 'net/openvswitch')
-rw-r--r--   net/openvswitch/flow.c         119
-rw-r--r--   net/openvswitch/flow.h          10
-rw-r--r--   net/openvswitch/flow_table.c    46
-rw-r--r--   net/openvswitch/flow_table.h     2
4 files changed, 122 insertions(+), 55 deletions(-)
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index aad7a8da70b1..432f04d5c896 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -65,8 +65,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
 {
         struct flow_stats *stats;
         __be16 tcp_flags = 0;
+        int node = numa_node_id();
 
-        stats = this_cpu_ptr(flow->stats);
+        stats = rcu_dereference(flow->stats[node]);
 
         if ((flow->key.eth.type == htons(ETH_P_IP) ||
              flow->key.eth.type == htons(ETH_P_IPV6)) &&
@@ -76,68 +77,102 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
                 tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
         }
 
-        spin_lock(&stats->lock);
+        /* Check if already have node-specific stats. */
+        if (likely(stats)) {
+                spin_lock(&stats->lock);
+                /* Mark if we write on the pre-allocated stats. */
+                if (node == 0 && unlikely(flow->stats_last_writer != node))
+                        flow->stats_last_writer = node;
+        } else {
+                stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
+                spin_lock(&stats->lock);
+
+                /* If the current NUMA-node is the only writer on the
+                 * pre-allocated stats keep using them.
+                 */
+                if (unlikely(flow->stats_last_writer != node)) {
+                        /* A previous locker may have already allocated the
+                         * stats, so we need to check again.  If node-specific
+                         * stats were already allocated, we update the pre-
+                         * allocated stats as we have already locked them.
+                         */
+                        if (likely(flow->stats_last_writer != NUMA_NO_NODE)
+                            && likely(!rcu_dereference(flow->stats[node]))) {
+                                /* Try to allocate node-specific stats. */
+                                struct flow_stats *new_stats;
+
+                                new_stats =
+                                        kmem_cache_alloc_node(flow_stats_cache,
+                                                              GFP_THISNODE |
+                                                              __GFP_NOMEMALLOC,
+                                                              node);
+                                if (likely(new_stats)) {
+                                        new_stats->used = jiffies;
+                                        new_stats->packet_count = 1;
+                                        new_stats->byte_count = skb->len;
+                                        new_stats->tcp_flags = tcp_flags;
+                                        spin_lock_init(&new_stats->lock);
+
+                                        rcu_assign_pointer(flow->stats[node],
+                                                           new_stats);
+                                        goto unlock;
+                                }
+                        }
+                        flow->stats_last_writer = node;
+                }
+        }
+
         stats->used = jiffies;
         stats->packet_count++;
         stats->byte_count += skb->len;
         stats->tcp_flags |= tcp_flags;
-        spin_unlock(&stats->lock);
-}
-
-static void stats_read(struct flow_stats *stats,
-                       struct ovs_flow_stats *ovs_stats,
-                       unsigned long *used, __be16 *tcp_flags)
-{
-        spin_lock(&stats->lock);
-        if (!*used || time_after(stats->used, *used))
-                *used = stats->used;
-        *tcp_flags |= stats->tcp_flags;
-        ovs_stats->n_packets += stats->packet_count;
-        ovs_stats->n_bytes += stats->byte_count;
+unlock:
         spin_unlock(&stats->lock);
 }
 
 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
                         unsigned long *used, __be16 *tcp_flags)
 {
-        int cpu;
+        int node;
 
         *used = 0;
         *tcp_flags = 0;
         memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-        local_bh_disable();
-
-        for_each_possible_cpu(cpu) {
-                struct flow_stats *stats;
+        for_each_node(node) {
+                struct flow_stats *stats = rcu_dereference(flow->stats[node]);
 
-                stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
-                stats_read(stats, ovs_stats, used, tcp_flags);
+                if (stats) {
+                        /* Local CPU may write on non-local stats, so we must
+                         * block bottom-halves here.
+                         */
+                        spin_lock_bh(&stats->lock);
+                        if (!*used || time_after(stats->used, *used))
+                                *used = stats->used;
+                        *tcp_flags |= stats->tcp_flags;
+                        ovs_stats->n_packets += stats->packet_count;
+                        ovs_stats->n_bytes += stats->byte_count;
+                        spin_unlock_bh(&stats->lock);
+                }
         }
-
-        local_bh_enable();
-}
-
-static void stats_reset(struct flow_stats *stats)
-{
-        spin_lock(&stats->lock);
-        stats->used = 0;
-        stats->packet_count = 0;
-        stats->byte_count = 0;
-        stats->tcp_flags = 0;
-        spin_unlock(&stats->lock);
 }
 
 void ovs_flow_stats_clear(struct sw_flow *flow)
 {
-        int cpu;
+        int node;
 
-        local_bh_disable();
-
-        for_each_possible_cpu(cpu)
-                stats_reset(per_cpu_ptr(flow->stats, cpu));
-
-        local_bh_enable();
+        for_each_node(node) {
+                struct flow_stats *stats = rcu_dereference(flow->stats[node]);
+
+                if (stats) {
+                        spin_lock_bh(&stats->lock);
+                        stats->used = 0;
+                        stats->packet_count = 0;
+                        stats->byte_count = 0;
+                        stats->tcp_flags = 0;
+                        spin_unlock_bh(&stats->lock);
+                }
+        }
 }
 
 static int check_header(struct sk_buff *skb, int len)
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 9c0dd8aa3117..ddcebc53224f 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -159,12 +159,18 @@ struct sw_flow {
         struct rcu_head rcu;
         struct hlist_node hash_node[2];
         u32 hash;
-
+        int stats_last_writer;          /* NUMA-node id of the last writer on
+                                         * 'stats[0]'.
+                                         */
         struct sw_flow_key key;
         struct sw_flow_key unmasked_key;
         struct sw_flow_mask *mask;
         struct sw_flow_actions __rcu *sf_acts;
-        struct flow_stats __percpu *stats;
+        struct flow_stats __rcu *stats[]; /* One for each NUMA node.  First one
+                                           * is allocated at flow creation time,
+                                           * the rest are allocated on demand
+                                           * while holding the 'stats[0].lock'.
+                                           */
 };
 
 struct arp_eth_header {
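The flexible 'stats[]' member above occupies no storage within struct
sw_flow itself; the flow_table.c hunk below sizes each slab object so
that one trailing pointer per possible NUMA node fits behind the
struct. A plain-C sketch of that sizing idiom, with hypothetical names,
may make the arithmetic concrete:

#include <stdlib.h>

struct flow_stats;                      /* opaque in this sketch */

struct sw_flow_like {
        unsigned int hash;              /* other fields elided */
        struct flow_stats *stats[];     /* flexible array member: no size of its own */
};

/* Allocate one object with room for 'nr_nodes' trailing pointers,
 * the same arithmetic the patch passes to kmem_cache_create():
 * sizeof(struct sw_flow) + num_possible_nodes() * sizeof(pointer).
 */
static struct sw_flow_like *flow_alloc_like(unsigned int nr_nodes)
{
        return calloc(1, sizeof(struct sw_flow_like) +
                         nr_nodes * sizeof(struct flow_stats *));
}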
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index aa92da23053d..d8ef37b937bd 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -48,6 +48,7 @@
 #define REHASH_INTERVAL         (10 * 60 * HZ)
 
 static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
 
 static u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
@@ -75,7 +76,8 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 struct sw_flow *ovs_flow_alloc(void)
 {
         struct sw_flow *flow;
-        int cpu;
+        struct flow_stats *stats;
+        int node;
 
         flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
         if (!flow)
@@ -83,17 +85,22 @@ struct sw_flow *ovs_flow_alloc(void)
 
         flow->sf_acts = NULL;
         flow->mask = NULL;
+        flow->stats_last_writer = NUMA_NO_NODE;
 
-        flow->stats = alloc_percpu(struct flow_stats);
-        if (!flow->stats)
+        /* Initialize the default stat node. */
+        stats = kmem_cache_alloc_node(flow_stats_cache,
+                                      GFP_KERNEL | __GFP_ZERO, 0);
+        if (!stats)
                 goto err;
 
-        for_each_possible_cpu(cpu) {
-                struct flow_stats *cpu_stats;
+        spin_lock_init(&stats->lock);
+
+        RCU_INIT_POINTER(flow->stats[0], stats);
+
+        for_each_node(node)
+                if (node != 0)
+                        RCU_INIT_POINTER(flow->stats[node], NULL);
 
-                cpu_stats = per_cpu_ptr(flow->stats, cpu);
-                spin_lock_init(&cpu_stats->lock);
-        }
         return flow;
 err:
         kmem_cache_free(flow_cache, flow);
@@ -130,8 +137,13 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 
 static void flow_free(struct sw_flow *flow)
 {
+        int node;
+
         kfree((struct sf_flow_acts __force *)flow->sf_acts);
-        free_percpu(flow->stats);
+        for_each_node(node)
+                if (flow->stats[node])
+                        kmem_cache_free(flow_stats_cache,
+                                        (struct flow_stats __force *)flow->stats[node]);
         kmem_cache_free(flow_cache, flow);
 }
 
@@ -586,16 +598,28 @@ int ovs_flow_init(void)
         BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
         BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
-        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
-                                       0, NULL);
+        flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+                                       + (num_possible_nodes()
+                                          * sizeof(struct flow_stats *)),
+                                       0, 0, NULL);
         if (flow_cache == NULL)
                 return -ENOMEM;
 
+        flow_stats_cache
+                = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+                                    0, SLAB_HWCACHE_ALIGN, NULL);
+        if (flow_stats_cache == NULL) {
+                kmem_cache_destroy(flow_cache);
+                flow_cache = NULL;
+                return -ENOMEM;
+        }
+
         return 0;
 }
 
 /* Uninitializes the flow module. */
 void ovs_flow_exit(void)
 {
+        kmem_cache_destroy(flow_stats_cache);
         kmem_cache_destroy(flow_cache);
 }
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index c26c59a7ab57..ca8a5820f615 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -52,6 +52,8 @@ struct flow_table {
         unsigned int count;
 };
 
+extern struct kmem_cache *flow_stats_cache;
+
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
 