author		Jason Wang <jasowang@redhat.com>	2012-10-31 15:46:02 -0400
committer	David S. Miller <davem@davemloft.net>	2012-11-01 11:14:09 -0400
commit		96442e42429e5f268ab97a3586c7694a3acc55a7 (patch)
tree		be875826c9239f2586f81c602077c7fe94bd131a
parent		cde8b15f1aabe327038ee4e0e11dd6b798572f69 (diff)
tuntap: choose the txq based on rxq
This patch implements a simple multiqueue flow steering policy for tun/tap: tx follows rx.
The idea is simple: choose the txq based on the rxq a flow's packets arrive on. A flow is
identified by the rxhash of an skb, and the hash-to-queue mapping is recorded in an hlist,
with an ageing timer to retire stale mappings. A mapping is created when tun receives a
packet from userspace, and is queried in .ndo_select_queue().

I ran concurrent TCP_CRR tests and didn't see any of the mapping manipulation helpers in
perf top, so the overhead can be neglected.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
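
Before the diff itself, here is a minimal userspace sketch of the policy (a hypothetical
illustration, not code from the patch: the per-bucket hlist, RCU, spinlock and GC timer
are collapsed into a direct-mapped table whose expiry is checked lazily at lookup):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NUM_FLOW_ENTRIES 1024	/* mirrors TUN_NUM_FLOW_ENTRIES */
#define FLOW_EXPIRE	 3	/* seconds; the patch uses 3 * HZ jiffies */

struct flow_entry {
	uint32_t rxhash;	/* flow id, i.e. the skb's rxhash */
	uint16_t queue_index;	/* queue the flow last arrived on */
	time_t	 updated;	/* when the mapping was last refreshed */
	int	 valid;
};

static struct flow_entry flows[NUM_FLOW_ENTRIES];

/* Same bucket function as the patch: low 10 bits of the hash. */
static uint32_t hashfn(uint32_t rxhash)
{
	return rxhash & 0x3ff;
}

/* Receive path (tun_get_user in the patch): remember the flow's queue. */
static void flow_update(uint32_t rxhash, uint16_t queue_index)
{
	struct flow_entry *e = &flows[hashfn(rxhash)];

	e->rxhash = rxhash;
	e->queue_index = queue_index;
	e->updated = time(NULL);
	e->valid = 1;
}

/* Transmit path (.ndo_select_queue): tx follows rx when a fresh mapping
 * exists, otherwise spread by multiply-and-shift as the patch does. */
static uint16_t select_queue(uint32_t rxhash, uint16_t numqueues)
{
	struct flow_entry *e = &flows[hashfn(rxhash)];

	if (e->valid && e->rxhash == rxhash &&
	    time(NULL) - e->updated <= FLOW_EXPIRE)
		return e->queue_index;

	return ((uint64_t)rxhash * numqueues) >> 32;
}

int main(void)
{
	flow_update(0xdeadbeef, 2);
	printf("known flow   -> txq %u\n", (unsigned)select_queue(0xdeadbeef, 4));
	printf("unknown flow -> txq %u\n", (unsigned)select_queue(0x12345678, 4));
	return 0;
}

The patch proper chains colliding flows off each bucket and retires stale entries from a
timer, but the control flow is the same: record the mapping when tun receives a packet
from userspace, query it in .ndo_select_queue().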
-rw-r--r--	drivers/net/tun.c	227
1 file changed, 224 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 79b6f9ecc12c..9e287680cd2e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -115,6 +115,8 @@ struct tap_filter {
  */
 #define MAX_TAP_QUEUES 1024
 
+#define TUN_FLOW_EXPIRE (3 * HZ)
+
 /* A tun_file connects an open character device to a tuntap netdevice. It
  * also contains all socket related strctures (except sock_fprog and tap_filter)
  * to serve as one transmit queue for tuntap device. The sock_fprog and
@@ -138,6 +140,18 @@ struct tun_file {
 	u16 queue_index;
 };
 
+struct tun_flow_entry {
+	struct hlist_node hash_link;
+	struct rcu_head rcu;
+	struct tun_struct *tun;
+
+	u32 rxhash;
+	int queue_index;
+	unsigned long updated;
+};
+
+#define TUN_NUM_FLOW_ENTRIES 1024
+
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
  * device, socket fileter, sndbuf and vnet header size were restore when the
  * file were attached to a persist device.
@@ -163,8 +177,164 @@ struct tun_struct {
 #ifdef TUN_DEBUG
 	int debug;
 #endif
+	spinlock_t lock;
+	struct kmem_cache *flow_cache;
+	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
+	struct timer_list flow_gc_timer;
+	unsigned long ageing_time;
 };
 
+static inline u32 tun_hashfn(u32 rxhash)
+{
+	return rxhash & 0x3ff;
+}
+
+static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
+{
+	struct tun_flow_entry *e;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(e, n, head, hash_link) {
+		if (e->rxhash == rxhash)
+			return e;
+	}
+	return NULL;
+}
+
+static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
+					      struct hlist_head *head,
+					      u32 rxhash, u16 queue_index)
+{
+	struct tun_flow_entry *e = kmem_cache_alloc(tun->flow_cache,
+						    GFP_ATOMIC);
+	if (e) {
+		tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
+			  rxhash, queue_index);
+		e->updated = jiffies;
+		e->rxhash = rxhash;
+		e->queue_index = queue_index;
+		e->tun = tun;
+		hlist_add_head_rcu(&e->hash_link, head);
+	}
+	return e;
+}
+
+static void tun_flow_free(struct rcu_head *head)
+{
+	struct tun_flow_entry *e
+		= container_of(head, struct tun_flow_entry, rcu);
+	kmem_cache_free(e->tun->flow_cache, e);
+}
+
+static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
+{
+	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
+		  e->rxhash, e->queue_index);
+	hlist_del_rcu(&e->hash_link);
+	call_rcu(&e->rcu, tun_flow_free);
+}
+
+static void tun_flow_flush(struct tun_struct *tun)
+{
+	int i;
+
+	spin_lock_bh(&tun->lock);
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct tun_flow_entry *e;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link)
+			tun_flow_delete(tun, e);
+	}
+	spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
+{
+	int i;
+
+	spin_lock_bh(&tun->lock);
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct tun_flow_entry *e;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
+			if (e->queue_index == queue_index)
+				tun_flow_delete(tun, e);
+		}
+	}
+	spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_cleanup(unsigned long data)
+{
+	struct tun_struct *tun = (struct tun_struct *)data;
+	unsigned long delay = tun->ageing_time;
+	unsigned long next_timer = jiffies + delay;
+	unsigned long count = 0;
+	int i;
+
+	tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
+
+	spin_lock_bh(&tun->lock);
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct tun_flow_entry *e;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
+			unsigned long this_timer;
+			count++;
+			this_timer = e->updated + delay;
+			if (time_before_eq(this_timer, jiffies))
+				tun_flow_delete(tun, e);
+			else if (time_before(this_timer, next_timer))
+				next_timer = this_timer;
+		}
+	}
+
+	if (count)
+		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
+	spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_update(struct tun_struct *tun, struct sk_buff *skb,
+			    u16 queue_index)
+{
+	struct hlist_head *head;
+	struct tun_flow_entry *e;
+	unsigned long delay = tun->ageing_time;
+	u32 rxhash = skb_get_rxhash(skb);
+
+	if (!rxhash)
+		return;
+	else
+		head = &tun->flows[tun_hashfn(rxhash)];
+
+	rcu_read_lock();
+
+	if (tun->numqueues == 1)
+		goto unlock;
+
+	e = tun_flow_find(head, rxhash);
+	if (likely(e)) {
+		/* TODO: keep queueing to old queue until it's empty? */
+		e->queue_index = queue_index;
+		e->updated = jiffies;
+	} else {
+		spin_lock_bh(&tun->lock);
+		if (!tun_flow_find(head, rxhash))
+			tun_flow_create(tun, head, rxhash, queue_index);
+
+		if (!timer_pending(&tun->flow_gc_timer))
+			mod_timer(&tun->flow_gc_timer,
+				  round_jiffies_up(jiffies + delay));
+		spin_unlock_bh(&tun->lock);
+	}
+
+unlock:
+	rcu_read_unlock();
+}
+
 /* We try to identify a flow through its rxhash first. The reason that
  * we do not check rxq no. is becuase some cards(e.g 82599), chooses
  * the rxq based on the txq where the last packet of the flow comes. As
@@ -175,6 +345,7 @@ struct tun_struct {
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_flow_entry *e;
 	u32 txq = 0;
 	u32 numqueues = 0;
 
@@ -183,8 +354,12 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 
 	txq = skb_get_rxhash(skb);
 	if (txq) {
-		/* use multiply and shift instead of expensive divide */
-		txq = ((u64)txq * numqueues) >> 32;
+		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+		if (e)
+			txq = e->queue_index;
+		else
+			/* use multiply and shift instead of expensive divide */
+			txq = ((u64)txq * numqueues) >> 32;
 	} else if (likely(skb_rx_queue_recorded(skb))) {
 		txq = skb_get_rx_queue(skb);
 		while (unlikely(txq >= numqueues))
@@ -234,6 +409,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 	sock_put(&tfile->sk);
 
 	synchronize_net();
+	tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 	/* Drop read queue */
 	skb_queue_purge(&tfile->sk.sk_receive_queue);
 	tun_set_real_num_queues(tun);
@@ -631,6 +807,37 @@ static const struct net_device_ops tap_netdev_ops = {
 #endif
 };
 
+static int tun_flow_init(struct tun_struct *tun)
+{
+	int i;
+
+	tun->flow_cache = kmem_cache_create("tun_flow_cache",
+					    sizeof(struct tun_flow_entry), 0, 0,
+					    NULL);
+	if (!tun->flow_cache)
+		return -ENOMEM;
+
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
+		INIT_HLIST_HEAD(&tun->flows[i]);
+
+	tun->ageing_time = TUN_FLOW_EXPIRE;
+	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
+	mod_timer(&tun->flow_gc_timer,
+		  round_jiffies_up(jiffies + tun->ageing_time));
+
+	return 0;
+}
+
+static void tun_flow_uninit(struct tun_struct *tun)
+{
+	del_timer_sync(&tun->flow_gc_timer);
+	tun_flow_flush(tun);
+
+	/* Wait for completion of call_rcu()'s */
+	rcu_barrier();
+	kmem_cache_destroy(tun->flow_cache);
+}
+
 /* Initialize net device. */
 static void tun_net_init(struct net_device *dev)
 {
@@ -973,6 +1180,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	tun->dev->stats.rx_packets++;
 	tun->dev->stats.rx_bytes += len;
 
+	tun_flow_update(tun, skb, tfile->queue_index);
 	return total_len;
 }
 
@@ -1150,6 +1358,14 @@ out:
 	return ret;
 }
 
+static void tun_free_netdev(struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+
+	tun_flow_uninit(tun);
+	free_netdev(dev);
+}
+
 static void tun_setup(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -1158,7 +1374,7 @@ static void tun_setup(struct net_device *dev)
 	tun->group = INVALID_GID;
 
 	dev->ethtool_ops = &tun_ethtool_ops;
-	dev->destructor = free_netdev;
+	dev->destructor = tun_free_netdev;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -1381,10 +1597,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	tun->filter_attached = false;
 	tun->sndbuf = tfile->socket.sk->sk_sndbuf;
 
+	spin_lock_init(&tun->lock);
+
 	security_tun_dev_post_create(&tfile->sk);
 
 	tun_net_init(dev);
 
+	if (tun_flow_init(tun))
+		goto err_free_dev;
+
 	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 			   TUN_USER_FEATURES;
 	dev->features = dev->hw_features;
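
A side note on the fallback in the tun_select_queue() hunk above: when no flow entry
matches, the patch maps the 32-bit rxhash onto a queue as ((u64)txq * numqueues) >> 32.
Treating the hash as a fraction of 2^32, the multiply-and-shift scales it into
[0, numqueues) without a divide. A standalone check of that expression (the sample hash
values are arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t numqueues = 4;
	/* arbitrary sample hashes spanning the u32 range */
	uint32_t hashes[] = { 0x00000000, 0x40000000, 0x80000000, 0xffffffff };

	for (int i = 0; i < 4; i++) {
		/* same expression as the patch: scales hash/2^32
		 * into [0, numqueues) with a multiply and a shift */
		uint32_t txq = ((uint64_t)hashes[i] * numqueues) >> 32;
		printf("hash 0x%08x -> txq %u\n", (unsigned)hashes[i],
		       (unsigned)txq);
	}
	return 0;
}

This prints txq 0, 1, 2 and 3 for the four samples, i.e. an even spread across the
queues, which is why the patch prefers it to a modulo on the hot path.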