author	Jason Wang <jasowang@redhat.com>	2012-10-31 15:46:02 -0400
committer	David S. Miller <davem@davemloft.net>	2012-11-01 11:14:09 -0400
commit	96442e42429e5f268ab97a3586c7694a3acc55a7 (patch)
tree	be875826c9239f2586f81c602077c7fe94bd131a
parent	cde8b15f1aabe327038ee4e0e11dd6b798572f69 (diff)
tuntap: choose the txq based on rxq
This patch implements a simple multiqueue flow steering policy for tun/tap:
tx follows rx. The idea is simple: the txq is chosen based on the rxq the
flow came in on. Flows are identified by the rxhash of a skb, and the
hash-to-queue mappings are recorded in a hlist, with an ageing timer to
retire stale mappings. A mapping is created when tun receives a packet from
userspace, and is queried in .ndo_select_queue(). I ran a concurrent TCP_CRR
test and didn't see any of the mapping manipulation helpers in perf top, so
the overhead is negligible.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--	drivers/net/tun.c	227
1 file changed, 224 insertions(+), 3 deletions(-)
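As a reading aid before the diff, not part of the patch: a minimal userspace
model of the tx-follows-rx policy described above. It stands a direct-mapped
array in for the kernel's RCU-protected hlist chains, spinlock and ageing
timer, and every name in it is hypothetical.

#include <stdint.h>

#define NUM_FLOW_ENTRIES 1024

struct flow_entry {
	uint32_t rxhash;	/* 0 means unused, as in the kernel code */
	uint16_t queue;
	uint64_t updated;	/* refresh time, for ageing */
};

static struct flow_entry flows[NUM_FLOW_ENTRIES];

/* Receive path: remember which queue this flow last arrived on. */
static void flow_update(uint32_t rxhash, uint16_t queue, uint64_t now)
{
	struct flow_entry *e = &flows[rxhash % NUM_FLOW_ENTRIES];

	e->rxhash = rxhash;
	e->queue = queue;
	e->updated = now;
}

/* Transmit path: replay the mapping so tx follows rx; fall back to
 * hashing the flow over all queues when no mapping exists. */
static uint16_t flow_select_queue(uint32_t rxhash, uint16_t numqueues)
{
	struct flow_entry *e = &flows[rxhash % NUM_FLOW_ENTRIES];

	if (e->rxhash == rxhash)
		return e->queue;
	return (uint16_t)(((uint64_t)rxhash * numqueues) >> 32);
}

int main(void)
{
	flow_update(0xdeadbeef, 3, 1);
	return flow_select_queue(0xdeadbeef, 4) == 3 ? 0 : 1;
}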
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 79b6f9ecc12c..9e287680cd2e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -115,6 +115,8 @@ struct tap_filter {
  */
 #define MAX_TAP_QUEUES 1024
 
+#define TUN_FLOW_EXPIRE (3 * HZ)
+
 /* A tun_file connects an open character device to a tuntap netdevice. It
  * also contains all socket related strctures (except sock_fprog and tap_filter)
  * to serve as one transmit queue for tuntap device. The sock_fprog and
@@ -138,6 +140,18 @@ struct tun_file {
 	u16 queue_index;
 };
 
+struct tun_flow_entry {
+	struct hlist_node hash_link;
+	struct rcu_head rcu;
+	struct tun_struct *tun;
+
+	u32 rxhash;
+	int queue_index;
+	unsigned long updated;
+};
+
+#define TUN_NUM_FLOW_ENTRIES 1024
+
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
  * device, socket fileter, sndbuf and vnet header size were restore when the
  * file were attached to a persist device.
@@ -163,8 +177,164 @@ struct tun_struct {
 #ifdef TUN_DEBUG
 	int debug;
 #endif
+	spinlock_t lock;
+	struct kmem_cache *flow_cache;
+	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
+	struct timer_list flow_gc_timer;
+	unsigned long ageing_time;
 };
 
+static inline u32 tun_hashfn(u32 rxhash)
+{
+	return rxhash & 0x3ff;
+}
+
+static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
+{
+	struct tun_flow_entry *e;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(e, n, head, hash_link) {
+		if (e->rxhash == rxhash)
+			return e;
+	}
+	return NULL;
+}
+
+static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
+					      struct hlist_head *head,
+					      u32 rxhash, u16 queue_index)
+{
+	struct tun_flow_entry *e = kmem_cache_alloc(tun->flow_cache,
+						    GFP_ATOMIC);
+	if (e) {
+		tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
+			  rxhash, queue_index);
+		e->updated = jiffies;
+		e->rxhash = rxhash;
+		e->queue_index = queue_index;
+		e->tun = tun;
+		hlist_add_head_rcu(&e->hash_link, head);
+	}
+	return e;
+}
+
+static void tun_flow_free(struct rcu_head *head)
+{
+	struct tun_flow_entry *e
+		= container_of(head, struct tun_flow_entry, rcu);
+	kmem_cache_free(e->tun->flow_cache, e);
+}
+
+static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
+{
+	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
+		  e->rxhash, e->queue_index);
+	hlist_del_rcu(&e->hash_link);
+	call_rcu(&e->rcu, tun_flow_free);
+}
+
+static void tun_flow_flush(struct tun_struct *tun)
+{
+	int i;
+
+	spin_lock_bh(&tun->lock);
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct tun_flow_entry *e;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link)
+			tun_flow_delete(tun, e);
+	}
+	spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
+{
+	int i;
+
+	spin_lock_bh(&tun->lock);
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct tun_flow_entry *e;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
+			if (e->queue_index == queue_index)
+				tun_flow_delete(tun, e);
+		}
+	}
+	spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_cleanup(unsigned long data)
+{
+	struct tun_struct *tun = (struct tun_struct *)data;
+	unsigned long delay = tun->ageing_time;
+	unsigned long next_timer = jiffies + delay;
+	unsigned long count = 0;
+	int i;
+
+	tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
+
+	spin_lock_bh(&tun->lock);
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
+		struct tun_flow_entry *e;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
+			unsigned long this_timer;
+			count++;
+			this_timer = e->updated + delay;
+			if (time_before_eq(this_timer, jiffies))
+				tun_flow_delete(tun, e);
+			else if (time_before(this_timer, next_timer))
+				next_timer = this_timer;
+		}
+	}
+
+	if (count)
+		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
+	spin_unlock_bh(&tun->lock);
+}
+
+static void tun_flow_update(struct tun_struct *tun, struct sk_buff *skb,
+			    u16 queue_index)
+{
+	struct hlist_head *head;
+	struct tun_flow_entry *e;
+	unsigned long delay = tun->ageing_time;
+	u32 rxhash = skb_get_rxhash(skb);
+
+	if (!rxhash)
+		return;
+	else
+		head = &tun->flows[tun_hashfn(rxhash)];
+
+	rcu_read_lock();
+
+	if (tun->numqueues == 1)
+		goto unlock;
+
+	e = tun_flow_find(head, rxhash);
+	if (likely(e)) {
+		/* TODO: keep queueing to old queue until it's empty? */
+		e->queue_index = queue_index;
+		e->updated = jiffies;
+	} else {
+		spin_lock_bh(&tun->lock);
+		if (!tun_flow_find(head, rxhash))
+			tun_flow_create(tun, head, rxhash, queue_index);
+
+		if (!timer_pending(&tun->flow_gc_timer))
+			mod_timer(&tun->flow_gc_timer,
+				  round_jiffies_up(jiffies + delay));
+		spin_unlock_bh(&tun->lock);
+	}
+
+unlock:
+	rcu_read_unlock();
+}
+
 /* We try to identify a flow through its rxhash first. The reason that
  * we do not check rxq no. is becuase some cards(e.g 82599), chooses
  * the rxq based on the txq where the last packet of the flow comes. As
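A side note on tun_flow_cleanup() above, not part of the patch: jiffies
wraps, so e->updated + delay cannot be compared against jiffies with a plain
"<="; the time_before()/time_before_eq() helpers compare through a signed
difference instead. A minimal userspace illustration, with the two macros
written as an assumed simplification of the kernel's:

#include <limits.h>
#include <stdio.h>

#define time_before(a, b)	((long)((a) - (b)) < 0)
#define time_before_eq(a, b)	((long)((a) - (b)) <= 0)

int main(void)
{
	unsigned long jiffies = ULONG_MAX - 15;	/* about to wrap */
	unsigned long expires = jiffies + 32;	/* wraps around to 16 */

	/* A naive "expires <= jiffies" is true here and would expire the
	 * entry 32 ticks early; the signed-difference form gets it right. */
	printf("naive: %d, time_before_eq: %d\n",
	       expires <= jiffies, time_before_eq(expires, jiffies));
	return 0;
}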
@@ -175,6 +345,7 @@ struct tun_struct {
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_flow_entry *e;
 	u32 txq = 0;
 	u32 numqueues = 0;
 
@@ -183,8 +354,12 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
 
 	txq = skb_get_rxhash(skb);
 	if (txq) {
-		/* use multiply and shift instead of expensive divide */
-		txq = ((u64)txq * numqueues) >> 32;
+		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+		if (e)
+			txq = e->queue_index;
+		else
+			/* use multiply and shift instead of expensive divide */
+			txq = ((u64)txq * numqueues) >> 32;
 	} else if (likely(skb_rx_queue_recorded(skb))) {
 		txq = skb_get_rx_queue(skb);
 		while (unlikely(txq >= numqueues))
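A note on the fallback branch above, not part of the patch:
((u64)txq * numqueues) >> 32 maps a 32-bit hash onto [0, numqueues),
conceptually numqueues * (hash / 2^32), without a divide instruction.
A standalone check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint32_t hash_to_queue(uint32_t hash, uint32_t numqueues)
{
	return (uint32_t)(((uint64_t)hash * numqueues) >> 32);
}

int main(void)
{
	/* With 4 queues, the hash space splits into proportional quarters. */
	printf("%u %u %u %u\n",
	       hash_to_queue(0x00000000u, 4),	/* 0 */
	       hash_to_queue(0x40000000u, 4),	/* 1 */
	       hash_to_queue(0x80000000u, 4),	/* 2 */
	       hash_to_queue(0xffffffffu, 4));	/* 3 */
	return 0;
}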
@@ -234,6 +409,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 	sock_put(&tfile->sk);
 
 	synchronize_net();
+	tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 	/* Drop read queue */
 	skb_queue_purge(&tfile->sk.sk_receive_queue);
 	tun_set_real_num_queues(tun);
@@ -631,6 +807,37 @@ static const struct net_device_ops tap_netdev_ops = {
 #endif
 };
 
+static int tun_flow_init(struct tun_struct *tun)
+{
+	int i;
+
+	tun->flow_cache = kmem_cache_create("tun_flow_cache",
+					    sizeof(struct tun_flow_entry), 0, 0,
+					    NULL);
+	if (!tun->flow_cache)
+		return -ENOMEM;
+
+	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
+		INIT_HLIST_HEAD(&tun->flows[i]);
+
+	tun->ageing_time = TUN_FLOW_EXPIRE;
+	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
+	mod_timer(&tun->flow_gc_timer,
+		  round_jiffies_up(jiffies + tun->ageing_time));
+
+	return 0;
+}
+
+static void tun_flow_uninit(struct tun_struct *tun)
+{
+	del_timer_sync(&tun->flow_gc_timer);
+	tun_flow_flush(tun);
+
+	/* Wait for completion of call_rcu()'s */
+	rcu_barrier();
+	kmem_cache_destroy(tun->flow_cache);
+}
+
 /* Initialize net device. */
 static void tun_net_init(struct net_device *dev)
 {
@@ -973,6 +1180,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	tun->dev->stats.rx_packets++;
 	tun->dev->stats.rx_bytes += len;
 
+	tun_flow_update(tun, skb, tfile->queue_index);
 	return total_len;
 }
 
@@ -1150,6 +1358,14 @@ out:
 	return ret;
 }
 
+static void tun_free_netdev(struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+
+	tun_flow_uninit(tun);
+	free_netdev(dev);
+}
+
 static void tun_setup(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -1158,7 +1374,7 @@ static void tun_setup(struct net_device *dev)
 	tun->group = INVALID_GID;
 
 	dev->ethtool_ops = &tun_ethtool_ops;
-	dev->destructor = free_netdev;
+	dev->destructor = tun_free_netdev;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -1381,10 +1597,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		tun->filter_attached = false;
 		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
 
+		spin_lock_init(&tun->lock);
+
 		security_tun_dev_post_create(&tfile->sk);
 
 		tun_net_init(dev);
 
+		if (tun_flow_init(tun))
+			goto err_free_dev;
+
 		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 				   TUN_USER_FEATURES;
 		dev->features = dev->hw_features;
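For context, not part of the patch: a sketch of how a device ends up with
multiple queues in the first place, assuming the IFF_MULTI_QUEUE API added
by the parent commit (cde8b15f). Each fd attached to the same interface
name is one tun_file, hence one queue; writes on a queue feed
tun_flow_update(), and tun_select_queue() then steers that flow's transmit
traffic back to the same queue.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/if.h>
#include <linux/if_tun.h>

static int tap_open_queue(const char *name)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;
	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
	if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
		close(fd);
		return -1;
	}
	return fd;		/* one attached queue */
}

int main(void)
{
	/* Two queues on one device; the interface name is arbitrary. */
	int q0 = tap_open_queue("tap-mq0");
	int q1 = tap_open_queue("tap-mq0");

	return (q0 < 0 || q1 < 0) ? 1 : 0;
}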