path: root/net/sched
author	Eric Dumazet <edumazet@google.com>	2013-06-28 10:40:57 -0400
committer	David S. Miller <davem@davemloft.net>	2013-07-01 21:07:15 -0400
commit	aec0a40a6f78843c0ce73f7398230ee5184f896d (patch)
tree	b9a68c7c98b2d036ba7396c55f717d17c50f12a4 /net/sched
parent	d0b5e516298fba30774e2df22cfbd00ecb09c298 (diff)
netem: use rb tree to implement the time queue
The following typical setup, implementing a ~100 ms RTT and a large amount of
reordering, has very poor performance because netem implements the time queue
as a linked list.

-----------------------------------------------------------
ETH=eth0
IFB=ifb0
modprobe ifb
ip link set dev $IFB up
tc qdisc add dev $ETH ingress 2>/dev/null
tc filter add dev $ETH parent ffff: \
   protocol ip u32 match u32 0 0 flowid 1:1 action mirred egress \
   redirect dev $IFB
ethtool -K $ETH gro off tso off gso off
tc qdisc add dev $IFB root netem delay 50ms 10ms limit 100000
tc qd add dev $ETH root netem delay 50ms limit 100000
---------------------------------------------------------

Switch the netem time queue to an rb tree, so this kind of setup can work at
high speed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
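In short, the patch keys delayed packets by their computed time_to_send in an
rb_root, so each insert costs O(log n) even when random jitter reorders
packets, while rb_first() still yields the next packet due for delivery. A
condensed sketch of the idea, lifted from the diff below (kernel context and
the <linux/rbtree.h> helpers are assumed, so this is illustrative rather than
standalone code):

	/* Enqueue (sketch): link nskb into the time-ordered tree, keyed by
	 * its send time.  Equal keys descend to the right, so packets due
	 * at the same instant keep their arrival order.
	 */
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

	while (*p) {
		struct sk_buff *skb;

		parent = *p;
		skb = netem_rb_to_skb(parent);
		if (tnext >= netem_skb_cb(skb)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(netem_rb_node(nskb), parent, p);
	rb_insert_color(netem_rb_node(nskb), &q->t_root);

	/* Dequeue (sketch): the earliest packet is simply rb_first() */
	struct rb_node *first = rb_first(&q->t_root);

	if (first &&
	    netem_skb_cb(netem_rb_to_skb(first))->time_to_send <= psched_get_time())
		rb_erase(first, &q->t_root);

The rb_node is overlaid on skb->next/prev/tstamp via netem_rb_node() and its
BUILD_BUG_ON checks, so no per-packet allocation is added; the original
skb->tstamp is stashed in cb->tstamp_save at enqueue and restored at dequeue.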
Diffstat (limited to 'net/sched')
-rw-r--r--	net/sched/sch_netem.c	109
1 file changed, 85 insertions, 24 deletions
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 3d2acc7a9c80..ed0082cf8eff 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <linux/rtnetlink.h>
 #include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
 
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
@@ -68,7 +69,8 @@
  */
 
 struct netem_sched_data {
-	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */
+	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
+	struct rb_root t_root;
 
 	/* optional qdisc for classful handling (NULL at netem init) */
 	struct Qdisc	*qdisc;
@@ -128,10 +130,35 @@ struct netem_sched_data {
  */
 struct netem_skb_cb {
 	psched_time_t	time_to_send;
+	ktime_t		tstamp_save;
 };
 
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+	BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+		     offsetof(struct sk_buff, next) + sizeof(skb->next));
+	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+		     offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+					      sizeof(skb->prev) +
+					      sizeof(skb->tstamp));
+	return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+	return (struct sk_buff *)rb;
+}
+
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
+	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 }
@@ -333,20 +360,23 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
 
 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
-	struct sk_buff_head *list = &sch->q;
+	struct netem_sched_data *q = qdisc_priv(sch);
 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
-	struct sk_buff *skb = skb_peek_tail(list);
+	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 
-	/* Optimize for add at tail */
-	if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
-		return __skb_queue_tail(list, nskb);
+	while (*p) {
+		struct sk_buff *skb;
 
-	skb_queue_reverse_walk(list, skb) {
+		parent = *p;
+		skb = netem_rb_to_skb(parent);
 		if (tnext >= netem_skb_cb(skb)->time_to_send)
-			break;
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
 	}
-
-	__skb_queue_after(list, skb, nskb);
+	rb_link_node(netem_rb_node(nskb), parent, p);
+	rb_insert_color(netem_rb_node(nskb), &q->t_root);
+	sch->q.qlen++;
 }
 
 /*
@@ -436,23 +466,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		now = psched_get_time();
 
 		if (q->rate) {
-			struct sk_buff_head *list = &sch->q;
+			struct sk_buff *last;
 
-			if (!skb_queue_empty(list)) {
+			if (!skb_queue_empty(&sch->q))
+				last = skb_peek_tail(&sch->q);
+			else
+				last = netem_rb_to_skb(rb_last(&q->t_root));
+			if (last) {
 				/*
 				 * Last packet in queue is reference point (now),
 				 * calculate this time bonus and subtract
 				 * from delay.
 				 */
-				delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
+				delay -= netem_skb_cb(last)->time_to_send - now;
 				delay = max_t(psched_tdiff_t, 0, delay);
-				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+				now = netem_skb_cb(last)->time_to_send;
 			}
 
 			delay += packet_len_2_sched_time(skb->len, q);
 		}
 
 		cb->time_to_send = now + delay;
+		cb->tstamp_save = skb->tstamp;
 		++q->counter;
 		tfifo_enqueue(skb, sch);
 	} else {
@@ -476,6 +511,21 @@ static unsigned int netem_drop(struct Qdisc *sch)
 	unsigned int len;
 
 	len = qdisc_queue_drop(sch);
+
+	if (!len) {
+		struct rb_node *p = rb_first(&q->t_root);
+
+		if (p) {
+			struct sk_buff *skb = netem_rb_to_skb(p);
+
+			rb_erase(p, &q->t_root);
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			len = qdisc_pkt_len(skb);
+			kfree_skb(skb);
+		}
+	}
 	if (!len && q->qdisc && q->qdisc->ops->drop)
 		len = q->qdisc->ops->drop(q->qdisc);
 	if (len)
@@ -488,19 +538,32 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
+	struct rb_node *p;
 
 	if (qdisc_is_throttled(sch))
 		return NULL;
 
 tfifo_dequeue:
-	skb = qdisc_peek_head(sch);
+	skb = __skb_dequeue(&sch->q);
 	if (skb) {
-		const struct netem_skb_cb *cb = netem_skb_cb(skb);
+deliver:
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_unthrottled(sch);
+		qdisc_bstats_update(sch, skb);
+		return skb;
+	}
+	p = rb_first(&q->t_root);
+	if (p) {
+		skb = netem_rb_to_skb(p);
 
 		/* if more time remaining? */
-		if (cb->time_to_send <= psched_get_time()) {
-			__skb_unlink(skb, &sch->q);
-			sch->qstats.backlog -= qdisc_pkt_len(skb);
+		if (netem_skb_cb(skb)->time_to_send <= psched_get_time()) {
+			rb_erase(p, &q->t_root);
+
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
 
 #ifdef CONFIG_NET_CLS_ACT
 			/*
@@ -522,10 +585,7 @@ tfifo_dequeue:
 				}
 				goto tfifo_dequeue;
 			}
-deliver:
-			qdisc_unthrottled(sch);
-			qdisc_bstats_update(sch, skb);
-			return skb;
+			goto deliver;
 		}
 
 		if (q->qdisc) {
@@ -533,7 +593,8 @@ deliver:
 			if (skb)
 				goto deliver;
 		}
-		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+		qdisc_watchdog_schedule(&q->watchdog,
+					netem_skb_cb(skb)->time_to_send);
 	}
 
 	if (q->qdisc) {