author:    Eric Dumazet <edumazet@google.com>  2013-06-28 10:40:57 -0400
committer: David S. Miller <davem@davemloft.net>  2013-07-01 21:07:15 -0400
commit:    aec0a40a6f78843c0ce73f7398230ee5184f896d
tree:      b9a68c7c98b2d036ba7396c55f717d17c50f12a4 /net/sched
parent:    d0b5e516298fba30774e2df22cfbd00ecb09c298
netem: use rb tree to implement the time queue
The following typical setup, implementing a ~100 ms RTT and a large
amount of reordering, has very poor performance because netem
implements the time queue with a linked list.
-----------------------------------------------------------
ETH=eth0
IFB=ifb0
modprobe ifb
ip link set dev $IFB up
tc qdisc add dev $ETH ingress 2>/dev/null
tc filter add dev $ETH parent ffff: \
protocol ip u32 match u32 0 0 flowid 1:1 action mirred egress \
redirect dev $IFB
ethtool -K $ETH gro off tso off gso off
tc qdisc add dev $IFB root netem delay 50ms 10ms limit 100000
tc qdisc add dev $ETH root netem delay 50ms limit 100000
---------------------------------------------------------
Switch the netem time queue to an rb tree, so this kind of setup can
work at high speed.
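For intuition, here is a small user-space sketch (illustrative only; nothing
below is from sch_netem.c, and all names are invented) of the cost the old
list-based time queue pays: keeping packets in a time-sorted linked list
makes each enqueue walk past already-queued packets, so with many packets in
flight every insert is O(n) and the whole run is quadratic. An rb tree
bounds the same insert at O(log n).
-----------------------------------------------------------
/* Illustrative sketch: a time queue kept as a sorted linked list.
 * With jitter, most inserts land near the tail, so each enqueue walks
 * almost the whole queue. netem walked backwards from the tail, but
 * under heavy reordering the walk length is linear in the same way.
 */
#include <stdio.h>
#include <stdlib.h>

struct pkt {
	long time_to_send;
	struct pkt *next;
};

int main(void)
{
	struct pkt *head = NULL;
	long long comparisons = 0;
	enum { NPKTS = 10000 };

	srand(1);
	for (int i = 0; i < NPKTS; i++) {
		struct pkt *p = malloc(sizeof(*p));
		struct pkt **pp = &head;

		/* arrival order i, plus jitter, like "delay 50ms 10ms" */
		p->time_to_send = i * 10 + rand() % 200;
		/* <= keeps FIFO order among equal send times */
		while (*pp && (*pp)->time_to_send <= p->time_to_send) {
			pp = &(*pp)->next;
			comparisons++;
		}
		p->next = *pp;
		*pp = p;
	}
	printf("%lld comparisons for %d inserts\n", comparisons, NPKTS);
	return 0;
}
-----------------------------------------------------------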
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/sch_netem.c  109
1 file changed, 85 insertions(+), 24 deletions(-)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 3d2acc7a9c80..ed0082cf8eff 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <linux/rtnetlink.h>
 #include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
 
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
@@ -68,7 +69,8 @@
  */
 
 struct netem_sched_data {
-	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */
+	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
+	struct rb_root t_root;
 
 	/* optional qdisc for classful handling (NULL at netem init) */
 	struct Qdisc	*qdisc;
@@ -128,10 +130,35 @@ struct netem_sched_data {
  */
 struct netem_skb_cb {
 	psched_time_t	time_to_send;
+	ktime_t		tstamp_save;
 };
 
+/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
+ * to hold a rb_node structure.
+ *
+ * If struct sk_buff layout is changed, the following checks will complain.
+ */
+static struct rb_node *netem_rb_node(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
+	BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
+		     offsetof(struct sk_buff, next) + sizeof(skb->next));
+	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
+		     offsetof(struct sk_buff, prev) + sizeof(skb->prev));
+	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
+					      sizeof(skb->prev) +
+					      sizeof(skb->tstamp));
+	return (struct rb_node *)&skb->next;
+}
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+	return (struct sk_buff *)rb;
+}
+
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
+	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 }
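The netem_rb_node() trick above packs a struct rb_node into three adjacent
sk_buff fields. Here is a standalone user-space sketch of the same
layout-validation idea; struct buf and struct node are invented stand-ins,
not kernel types:
-----------------------------------------------------------
/* Sketch of the overlay-validation technique used by netem_rb_node(). */
#include <stddef.h>

struct node {			/* stand-in for struct rb_node */
	unsigned long parent_color;
	struct node *left, *right;
};

struct buf {			/* stand-in for struct sk_buff */
	struct buf *next;	/* the three fields being borrowed */
	struct buf *prev;
	long long tstamp;
	char data[64];
};

/* Compile-time checks: the borrowed fields must start at offset 0, be
 * adjacent, and together be large enough to host a struct node. */
_Static_assert(offsetof(struct buf, next) == 0, "next must be first");
_Static_assert(offsetof(struct buf, prev) ==
	       offsetof(struct buf, next) + sizeof(struct buf *),
	       "prev must follow next");
_Static_assert(offsetof(struct buf, tstamp) ==
	       offsetof(struct buf, prev) + sizeof(struct buf *),
	       "tstamp must follow prev");
_Static_assert(sizeof(struct node) <=
	       2 * sizeof(struct buf *) + sizeof(long long),
	       "node must fit in the borrowed fields");

static struct node *buf_node(struct buf *b)
{
	return (struct node *)&b->next;	/* reuse the fields as a node */
}

int main(void)
{
	struct buf b;
	return buf_node(&b) == (struct node *)&b ? 0 : 1;
}
-----------------------------------------------------------
If anyone reorders or shrinks the borrowed fields, the build fails at
compile time instead of silently corrupting memory at run time.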
@@ -333,20 +360,23 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
 
 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
-	struct sk_buff_head *list = &sch->q;
+	struct netem_sched_data *q = qdisc_priv(sch);
 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
-	struct sk_buff *skb = skb_peek_tail(list);
+	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 
-	/* Optimize for add at tail */
-	if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
-		return __skb_queue_tail(list, nskb);
+	while (*p) {
+		struct sk_buff *skb;
 
-	skb_queue_reverse_walk(list, skb) {
+		parent = *p;
+		skb = netem_rb_to_skb(parent);
 		if (tnext >= netem_skb_cb(skb)->time_to_send)
-			break;
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
 	}
-
-	__skb_queue_after(list, skb, nskb);
+	rb_link_node(netem_rb_node(nskb), parent, p);
+	rb_insert_color(netem_rb_node(nskb), &q->t_root);
+	sch->q.qlen++;
 }
 
 /*
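The new tfifo_enqueue() is the standard kernel rbtree insertion pattern:
walk down from the root comparing time_to_send, then rb_link_node() and
rb_insert_color() to splice the node in and rebalance. A user-space sketch
of the same shape (a plain unbalanced BST stands in for the rb tree; every
name is invented) shows how the >= comparison keeps packets with equal send
times in arrival order:
-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>

struct tq_node {
	long time_to_send;
	int seq;			/* arrival order, for demonstration */
	struct tq_node *left, *right;
};

static void tq_insert(struct tq_node **p, struct tq_node *n)
{
	/* Same comparison as tfifo_enqueue(): >= sends equal keys to the
	 * right, so packets sharing a send time stay in FIFO order. */
	while (*p) {
		if (n->time_to_send >= (*p)->time_to_send)
			p = &(*p)->right;
		else
			p = &(*p)->left;
	}
	n->left = n->right = NULL;
	*p = n;
}

static struct tq_node *tq_pop_first(struct tq_node **p)
{
	struct tq_node *n;

	while (*p && (*p)->left)	/* leftmost node = earliest time */
		p = &(*p)->left;
	n = *p;
	if (n)
		*p = n->right;		/* splice out: no left child here */
	return n;
}

int main(void)
{
	struct tq_node *root = NULL;
	long times[] = { 130, 90, 110, 90, 150 };
	struct tq_node *n;

	for (int i = 0; i < 5; i++) {
		n = malloc(sizeof(*n));
		n->time_to_send = times[i];
		n->seq = i;
		tq_insert(&root, n);
	}
	while ((n = tq_pop_first(&root))) {
		printf("t=%ld seq=%d\n", n->time_to_send, n->seq);
		free(n);
	}
	return 0;
}
-----------------------------------------------------------
On the dequeue side, rb_first() plays the role tq_pop_first() plays here:
the leftmost node is always the packet whose time_to_send is earliest.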
@@ -436,23 +466,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		now = psched_get_time();
 
 		if (q->rate) {
-			struct sk_buff_head *list = &sch->q;
+			struct sk_buff *last;
 
-			if (!skb_queue_empty(list)) {
+			if (!skb_queue_empty(&sch->q))
+				last = skb_peek_tail(&sch->q);
+			else
+				last = netem_rb_to_skb(rb_last(&q->t_root));
+			if (last) {
 				/*
 				 * Last packet in queue is reference point (now),
 				 * calculate this time bonus and subtract
 				 * from delay.
 				 */
-				delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
+				delay -= netem_skb_cb(last)->time_to_send - now;
 				delay = max_t(psched_tdiff_t, 0, delay);
-				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+				now = netem_skb_cb(last)->time_to_send;
 			}
 
 			delay += packet_len_2_sched_time(skb->len, q);
 		}
 
 		cb->time_to_send = now + delay;
+		cb->tstamp_save = skb->tstamp;
 		++q->counter;
 		tfifo_enqueue(skb, sch);
 	} else {
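A worked example of the rate-handling arithmetic above (illustrative
numbers): suppose the configured delay came out as 50 ms and the last queued
packet's time_to_send lies 30 ms in the future. The subtraction credits
those 30 ms, leaving delay = 20 ms, and now is rebased to the last packet's
departure time, so the new packet is scheduled 20 ms after its predecessor
leaves, plus its own serialization time from packet_len_2_sched_time(). The
max_t() clamp keeps a predecessor that is further out than the configured
delay from producing a negative value.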
@@ -476,6 +511,21 @@ static unsigned int netem_drop(struct Qdisc *sch)
 	unsigned int len;
 
 	len = qdisc_queue_drop(sch);
+
+	if (!len) {
+		struct rb_node *p = rb_first(&q->t_root);
+
+		if (p) {
+			struct sk_buff *skb = netem_rb_to_skb(p);
+
+			rb_erase(p, &q->t_root);
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			len = qdisc_pkt_len(skb);
+			kfree_skb(skb);
+		}
+	}
 	if (!len && q->qdisc && q->qdisc->ops->drop)
 		len = q->qdisc->ops->drop(q->qdisc);
 	if (len)
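Note the cleanup both here and in the dequeue path below: once a packet
leaves the rb tree, skb->next and skb->prev still hold rb_node pointers from
the field overloading described earlier, so they are reset to NULL before
the skb is freed or handed on (dequeue additionally restores skb->tstamp
from tstamp_save).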
@@ -488,19 +538,32 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
+	struct rb_node *p;
 
 	if (qdisc_is_throttled(sch))
 		return NULL;
 
 tfifo_dequeue:
-	skb = qdisc_peek_head(sch);
+	skb = __skb_dequeue(&sch->q);
 	if (skb) {
-		const struct netem_skb_cb *cb = netem_skb_cb(skb);
+deliver:
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		qdisc_unthrottled(sch);
+		qdisc_bstats_update(sch, skb);
+		return skb;
+	}
+	p = rb_first(&q->t_root);
+	if (p) {
+		skb = netem_rb_to_skb(p);
 
 		/* if more time remaining? */
-		if (cb->time_to_send <= psched_get_time()) {
-			__skb_unlink(skb, &sch->q);
-			sch->qstats.backlog -= qdisc_pkt_len(skb);
+		if (netem_skb_cb(skb)->time_to_send <= psched_get_time()) {
+			rb_erase(p, &q->t_root);
+
+			sch->q.qlen--;
+			skb->next = NULL;
+			skb->prev = NULL;
+			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
 
 #ifdef CONFIG_NET_CLS_ACT
 			/*
@@ -522,10 +585,7 @@ tfifo_dequeue:
 			}
 			goto tfifo_dequeue;
 		}
-deliver:
-		qdisc_unthrottled(sch);
-		qdisc_bstats_update(sch, skb);
-		return skb;
+		goto deliver;
 	}
 
 	if (q->qdisc) {
@@ -533,7 +593,8 @@ deliver:
 			if (skb)
 				goto deliver;
 		}
-		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+		qdisc_watchdog_schedule(&q->watchdog,
+					netem_skb_cb(skb)->time_to_send);
 	}
 
 	if (q->qdisc) {