diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2012-01-06 01:31:44 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-01-12 23:05:28 -0500 |
commit | ddecf0f4db44ef94847a62d6ecf74456b4dcc66f (patch) | |
tree | 48c49b70f83d713c7e3f594e067ee726bb70777e /net/sched/sch_sfq.c | |
parent | 72092cc45378176ba700034c91b7af2db524df26 (diff) |
net_sched: sfq: add optional RED on top of SFQ
Adds an optional Random Early Detection on each SFQ flow queue.
Traditional SFQ limits count of packets, while RED permits to also
control number of bytes per flow, and adds ECN capability as well.
1) We dont handle the idle time management in this RED implementation,
since each 'new flow' begins with a null qavg. We really want to address
backlogged flows.
2) if headdrop is selected, we try to ecn mark first packet instead of
currently enqueued packet. This gives faster feedback for tcp flows
compared to traditional RED [ marking the last packet in queue ]
Example of use :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
limit 3000 headdrop flows 512 divisor 16384 \
redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
ewma 6 min 8000b max 60000b probability 0.2 ecn
prob_mark 0 prob_mark_head 4876 prob_drop 6131
forced_mark 0 forced_mark_head 0 forced_drop 0
Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
requeues 0)
rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
flows, we can see number of packets CE marked is smaller than number of
drops (for non ECN flows)
If same test is run, without RED, we can check backlog is much bigger.
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
Tested-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/sched/sch_sfq.c')
-rw-r--r-- | net/sched/sch_sfq.c | 146 |
1 files changed, 130 insertions, 16 deletions
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 0a7964009e8c..67494aef9acf 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <net/netlink.h> | 24 | #include <net/netlink.h> |
25 | #include <net/pkt_sched.h> | 25 | #include <net/pkt_sched.h> |
26 | #include <net/flow_keys.h> | 26 | #include <net/flow_keys.h> |
27 | #include <net/red.h> | ||
27 | 28 | ||
28 | 29 | ||
29 | /* Stochastic Fairness Queuing algorithm. | 30 | /* Stochastic Fairness Queuing algorithm. |
@@ -108,24 +109,30 @@ struct sfq_slot { | |||
108 | struct sfq_head dep; /* anchor in dep[] chains */ | 109 | struct sfq_head dep; /* anchor in dep[] chains */ |
109 | unsigned short hash; /* hash value (index in ht[]) */ | 110 | unsigned short hash; /* hash value (index in ht[]) */ |
110 | short allot; /* credit for this slot */ | 111 | short allot; /* credit for this slot */ |
112 | |||
113 | unsigned int backlog; | ||
114 | struct red_vars vars; | ||
111 | }; | 115 | }; |
112 | 116 | ||
113 | struct sfq_sched_data { | 117 | struct sfq_sched_data { |
114 | /* frequently used fields */ | 118 | /* frequently used fields */ |
115 | int limit; /* limit of total number of packets in this qdisc */ | 119 | int limit; /* limit of total number of packets in this qdisc */ |
116 | unsigned int divisor; /* number of slots in hash table */ | 120 | unsigned int divisor; /* number of slots in hash table */ |
117 | unsigned int maxflows; /* number of flows in flows array */ | 121 | u8 headdrop; |
118 | int headdrop; | 122 | u8 maxdepth; /* limit of packets per flow */ |
119 | int maxdepth; /* limit of packets per flow */ | ||
120 | 123 | ||
121 | u32 perturbation; | 124 | u32 perturbation; |
122 | struct tcf_proto *filter_list; | 125 | u8 cur_depth; /* depth of longest slot */ |
123 | sfq_index cur_depth; /* depth of longest slot */ | 126 | u8 flags; |
124 | unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ | 127 | unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ |
125 | struct sfq_slot *tail; /* current slot in round */ | 128 | struct tcf_proto *filter_list; |
126 | sfq_index *ht; /* Hash table ('divisor' slots) */ | 129 | sfq_index *ht; /* Hash table ('divisor' slots) */ |
127 | struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ | 130 | struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ |
128 | 131 | ||
132 | struct red_parms *red_parms; | ||
133 | struct tc_sfqred_stats stats; | ||
134 | struct sfq_slot *tail; /* current slot in round */ | ||
135 | |||
129 | struct sfq_head dep[SFQ_MAX_DEPTH + 1]; | 136 | struct sfq_head dep[SFQ_MAX_DEPTH + 1]; |
130 | /* Linked lists of slots, indexed by depth | 137 | /* Linked lists of slots, indexed by depth |
131 | * dep[0] : list of unused flows | 138 | * dep[0] : list of unused flows |
@@ -133,6 +140,7 @@ struct sfq_sched_data { | |||
133 | * dep[X] : list of flows with X packets | 140 | * dep[X] : list of flows with X packets |
134 | */ | 141 | */ |
135 | 142 | ||
143 | unsigned int maxflows; /* number of flows in flows array */ | ||
136 | int perturb_period; | 144 | int perturb_period; |
137 | unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ | 145 | unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ |
138 | struct timer_list perturb_timer; | 146 | struct timer_list perturb_timer; |
@@ -321,6 +329,7 @@ static unsigned int sfq_drop(struct Qdisc *sch) | |||
321 | drop: | 329 | drop: |
322 | skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); | 330 | skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); |
323 | len = qdisc_pkt_len(skb); | 331 | len = qdisc_pkt_len(skb); |
332 | slot->backlog -= len; | ||
324 | sfq_dec(q, x); | 333 | sfq_dec(q, x); |
325 | kfree_skb(skb); | 334 | kfree_skb(skb); |
326 | sch->q.qlen--; | 335 | sch->q.qlen--; |
@@ -341,6 +350,23 @@ drop: | |||
341 | return 0; | 350 | return 0; |
342 | } | 351 | } |
343 | 352 | ||
353 | /* Is ECN parameter configured */ | ||
354 | static int sfq_prob_mark(const struct sfq_sched_data *q) | ||
355 | { | ||
356 | return q->flags & TC_RED_ECN; | ||
357 | } | ||
358 | |||
359 | /* Should packets over max threshold just be marked */ | ||
360 | static int sfq_hard_mark(const struct sfq_sched_data *q) | ||
361 | { | ||
362 | return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; | ||
363 | } | ||
364 | |||
365 | static int sfq_headdrop(const struct sfq_sched_data *q) | ||
366 | { | ||
367 | return q->headdrop; | ||
368 | } | ||
369 | |||
344 | static int | 370 | static int |
345 | sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | 371 | sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) |
346 | { | 372 | { |
@@ -349,6 +375,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |||
349 | sfq_index x, qlen; | 375 | sfq_index x, qlen; |
350 | struct sfq_slot *slot; | 376 | struct sfq_slot *slot; |
351 | int uninitialized_var(ret); | 377 | int uninitialized_var(ret); |
378 | struct sk_buff *head; | ||
379 | int delta; | ||
352 | 380 | ||
353 | hash = sfq_classify(skb, sch, &ret); | 381 | hash = sfq_classify(skb, sch, &ret); |
354 | if (hash == 0) { | 382 | if (hash == 0) { |
@@ -368,24 +396,75 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |||
368 | q->ht[hash] = x; | 396 | q->ht[hash] = x; |
369 | slot = &q->slots[x]; | 397 | slot = &q->slots[x]; |
370 | slot->hash = hash; | 398 | slot->hash = hash; |
399 | slot->backlog = 0; /* should already be 0 anyway... */ | ||
400 | red_set_vars(&slot->vars); | ||
401 | goto enqueue; | ||
371 | } | 402 | } |
403 | if (q->red_parms) { | ||
404 | slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, | ||
405 | &slot->vars, | ||
406 | slot->backlog); | ||
407 | switch (red_action(q->red_parms, | ||
408 | &slot->vars, | ||
409 | slot->vars.qavg)) { | ||
410 | case RED_DONT_MARK: | ||
411 | break; | ||
372 | 412 | ||
373 | if (slot->qlen >= q->maxdepth) { | 413 | case RED_PROB_MARK: |
374 | struct sk_buff *head; | 414 | sch->qstats.overlimits++; |
415 | if (sfq_prob_mark(q)) { | ||
416 | /* We know we have at least one packet in queue */ | ||
417 | if (sfq_headdrop(q) && | ||
418 | INET_ECN_set_ce(slot->skblist_next)) { | ||
419 | q->stats.prob_mark_head++; | ||
420 | break; | ||
421 | } | ||
422 | if (INET_ECN_set_ce(skb)) { | ||
423 | q->stats.prob_mark++; | ||
424 | break; | ||
425 | } | ||
426 | } | ||
427 | q->stats.prob_drop++; | ||
428 | goto congestion_drop; | ||
429 | |||
430 | case RED_HARD_MARK: | ||
431 | sch->qstats.overlimits++; | ||
432 | if (sfq_hard_mark(q)) { | ||
433 | /* We know we have at least one packet in queue */ | ||
434 | if (sfq_headdrop(q) && | ||
435 | INET_ECN_set_ce(slot->skblist_next)) { | ||
436 | q->stats.forced_mark_head++; | ||
437 | break; | ||
438 | } | ||
439 | if (INET_ECN_set_ce(skb)) { | ||
440 | q->stats.forced_mark++; | ||
441 | break; | ||
442 | } | ||
443 | } | ||
444 | q->stats.forced_drop++; | ||
445 | goto congestion_drop; | ||
446 | } | ||
447 | } | ||
375 | 448 | ||
376 | if (!q->headdrop) | 449 | if (slot->qlen >= q->maxdepth) { |
450 | congestion_drop: | ||
451 | if (!sfq_headdrop(q)) | ||
377 | return qdisc_drop(skb, sch); | 452 | return qdisc_drop(skb, sch); |
378 | 453 | ||
454 | /* We know we have at least one packet in queue */ | ||
379 | head = slot_dequeue_head(slot); | 455 | head = slot_dequeue_head(slot); |
380 | sch->qstats.backlog -= qdisc_pkt_len(head); | 456 | delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); |
457 | sch->qstats.backlog -= delta; | ||
458 | slot->backlog -= delta; | ||
381 | qdisc_drop(head, sch); | 459 | qdisc_drop(head, sch); |
382 | 460 | ||
383 | sch->qstats.backlog += qdisc_pkt_len(skb); | ||
384 | slot_queue_add(slot, skb); | 461 | slot_queue_add(slot, skb); |
385 | return NET_XMIT_CN; | 462 | return NET_XMIT_CN; |
386 | } | 463 | } |
387 | 464 | ||
465 | enqueue: | ||
388 | sch->qstats.backlog += qdisc_pkt_len(skb); | 466 | sch->qstats.backlog += qdisc_pkt_len(skb); |
467 | slot->backlog += qdisc_pkt_len(skb); | ||
389 | slot_queue_add(slot, skb); | 468 | slot_queue_add(slot, skb); |
390 | sfq_inc(q, x); | 469 | sfq_inc(q, x); |
391 | if (slot->qlen == 1) { /* The flow is new */ | 470 | if (slot->qlen == 1) { /* The flow is new */ |
@@ -396,6 +475,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) | |||
396 | slot->next = q->tail->next; | 475 | slot->next = q->tail->next; |
397 | q->tail->next = x; | 476 | q->tail->next = x; |
398 | } | 477 | } |
478 | /* We could use a bigger initial quantum for new flows */ | ||
399 | slot->allot = q->scaled_quantum; | 479 | slot->allot = q->scaled_quantum; |
400 | } | 480 | } |
401 | if (++sch->q.qlen <= q->limit) | 481 | if (++sch->q.qlen <= q->limit) |
@@ -439,7 +519,7 @@ next_slot: | |||
439 | qdisc_bstats_update(sch, skb); | 519 | qdisc_bstats_update(sch, skb); |
440 | sch->q.qlen--; | 520 | sch->q.qlen--; |
441 | sch->qstats.backlog -= qdisc_pkt_len(skb); | 521 | sch->qstats.backlog -= qdisc_pkt_len(skb); |
442 | 522 | slot->backlog -= qdisc_pkt_len(skb); | |
443 | /* Is the slot empty? */ | 523 | /* Is the slot empty? */ |
444 | if (slot->qlen == 0) { | 524 | if (slot->qlen == 0) { |
445 | q->ht[slot->hash] = SFQ_EMPTY_SLOT; | 525 | q->ht[slot->hash] = SFQ_EMPTY_SLOT; |
@@ -490,6 +570,8 @@ static void sfq_rehash(struct Qdisc *sch) | |||
490 | sfq_dec(q, i); | 570 | sfq_dec(q, i); |
491 | __skb_queue_tail(&list, skb); | 571 | __skb_queue_tail(&list, skb); |
492 | } | 572 | } |
573 | slot->backlog = 0; | ||
574 | red_set_vars(&slot->vars); | ||
493 | q->ht[slot->hash] = SFQ_EMPTY_SLOT; | 575 | q->ht[slot->hash] = SFQ_EMPTY_SLOT; |
494 | } | 576 | } |
495 | q->tail = NULL; | 577 | q->tail = NULL; |
@@ -514,6 +596,11 @@ drop: sch->qstats.backlog -= qdisc_pkt_len(skb); | |||
514 | if (slot->qlen >= q->maxdepth) | 596 | if (slot->qlen >= q->maxdepth) |
515 | goto drop; | 597 | goto drop; |
516 | slot_queue_add(slot, skb); | 598 | slot_queue_add(slot, skb); |
599 | if (q->red_parms) | ||
600 | slot->vars.qavg = red_calc_qavg(q->red_parms, | ||
601 | &slot->vars, | ||
602 | slot->backlog); | ||
603 | slot->backlog += qdisc_pkt_len(skb); | ||
517 | sfq_inc(q, x); | 604 | sfq_inc(q, x); |
518 | if (slot->qlen == 1) { /* The flow is new */ | 605 | if (slot->qlen == 1) { /* The flow is new */ |
519 | if (q->tail == NULL) { /* It is the first flow */ | 606 | if (q->tail == NULL) { /* It is the first flow */ |
@@ -552,6 +639,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) | |||
552 | struct tc_sfq_qopt *ctl = nla_data(opt); | 639 | struct tc_sfq_qopt *ctl = nla_data(opt); |
553 | struct tc_sfq_qopt_v1 *ctl_v1 = NULL; | 640 | struct tc_sfq_qopt_v1 *ctl_v1 = NULL; |
554 | unsigned int qlen; | 641 | unsigned int qlen; |
642 | struct red_parms *p = NULL; | ||
555 | 643 | ||
556 | if (opt->nla_len < nla_attr_size(sizeof(*ctl))) | 644 | if (opt->nla_len < nla_attr_size(sizeof(*ctl))) |
557 | return -EINVAL; | 645 | return -EINVAL; |
@@ -560,7 +648,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) | |||
560 | if (ctl->divisor && | 648 | if (ctl->divisor && |
561 | (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) | 649 | (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) |
562 | return -EINVAL; | 650 | return -EINVAL; |
563 | 651 | if (ctl_v1 && ctl_v1->qth_min) { | |
652 | p = kmalloc(sizeof(*p), GFP_KERNEL); | ||
653 | if (!p) | ||
654 | return -ENOMEM; | ||
655 | } | ||
564 | sch_tree_lock(sch); | 656 | sch_tree_lock(sch); |
565 | if (ctl->quantum) { | 657 | if (ctl->quantum) { |
566 | q->quantum = ctl->quantum; | 658 | q->quantum = ctl->quantum; |
@@ -576,6 +668,16 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) | |||
576 | if (ctl_v1) { | 668 | if (ctl_v1) { |
577 | if (ctl_v1->depth) | 669 | if (ctl_v1->depth) |
578 | q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); | 670 | q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); |
671 | if (p) { | ||
672 | swap(q->red_parms, p); | ||
673 | red_set_parms(q->red_parms, | ||
674 | ctl_v1->qth_min, ctl_v1->qth_max, | ||
675 | ctl_v1->Wlog, | ||
676 | ctl_v1->Plog, ctl_v1->Scell_log, | ||
677 | NULL, | ||
678 | ctl_v1->max_P); | ||
679 | } | ||
680 | q->flags = ctl_v1->flags; | ||
579 | q->headdrop = ctl_v1->headdrop; | 681 | q->headdrop = ctl_v1->headdrop; |
580 | } | 682 | } |
581 | if (ctl->limit) { | 683 | if (ctl->limit) { |
@@ -594,6 +696,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) | |||
594 | q->perturbation = net_random(); | 696 | q->perturbation = net_random(); |
595 | } | 697 | } |
596 | sch_tree_unlock(sch); | 698 | sch_tree_unlock(sch); |
699 | kfree(p); | ||
597 | return 0; | 700 | return 0; |
598 | } | 701 | } |
599 | 702 | ||
@@ -625,6 +728,7 @@ static void sfq_destroy(struct Qdisc *sch) | |||
625 | del_timer_sync(&q->perturb_timer); | 728 | del_timer_sync(&q->perturb_timer); |
626 | sfq_free(q->ht); | 729 | sfq_free(q->ht); |
627 | sfq_free(q->slots); | 730 | sfq_free(q->slots); |
731 | kfree(q->red_parms); | ||
628 | } | 732 | } |
629 | 733 | ||
630 | static int sfq_init(struct Qdisc *sch, struct nlattr *opt) | 734 | static int sfq_init(struct Qdisc *sch, struct nlattr *opt) |
@@ -683,6 +787,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) | |||
683 | struct sfq_sched_data *q = qdisc_priv(sch); | 787 | struct sfq_sched_data *q = qdisc_priv(sch); |
684 | unsigned char *b = skb_tail_pointer(skb); | 788 | unsigned char *b = skb_tail_pointer(skb); |
685 | struct tc_sfq_qopt_v1 opt; | 789 | struct tc_sfq_qopt_v1 opt; |
790 | struct red_parms *p = q->red_parms; | ||
686 | 791 | ||
687 | memset(&opt, 0, sizeof(opt)); | 792 | memset(&opt, 0, sizeof(opt)); |
688 | opt.v0.quantum = q->quantum; | 793 | opt.v0.quantum = q->quantum; |
@@ -693,6 +798,17 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) | |||
693 | opt.depth = q->maxdepth; | 798 | opt.depth = q->maxdepth; |
694 | opt.headdrop = q->headdrop; | 799 | opt.headdrop = q->headdrop; |
695 | 800 | ||
801 | if (p) { | ||
802 | opt.qth_min = p->qth_min >> p->Wlog; | ||
803 | opt.qth_max = p->qth_max >> p->Wlog; | ||
804 | opt.Wlog = p->Wlog; | ||
805 | opt.Plog = p->Plog; | ||
806 | opt.Scell_log = p->Scell_log; | ||
807 | opt.max_P = p->max_P; | ||
808 | } | ||
809 | memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); | ||
810 | opt.flags = q->flags; | ||
811 | |||
696 | NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); | 812 | NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); |
697 | 813 | ||
698 | return skb->len; | 814 | return skb->len; |
@@ -747,15 +863,13 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, | |||
747 | sfq_index idx = q->ht[cl - 1]; | 863 | sfq_index idx = q->ht[cl - 1]; |
748 | struct gnet_stats_queue qs = { 0 }; | 864 | struct gnet_stats_queue qs = { 0 }; |
749 | struct tc_sfq_xstats xstats = { 0 }; | 865 | struct tc_sfq_xstats xstats = { 0 }; |
750 | struct sk_buff *skb; | ||
751 | 866 | ||
752 | if (idx != SFQ_EMPTY_SLOT) { | 867 | if (idx != SFQ_EMPTY_SLOT) { |
753 | const struct sfq_slot *slot = &q->slots[idx]; | 868 | const struct sfq_slot *slot = &q->slots[idx]; |
754 | 869 | ||
755 | xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; | 870 | xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; |
756 | qs.qlen = slot->qlen; | 871 | qs.qlen = slot->qlen; |
757 | slot_queue_walk(slot, skb) | 872 | qs.backlog = slot->backlog; |
758 | qs.backlog += qdisc_pkt_len(skb); | ||
759 | } | 873 | } |
760 | if (gnet_stats_copy_queue(d, &qs) < 0) | 874 | if (gnet_stats_copy_queue(d, &qs) < 0) |
761 | return -1; | 875 | return -1; |